Diffstat (limited to 'libvpx/vp9')
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c  91
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c  47
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm  36
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm  8
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm  44
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm  570
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm  8
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm  8
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm  8
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm  16
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm  14
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm  14
-rw-r--r--  libvpx/vp9/common/generic/vp9_systemdependent.c  2
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h  117
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c  281
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c  833
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c  784
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c  713
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c  266
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c  695
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c  1038
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c  1284
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c  923
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c  396
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c  1315
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c  1073
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c  1013
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c  438
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c  745
-rw-r--r--  libvpx/vp9/common/vp9_alloccommon.c  42
-rw-r--r--  libvpx/vp9/common/vp9_blockd.h  147
-rw-r--r--  libvpx/vp9/common/vp9_common.h  12
-rw-r--r--  libvpx/vp9/common/vp9_common_data.c  11
-rw-r--r--  libvpx/vp9/common/vp9_common_data.h  3
-rw-r--r--  libvpx/vp9/common/vp9_convolve.c  12
-rw-r--r--  libvpx/vp9/common/vp9_convolve.h  13
-rw-r--r--  libvpx/vp9/common/vp9_debugmodes.c  4
-rw-r--r--  libvpx/vp9/common/vp9_default_coef_probs.h  3
-rw-r--r--  libvpx/vp9/common/vp9_entropy.c  325
-rw-r--r--  libvpx/vp9/common/vp9_entropy.h  251
-rw-r--r--  libvpx/vp9/common/vp9_entropymode.c  478
-rw-r--r--  libvpx/vp9/common/vp9_entropymode.h  22
-rw-r--r--  libvpx/vp9/common/vp9_entropymv.c  98
-rw-r--r--  libvpx/vp9/common/vp9_entropymv.h  23
-rw-r--r--  libvpx/vp9/common/vp9_enums.h  13
-rw-r--r--  libvpx/vp9/common/vp9_filter.c  31
-rw-r--r--  libvpx/vp9/common/vp9_filter.h  30
-rw-r--r--  libvpx/vp9/common/vp9_findnearmv.c  19
-rw-r--r--  libvpx/vp9/common/vp9_findnearmv.h  62
-rw-r--r--  libvpx/vp9/common/vp9_idct.c  273
-rw-r--r--  libvpx/vp9/common/vp9_idct.h  25
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter.c  52
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter.h  15
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter_filters.c  2
-rw-r--r--  libvpx/vp9/common/vp9_mvref_common.c  28
-rw-r--r--  libvpx/vp9/common/vp9_mvref_common.h  4
-rw-r--r--  libvpx/vp9/common/vp9_onyx.h  63
-rw-r--r--  libvpx/vp9/common/vp9_onyxc_int.h  166
-rw-r--r--  libvpx/vp9/common/vp9_postproc.c  40
-rw-r--r--  libvpx/vp9/common/vp9_pred_common.c  111
-rw-r--r--  libvpx/vp9/common/vp9_pred_common.h  55
-rw-r--r--  libvpx/vp9/common/vp9_quant_common.c  118
-rw-r--r--  libvpx/vp9/common/vp9_reconinter.c  163
-rw-r--r--  libvpx/vp9/common/vp9_reconinter.h  8
-rw-r--r--  libvpx/vp9/common/vp9_reconintra.c  4
-rw-r--r--  libvpx/vp9/common/vp9_reconintra.h  8
-rw-r--r--  libvpx/vp9/common/vp9_rtcd.c  4
-rw-r--r--  libvpx/vp9/common/vp9_rtcd_defs.sh  179
-rw-r--r--  libvpx/vp9/common/vp9_scale.c  115
-rw-r--r--  libvpx/vp9/common/vp9_scale.h  30
-rw-r--r--  libvpx/vp9/common/vp9_scan.c  357
-rw-r--r--  libvpx/vp9/common/vp9_scan.h  199
-rw-r--r--  libvpx/vp9/common/vp9_seg_common.c  2
-rw-r--r--  libvpx/vp9/common/vp9_seg_common.h  2
-rw-r--r--  libvpx/vp9/common/vp9_subpelvar.h  145
-rw-r--r--  libvpx/vp9/common/vp9_systemdependent.h  5
-rw-r--r--  libvpx/vp9/common/vp9_tile_common.c  21
-rw-r--r--  libvpx/vp9/common/vp9_tile_common.h  12
-rw-r--r--  libvpx/vp9/common/vp9_treecoder.c  34
-rw-r--r--  libvpx/vp9/common/vp9_treecoder.h  45
-rw-r--r--  libvpx/vp9/common/x86/vp9_asm_stubs.c  319
-rw-r--r--  libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c  1322
-rw-r--r--  libvpx/vp9/common/x86/vp9_intrapred_sse2.asm  32
-rw-r--r--  libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm  813
-rw-r--r--  libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c  943
-rw-r--r--  libvpx/vp9/common/x86/vp9_postproc_x86.h  2
-rw-r--r--  libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm  987
-rw-r--r--  libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm  274
-rw-r--r--  libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm  230
-rw-r--r--  libvpx/vp9/decoder/vp9_dboolhuff.h  2
-rw-r--r--  libvpx/vp9/decoder/vp9_decodemv.c  513
-rw-r--r--  libvpx/vp9/decoder/vp9_decodemv.h  6
-rw-r--r--  libvpx/vp9/decoder/vp9_decodframe.c  1061
-rw-r--r--  libvpx/vp9/decoder/vp9_detokenize.c  115
-rw-r--r--  libvpx/vp9/decoder/vp9_detokenize.h  5
-rw-r--r--  libvpx/vp9/decoder/vp9_dsubexp.c  15
-rw-r--r--  libvpx/vp9/decoder/vp9_idct_blk.c  152
-rw-r--r--  libvpx/vp9/decoder/vp9_idct_blk.h  30
-rw-r--r--  libvpx/vp9/decoder/vp9_onyxd.h  8
-rw-r--r--  libvpx/vp9/decoder/vp9_onyxd_if.c  104
-rw-r--r--  libvpx/vp9/decoder/vp9_onyxd_int.h  16
-rw-r--r--  libvpx/vp9/decoder/vp9_read_bit_buffer.h  6
-rw-r--r--  libvpx/vp9/decoder/vp9_thread.c  17
-rw-r--r--  libvpx/vp9/decoder/vp9_thread.h  13
-rw-r--r--  libvpx/vp9/decoder/vp9_treereader.h  3
-rw-r--r--  libvpx/vp9/decoder/x86/vp9_dequantize_sse2.c  220
-rw-r--r--  libvpx/vp9/encoder/vp9_bitstream.c  365
-rw-r--r--  libvpx/vp9/encoder/vp9_block.h  94
-rw-r--r--  libvpx/vp9/encoder/vp9_boolhuff.c  39
-rw-r--r--  libvpx/vp9/encoder/vp9_dct.c  216
-rw-r--r--  libvpx/vp9/encoder/vp9_dct.h  24
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeframe.c  989
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeframe.h  2
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeintra.c  2
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemb.c  230
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemb.h  27
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemv.c  100
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemv.h  4
-rw-r--r--  libvpx/vp9/encoder/vp9_firstpass.c  372
-rw-r--r--  libvpx/vp9/encoder/vp9_firstpass.h  1
-rw-r--r--  libvpx/vp9/encoder/vp9_lookahead.c  4
-rw-r--r--  libvpx/vp9/encoder/vp9_mbgraph.c  81
-rw-r--r--  libvpx/vp9/encoder/vp9_mcomp.c  766
-rw-r--r--  libvpx/vp9/encoder/vp9_mcomp.h  33
-rw-r--r--  libvpx/vp9/encoder/vp9_modecosts.c  4
-rw-r--r--  libvpx/vp9/encoder/vp9_onyx_if.c  1424
-rw-r--r--  libvpx/vp9/encoder/vp9_onyx_int.h  141
-rw-r--r--  libvpx/vp9/encoder/vp9_picklpf.c  43
-rw-r--r--  libvpx/vp9/encoder/vp9_psnr.c  2
-rw-r--r--  libvpx/vp9/encoder/vp9_quantize.c  73
-rw-r--r--  libvpx/vp9/encoder/vp9_quantize.h  26
-rw-r--r--  libvpx/vp9/encoder/vp9_ratectrl.c  93
-rw-r--r--  libvpx/vp9/encoder/vp9_ratectrl.h  4
-rw-r--r--  libvpx/vp9/encoder/vp9_rdopt.c  1887
-rw-r--r--  libvpx/vp9/encoder/vp9_rdopt.h  33
-rw-r--r--  libvpx/vp9/encoder/vp9_segmentation.c  43
-rw-r--r--  libvpx/vp9/encoder/vp9_ssim.c  4
-rw-r--r--  libvpx/vp9/encoder/vp9_subexp.c  3
-rw-r--r--  libvpx/vp9/encoder/vp9_subexp.h  2
-rw-r--r--  libvpx/vp9/encoder/vp9_temporal_filter.c  56
-rw-r--r--  libvpx/vp9/encoder/vp9_tokenize.c  182
-rw-r--r--  libvpx/vp9/encoder/vp9_tokenize.h  10
-rw-r--r--  libvpx/vp9/encoder/vp9_vaq.c  147
-rw-r--r--  libvpx/vp9/encoder/vp9_vaq.h  26
-rw-r--r--  libvpx/vp9/encoder/vp9_variance.h  15
-rw-r--r--  libvpx/vp9/encoder/vp9_variance_c.c  147
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c  290
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_dct_sse2.c  69
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_variance_mmx.c  45
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_variance_sse2.c  29
-rw-r--r--  libvpx/vp9/vp9_common.mk  25
-rw-r--r--  libvpx/vp9/vp9_cx_iface.c  241
-rw-r--r--  libvpx/vp9/vp9_dx_iface.c  63
-rw-r--r--  libvpx/vp9/vp9cx.mk  3
-rw-r--r--  libvpx/vp9/vp9dx.mk  5
155 files changed, 23894 insertions, 8462 deletions
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c b/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
index 3e3e400..0b9fc09 100644
--- a/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -11,45 +11,47 @@
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
-extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,
- int16_t *output,
- int output_stride);
-extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
- int16_t *output,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride);
-extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input,
- int16_t *output,
- int output_stride);
-extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src,
- int16_t *output,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride);
-extern void save_neon_registers();
-extern void restore_neon_registers();
-
-
-void vp9_short_idct16x16_add_neon(int16_t *input,
- uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
+ int16_t *output,
+ int output_stride);
+void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
+ int16_t *output,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride);
+void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
+ int16_t *output,
+ int output_stride);
+void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
+ int16_t *output,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride);
+
+/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
+extern void vp9_push_neon(int64_t *store);
+extern void vp9_pop_neon(int64_t *store);
+
+void vp9_idct16x16_256_add_neon(const int16_t *input,
+ uint8_t *dest, int dest_stride) {
+ int64_t store_reg[8];
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
// save d8-d15 register values.
- save_neon_registers();
+ vp9_push_neon(store_reg);
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vp9_short_idct16x16_add_neon_pass2(input+1,
+ vp9_idct16x16_256_add_neon_pass2(input+1,
row_idct_output,
pass1_output,
0,
@@ -59,12 +61,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
/* Parallel idct on the lower 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vp9_short_idct16x16_add_neon_pass2(input+8*16+1,
+ vp9_idct16x16_256_add_neon_pass2(input+8*16+1,
row_idct_output+8,
pass1_output,
0,
@@ -74,12 +76,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
row_idct_output,
pass1_output,
1,
@@ -89,12 +91,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
row_idct_output+8,
pass1_output,
1,
@@ -102,28 +104,29 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
dest_stride);
// restore d8-d15 register values.
- restore_neon_registers();
+ vp9_pop_neon(store_reg);
return;
}
-void vp9_short_idct10_16x16_add_neon(int16_t *input,
- uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_10_add_neon(const int16_t *input,
+ uint8_t *dest, int dest_stride) {
+ int64_t store_reg[8];
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
// save d8-d15 register values.
- save_neon_registers();
+ vp9_push_neon(store_reg);
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8);
+ vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vp9_short_idct10_16x16_add_neon_pass2(input+1,
+ vp9_idct16x16_10_add_neon_pass2(input+1,
row_idct_output,
pass1_output,
0,
@@ -135,12 +138,12 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input,
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
row_idct_output,
pass1_output,
1,
@@ -150,12 +153,12 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input,
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
row_idct_output+8,
pass1_output,
1,
@@ -163,7 +166,7 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input,
dest_stride);
// restore d8-d15 register values.
- restore_neon_registers();
+ vp9_pop_neon(store_reg);
return;
}
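
A note on the register save/restore change above: the old save_neon_registers()/restore_neon_registers() helpers did a bare vpush/vpop and returned, leaving the stack pointer 64 bytes lower between the two calls. The replacement pair, vp9_push_neon()/vp9_pop_neon(), spills d8-d15 into a caller-provided buffer instead, so the stack stays balanced across every call boundary. A minimal usage sketch in C, assuming only the declarations shown in this diff:

    /* d8-d15 are callee-saved under the ARM AAPCS, so a NEON kernel that
     * clobbers them must preserve their values. store_reg is ordinary
     * caller memory: eight 64-bit slots, one per d-register. */
    int64_t store_reg[8];
    vp9_push_neon(store_reg);   /* spill d8-d15 (vp9_save_reg_neon.asm) */
    /* ... NEON code that freely uses d8-d15 ... */
    vp9_pop_neon(store_reg);    /* reload d8-d15 */
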
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c b/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c
deleted file mode 100644
index ceecd6f..0000000
--- a/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9/common/vp9_common.h"
-
-// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
-extern void idct32_transpose_and_transform(int16_t *transpose_buffer,
- int16_t *output, int16_t *input);
-extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);
-
-
-// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
-extern void save_neon_registers();
-extern void restore_neon_registers();
-
-void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest,
- int dest_stride) {
- // TODO(cd): move the creation of these buffers within the ASM file
- // internal buffer used to transpose 8 lines into before transforming them
- int16_t transpose_buffer[32 * 8];
- // results of the first pass (transpose and transform rows)
- int16_t pass1[32 * 32];
- // results of the second pass (transpose and transform columns)
- int16_t pass2[32 * 32];
-
- // save register we need to preserve
- save_neon_registers();
- // process rows
- idct32_transpose_and_transform(transpose_buffer, pass1, input);
- // process columns
- // TODO(cd): do these two steps/passes within the ASM file
- idct32_transpose_and_transform(transpose_buffer, pass2, pass1);
- // combine and add to dest
- // TODO(cd): integrate this within the last storage step of the second pass
- idct32_combine_add(dest, pass2, dest_stride);
- // restore register we need to preserve
- restore_neon_registers();
-}
-
-// TODO(cd): Eliminate this file altogether when everything is in ASM file
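
With this C wrapper gone, the three scratch buffers it allocated move into the stack frame that vp9_idct32x32_1024_add_neon now builds for itself (see the "stack operation" comments in the 32x32 asm diff below). A sketch of that frame as a C struct, using the sp offsets stated in those comments:

    /* Equivalent view of the frame created by "sub sp, sp, #512+2048+2048": */
    struct idct32_stack_frame {
      int16_t pass1[32 * 32];            /* sp +    0 .. 2047 */
      int16_t pass2[32 * 32];            /* sp + 2048 .. 4095 */
      int16_t transpose_buffer[32 * 8];  /* sp + 4096 .. 4607 */
    };
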
diff --git a/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm
new file mode 100644
index 0000000..71c3e70
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm
@@ -0,0 +1,36 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_push_neon|
+ EXPORT |vp9_pop_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp9_push_neon| PROC
+ vst1.i64 {d8, d9, d10, d11}, [r0]!
+ vst1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+ ENDP
+
+|vp9_pop_neon| PROC
+ vld1.i64 {d8, d9, d10, d11}, [r0]!
+ vld1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+ ENDP
+
+ END
+
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
index cf5c8f7..b1fd21b 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
@@ -8,21 +8,21 @@
;
- EXPORT |vp9_short_idct16x16_1_add_neon|
+ EXPORT |vp9_idct16x16_1_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct16x16_1_add_neon| PROC
+|vp9_idct16x16_1_add_neon| PROC
ldrsh r0, [r0]
; generate cospi_16_64 = 11585
@@ -193,6 +193,6 @@
vst1.64 {d31}, [r12], r2
bx lr
- ENDP ; |vp9_short_idct16x16_1_add_neon|
+ ENDP ; |vp9_idct16x16_1_add_neon|
END
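
For context on the *_1_add kernels: they implement the DC-only inverse transform. The single coefficient loaded by "ldrsh r0, [r0]" is passed twice through dct_const_round_shift(x * cospi_16_64), rounded down to pixel scale, and added to every destination pixel. A C restatement of what this kernel computes, mirroring the generic C version (ROUND_POWER_OF_TWO, clip_pixel, and the cospi constants come from the common headers):

    static void idct16x16_1_add_sketch(const int16_t *input, uint8_t *dest,
                                       int dest_stride) {
      int i, j, a1;
      int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
      out = dct_const_round_shift(out * cospi_16_64);
      a1 = ROUND_POWER_OF_TWO(out, 6);  /* same final rounding as the 2-D path */
      for (j = 0; j < 16; ++j) {
        for (i = 0; i < 16; ++i)
          dest[i] = clip_pixel(dest[i] + a1);  /* add the DC term, clip */
        dest += dest_stride;
      }
    }
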
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
index 7464e80..a13c0d0 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
@@ -8,12 +8,10 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_idct16x16_add_neon_pass1|
- EXPORT |vp9_short_idct16x16_add_neon_pass2|
- EXPORT |vp9_short_idct10_16x16_add_neon_pass1|
- EXPORT |vp9_short_idct10_16x16_add_neon_pass2|
- EXPORT |save_neon_registers|
- EXPORT |restore_neon_registers|
+ EXPORT |vp9_idct16x16_256_add_neon_pass1|
+ EXPORT |vp9_idct16x16_256_add_neon_pass2|
+ EXPORT |vp9_idct16x16_10_add_neon_pass1|
+ EXPORT |vp9_idct16x16_10_add_neon_pass2|
ARM
REQUIRE8
PRESERVE8
@@ -38,7 +36,7 @@
MEND
AREA Block, CODE, READONLY ; name this block of code
-;void |vp9_short_idct16x16_add_neon_pass1|(int16_t *input,
+;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input,
; int16_t *output, int output_stride)
;
; r0 int16_t input
@@ -48,7 +46,7 @@
; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
; registers and use them as buffer during calculation.
-|vp9_short_idct16x16_add_neon_pass1| PROC
+|vp9_idct16x16_256_add_neon_pass1| PROC
; TODO(hkuang): Find a better way to load the elements.
; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
@@ -275,9 +273,9 @@
vst1.64 {d31}, [r1], r2
bx lr
- ENDP ; |vp9_short_idct16x16_add_neon_pass1|
+ ENDP ; |vp9_idct16x16_256_add_neon_pass1|
-;void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
+;void vp9_idct16x16_256_add_neon_pass2(int16_t *src,
; int16_t *output,
; int16_t *pass1Output,
; int16_t skip_adding,
@@ -294,7 +292,7 @@
; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
; registers and use them as buffer during calculation.
-|vp9_short_idct16x16_add_neon_pass2| PROC
+|vp9_idct16x16_256_add_neon_pass2| PROC
push {r3-r9}
; TODO(hkuang): Find a better way to load the elements.
@@ -786,9 +784,9 @@ skip_adding_dest
end_idct16x16_pass2
pop {r3-r9}
bx lr
- ENDP ; |vp9_short_idct16x16_add_neon_pass2|
+ ENDP ; |vp9_idct16x16_256_add_neon_pass2|
-;void |vp9_short_idct10_16x16_add_neon_pass1|(int16_t *input,
+;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input,
; int16_t *output, int output_stride)
;
; r0 int16_t input
@@ -798,7 +796,7 @@ end_idct16x16_pass2
; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
; registers and use them as buffer during calculation.
-|vp9_short_idct10_16x16_add_neon_pass1| PROC
+|vp9_idct16x16_10_add_neon_pass1| PROC
; TODO(hkuang): Find a better way to load the elements.
; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
@@ -907,9 +905,9 @@ end_idct16x16_pass2
vst1.64 {d31}, [r1], r2
bx lr
- ENDP ; |vp9_short_idct10_16x16_add_neon_pass1|
+ ENDP ; |vp9_idct16x16_10_add_neon_pass1|
-;void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src,
+;void vp9_idct16x16_10_add_neon_pass2(int16_t *src,
; int16_t *output,
; int16_t *pass1Output,
; int16_t skip_adding,
@@ -926,7 +924,7 @@ end_idct16x16_pass2
; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
; registers and use them as buffer during calculation.
-|vp9_short_idct10_16x16_add_neon_pass2| PROC
+|vp9_idct16x16_10_add_neon_pass2| PROC
push {r3-r9}
; TODO(hkuang): Find a better way to load the elements.
@@ -1177,15 +1175,5 @@ end_idct16x16_pass2
end_idct10_16x16_pass2
pop {r3-r9}
bx lr
- ENDP ; |vp9_short_idct10_16x16_add_neon_pass2|
-;void |save_neon_registers|()
-|save_neon_registers| PROC
- vpush {d8-d15}
- bx lr
- ENDP ; |save_registers|
-;void |restore_neon_registers|()
-|restore_neon_registers| PROC
- vpop {d8-d15}
- bx lr
- ENDP ; |restore_registers|
+ ENDP ; |vp9_idct16x16_10_add_neon_pass2|
END
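
Two details of the renamed pass2 interface are worth spelling out. First, the new suffixes count coefficients: _256 handles a full 16x16 block, while _10 is the shortcut for blocks where at most the first ten coefficients are nonzero. Second, skip_adding selects the output mode: the C wrapper at the top of this diff passes 0 on row passes (results stored to the intermediate buffer) and 1 on column passes (results rounded, added to dest, and clipped). The two call shapes, as the wrapper uses them:

    /* Row pass: write to row_idct_output; dest/dest_stride are passed
     * through but presumably unused in this mode. */
    vp9_idct16x16_256_add_neon_pass2(input + 1, row_idct_output,
                                     pass1_output, 0, dest, dest_stride);
    /* Column pass: combine-add the final result into dest. */
    vp9_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
                                     pass1_output, 1, dest, dest_stride);
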
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
index 5c097cc..f00d027 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
@@ -43,8 +43,7 @@ cospi_30_64 EQU 1606
cospi_31_64 EQU 804
- EXPORT |idct32_transpose_and_transform|
- EXPORT |idct32_combine_add|
+ EXPORT |vp9_idct32x32_1024_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -100,6 +99,142 @@ cospi_31_64 EQU 804
vst1.16 {$reg2}, [r1]
MEND
; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q6-q9 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_CENTER_RESULTS
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d8}, [r10], r2
+ vld1.s16 {d11}, [r9], r11
+ vld1.s16 {d9}, [r10]
+ vld1.s16 {d10}, [r9]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q8, q8, #6
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q6, q6, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q7, q7, d9
+ vaddw.u8 q8, q8, d10
+ vaddw.u8 q9, q9, d11
+ vaddw.u8 q6, q6, d8
+ ; clip pixel
+ vqmovun.s16 d9, q7
+ vqmovun.s16 d10, q8
+ vqmovun.s16 d11, q9
+ vqmovun.s16 d8, q6
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d9}, [r10], r11
+ vst1.16 {d10}, [r9], r2
+ vst1.16 {d8}, [r10]
+ vst1.16 {d11}, [r9]
+ ; update pointers (by dest_stride * 2)
+ sub r9, r9, r2, lsl #1
+ add r10, r10, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q6-q9 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_CENTER_RESULTS_LAST
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d8}, [r10], r2
+ vld1.s16 {d11}, [r9], r11
+ vld1.s16 {d9}, [r10]
+ vld1.s16 {d10}, [r9]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q8, q8, #6
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q6, q6, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q7, q7, d9
+ vaddw.u8 q8, q8, d10
+ vaddw.u8 q9, q9, d11
+ vaddw.u8 q6, q6, d8
+ ; clip pixel
+ vqmovun.s16 d9, q7
+ vqmovun.s16 d10, q8
+ vqmovun.s16 d11, q9
+ vqmovun.s16 d8, q6
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d9}, [r10], r11
+ vst1.16 {d10}, [r9], r2
+ vst1.16 {d8}, [r10]!
+ vst1.16 {d11}, [r9]!
+ ; update pointers (by dest_stride * 2)
+ sub r9, r9, r2, lsl #1
+ add r10, r10, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q4-q7 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_EXTREME_RESULTS
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d4}, [r7], r2
+ vld1.s16 {d7}, [r6], r11
+ vld1.s16 {d5}, [r7]
+ vld1.s16 {d6}, [r6]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q5, q5, #6
+ vrshr.s16 q6, q6, #6
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q4, q4, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q5, q5, d5
+ vaddw.u8 q6, q6, d6
+ vaddw.u8 q7, q7, d7
+ vaddw.u8 q4, q4, d4
+ ; clip pixel
+ vqmovun.s16 d5, q5
+ vqmovun.s16 d6, q6
+ vqmovun.s16 d7, q7
+ vqmovun.s16 d4, q4
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d5}, [r7], r11
+ vst1.16 {d6}, [r6], r2
+ vst1.16 {d7}, [r6]
+ vst1.16 {d4}, [r7]
+ ; update pointers (by dest_stride * 2)
+ sub r6, r6, r2, lsl #1
+ add r7, r7, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Combine-add results with current destination content
+ ; q4-q7 contain the results (out[j * 32 + 0-31])
+ MACRO
+ STORE_COMBINE_EXTREME_RESULTS_LAST
+ ; load dest[j * dest_stride + 0-31]
+ vld1.s16 {d4}, [r7], r2
+ vld1.s16 {d7}, [r6], r11
+ vld1.s16 {d5}, [r7]
+ vld1.s16 {d6}, [r6]
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q5, q5, #6
+ vrshr.s16 q6, q6, #6
+ vrshr.s16 q7, q7, #6
+ vrshr.s16 q4, q4, #6
+ ; add to dest[j * dest_stride + 0-31]
+ vaddw.u8 q5, q5, d5
+ vaddw.u8 q6, q6, d6
+ vaddw.u8 q7, q7, d7
+ vaddw.u8 q4, q4, d4
+ ; clip pixel
+ vqmovun.s16 d5, q5
+ vqmovun.s16 d6, q6
+ vqmovun.s16 d7, q7
+ vqmovun.s16 d4, q4
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {d5}, [r7], r11
+ vst1.16 {d6}, [r6], r2
+ vst1.16 {d7}, [r6]!
+ vst1.16 {d4}, [r7]!
+ ; update pointers (by dest_stride * 2)
+ sub r6, r6, r2, lsl #1
+ add r7, r7, r2, lsl #1
+ MEND
+ ; --------------------------------------------------------------------------
; Touches q8-q12, q15 (q13-q14 are preserved)
; valid output registers are anything but q8-q11
MACRO
@@ -110,12 +245,12 @@ cospi_31_64 EQU 804
; additions/subtractions before the multiplies.
; generate the constants
; generate scalar constants
- mov r3, #$first_constant & 0xFF00
- add r3, #$first_constant & 0x00FF
+ mov r8, #$first_constant & 0xFF00
mov r12, #$second_constant & 0xFF00
+ add r8, #$first_constant & 0x00FF
add r12, #$second_constant & 0x00FF
; generate vector constants
- vdup.16 d30, r3
+ vdup.16 d30, r8
vdup.16 d31, r12
; (used) two for inputs (regA-regD), one for constants (q15)
; do some multiplications (ordered for maximum latency hiding)
@@ -153,15 +288,22 @@ cospi_31_64 EQU 804
MEND
; --------------------------------------------------------------------------
-;void idct32_transpose_and_transform(int16_t *transpose_buffer, int16_t *output, int16_t *input);
+;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
;
-; r0 int16_t *transpose_buffer
-; r1 int16_t *output
-; r2 int16_t *input)
-; TODO(cd): have more logical parameter ordering but this issue will disappear
-; when functions are combined.
+; r0 int16_t *input,
+; r1 uint8_t *dest,
+; r2 int dest_stride)
+; loop counters
+; r4 bands loop counter
+; r5 pass loop counter
+; r8 transpose loop counter
+; combine-add pointers
+; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...)
+; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...)
+; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...)
+; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...)
-|idct32_transpose_and_transform| PROC
+|vp9_idct32x32_1024_add_neon| PROC
; This function does one pass of idct32x32 transform.
;
; This is done by transposing the input and then doing a 1d transform on
@@ -171,43 +313,73 @@ cospi_31_64 EQU 804
; The 1d transform is done by looping over bands of eight columns (the
; idct32_bands loop). For each band, the transform input transposition
; is done on demand, one band of four 8x8 matrices at a time. The four
- ; matrices are trsnposed by pairs (the idct32_transpose_pair loop).
- push {r4}
- mov r4, #0 ; initialize bands loop counter
+ ; matrices are transposed by pairs (the idct32_transpose_pair loop).
+ push {r4-r11}
+ vpush {d8-d15}
+ ; stack operation
+ ; internal buffer used to transpose 8 lines into before transforming them
+ ; int16_t transpose_buffer[32 * 8];
+ ; at sp + [4096, 4607]
+ ; results of the first pass (transpose and transform rows)
+ ; int16_t pass1[32 * 32];
+ ; at sp + [0, 2047]
+ ; results of the second pass (transpose and transform columns)
+ ; int16_t pass2[32 * 32];
+ ; at sp + [2048, 4095]
+ sub sp, sp, #512+2048+2048
+
+ ; r6 = dest + 31 * dest_stride
+ ; r7 = dest + 0 * dest_stride
+ ; r9 = dest + 15 * dest_stride
+ ; r10 = dest + 16 * dest_stride
+ rsb r6, r2, r2, lsl #5
+ rsb r9, r2, r2, lsl #4
+ add r10, r1, r2, lsl #4
+ mov r7, r1
+ add r6, r6, r1
+ add r9, r9, r1
+ ; r11 = -dest_stride
+ neg r11, r2
+ ; r3 = input
+ mov r3, r0
+ ; parameters for first pass
+ ; r0 = transpose_buffer[32 * 8]
+ add r0, sp, #4096
+ ; r1 = pass1[32 * 32]
+ mov r1, sp
+
+ mov r5, #0 ; initialize pass loop counter
+idct32_pass_loop
+ mov r4, #4 ; initialize bands loop counter
idct32_bands_loop
- ; TODO(cd) get rid of these push/pop by properly adjusting register
- ; content at end of loop
- push {r0}
- push {r1}
- push {r2}
- mov r3, #0 ; initialize transpose loop counter
+ mov r8, #2 ; initialize transpose loop counter
idct32_transpose_pair_loop
; Load two horizontally consecutive 8x8 16bit data matrices. The first one
; into q0-q7 and the second one into q8-q15. There is a stride of 64,
; adjusted to 32 because of the two post-increments.
- vld1.s16 {q8}, [r2]!
- vld1.s16 {q0}, [r2]!
- add r2, #32
- vld1.s16 {q9}, [r2]!
- vld1.s16 {q1}, [r2]!
- add r2, #32
- vld1.s16 {q10}, [r2]!
- vld1.s16 {q2}, [r2]!
- add r2, #32
- vld1.s16 {q11}, [r2]!
- vld1.s16 {q3}, [r2]!
- add r2, #32
- vld1.s16 {q12}, [r2]!
- vld1.s16 {q4}, [r2]!
- add r2, #32
- vld1.s16 {q13}, [r2]!
- vld1.s16 {q5}, [r2]!
- add r2, #32
- vld1.s16 {q14}, [r2]!
- vld1.s16 {q6}, [r2]!
- add r2, #32
- vld1.s16 {q15}, [r2]!
- vld1.s16 {q7}, [r2]!
+ vld1.s16 {q8}, [r3]!
+ vld1.s16 {q0}, [r3]!
+ add r3, #32
+ vld1.s16 {q9}, [r3]!
+ vld1.s16 {q1}, [r3]!
+ add r3, #32
+ vld1.s16 {q10}, [r3]!
+ vld1.s16 {q2}, [r3]!
+ add r3, #32
+ vld1.s16 {q11}, [r3]!
+ vld1.s16 {q3}, [r3]!
+ add r3, #32
+ vld1.s16 {q12}, [r3]!
+ vld1.s16 {q4}, [r3]!
+ add r3, #32
+ vld1.s16 {q13}, [r3]!
+ vld1.s16 {q5}, [r3]!
+ add r3, #32
+ vld1.s16 {q14}, [r3]!
+ vld1.s16 {q6}, [r3]!
+ add r3, #32
+ vld1.s16 {q15}, [r3]!
+ vld1.s16 {q7}, [r3]!
; Transpose the two 8x8 16bit data matrices.
vswp d17, d24
@@ -255,11 +427,13 @@ idct32_transpose_pair_loop
vst1.16 {q7}, [r0]!
; increment pointers by adjusted stride (not necessary for r0/out)
- sub r2, r2, #8*32*2-32-16*2
+ ; go back by 7*32 for the seven lines moved fully by read and add
+ ; go back by 32 for the eighth line, which was only read
+ ; advance by 16*2 to go to the next pair
+ sub r3, r3, #7*32*2 + 32 - 16*2
; transpose pair loop processing
- add r3, r3, #1
- cmp r3, #1
- BLE idct32_transpose_pair_loop
+ subs r8, r8, #1
+ bne idct32_transpose_pair_loop
; restore r0/input to its original value
sub r0, r0, #32*8*2
@@ -815,21 +989,26 @@ idct32_transpose_pair_loop
vadd.s16 q9, q5, q0
vsub.s16 q6, q5, q0
vsub.s16 q7, q4, q1
- STORE_IN_OUTPUT 17, 17, 16, q7, q6
- STORE_IN_OUTPUT 16, 15, 14, q9, q8
+
+ cmp r5, #0
+ bgt idct32_bands_end_2nd_pass
+
+idct32_bands_end_1st_pass
+ STORE_IN_OUTPUT 17, 16, 17, q6, q7
+ STORE_IN_OUTPUT 17, 14, 15, q8, q9
; --------------------------------------------------------------------------
; part of final stage
;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
;output[30 * 32] = step1b[1][i] - step1b[30][i];
;output[31 * 32] = step1b[0][i] - step1b[31][i];
- LOAD_FROM_OUTPUT 14, 30, 31, q0, q1
+ LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
vadd.s16 q4, q2, q1
vadd.s16 q5, q3, q0
vsub.s16 q6, q3, q0
vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 31, 31, 30, q7, q6
- STORE_IN_OUTPUT 30, 0, 1, q4, q5
+ STORE_IN_OUTPUT 31, 30, 31, q6, q7
+ STORE_IN_OUTPUT 31, 0, 1, q4, q5
; --------------------------------------------------------------------------
; part of stage 7
;step1[2] = step1b[2][i] + step1b[13][i];
@@ -848,25 +1027,25 @@ idct32_transpose_pair_loop
;output[18 * 32] = step1b[13][i] - step1b[18][i];
;output[19 * 32] = step1b[12][i] - step1b[19][i];
LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
- vadd.s16 q6, q4, q1
- vadd.s16 q7, q5, q0
- vsub.s16 q8, q5, q0
- vsub.s16 q9, q4, q1
- STORE_IN_OUTPUT 19, 19, 18, q9, q8
- STORE_IN_OUTPUT 18, 13, 12, q7, q6
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_IN_OUTPUT 19, 18, 19, q6, q7
+ STORE_IN_OUTPUT 19, 12, 13, q8, q9
; --------------------------------------------------------------------------
; part of final stage
;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
;output[28 * 32] = step1b[3][i] - step1b[28][i];
;output[29 * 32] = step1b[2][i] - step1b[29][i];
- LOAD_FROM_OUTPUT 12, 28, 29, q0, q1
+ LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
vadd.s16 q4, q2, q1
vadd.s16 q5, q3, q0
vsub.s16 q6, q3, q0
vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 29, 29, 28, q7, q6
- STORE_IN_OUTPUT 28, 2, 3, q4, q5
+ STORE_IN_OUTPUT 29, 28, 29, q6, q7
+ STORE_IN_OUTPUT 29, 2, 3, q4, q5
; --------------------------------------------------------------------------
; part of stage 7
;step1[4] = step1b[4][i] + step1b[11][i];
@@ -885,25 +1064,25 @@ idct32_transpose_pair_loop
;output[20 * 32] = step1b[11][i] - step1b[20][i];
;output[21 * 32] = step1b[10][i] - step1b[21][i];
LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
- vadd.s16 q6, q4, q1
- vadd.s16 q7, q5, q0
- vsub.s16 q8, q5, q0
- vsub.s16 q9, q4, q1
- STORE_IN_OUTPUT 21, 21, 20, q9, q8
- STORE_IN_OUTPUT 20, 11, 10, q7, q6
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_IN_OUTPUT 21, 20, 21, q6, q7
+ STORE_IN_OUTPUT 21, 10, 11, q8, q9
; --------------------------------------------------------------------------
; part of final stage
;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
;output[26 * 32] = step1b[5][i] - step1b[26][i];
;output[27 * 32] = step1b[4][i] - step1b[27][i];
- LOAD_FROM_OUTPUT 10, 26, 27, q0, q1
+ LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
vadd.s16 q4, q2, q1
vadd.s16 q5, q3, q0
vsub.s16 q6, q3, q0
vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 27, 27, 26, q7, q6
- STORE_IN_OUTPUT 26, 4, 5, q4, q5
+ STORE_IN_OUTPUT 27, 26, 27, q6, q7
+ STORE_IN_OUTPUT 27, 4, 5, q4, q5
; --------------------------------------------------------------------------
; part of stage 7
;step1[6] = step1b[6][i] + step1b[9][i];
@@ -922,92 +1101,199 @@ idct32_transpose_pair_loop
;output[22 * 32] = step1b[9][i] - step1b[22][i];
;output[23 * 32] = step1b[8][i] - step1b[23][i];
LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
- vadd.s16 q6, q4, q1
- vadd.s16 q7, q5, q0
- vsub.s16 q8, q5, q0
- vsub.s16 q9, q4, q1
- STORE_IN_OUTPUT 23, 23, 22, q9, q8
- STORE_IN_OUTPUT 22, 9, 8, q7, q6
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_IN_OUTPUT 23, 22, 23, q6, q7
+ STORE_IN_OUTPUT 23, 8, 9, q8, q9
; --------------------------------------------------------------------------
; part of final stage
;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
;output[24 * 32] = step1b[7][i] - step1b[24][i];
;output[25 * 32] = step1b[6][i] - step1b[25][i];
- LOAD_FROM_OUTPUT 8, 24, 25, q0, q1
+ LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
vadd.s16 q4, q2, q1
vadd.s16 q5, q3, q0
vsub.s16 q6, q3, q0
vsub.s16 q7, q2, q1
- STORE_IN_OUTPUT 25, 25, 24, q7, q6
- STORE_IN_OUTPUT 24, 6, 7, q4, q5
- ; --------------------------------------------------------------------------
+ STORE_IN_OUTPUT 25, 24, 25, q6, q7
+ STORE_IN_OUTPUT 25, 6, 7, q4, q5
- ; TODO(cd) get rid of these push/pop by properly adjusting register
- ; content at end of loop
- pop {r2}
- pop {r1}
- pop {r0}
- add r1, r1, #8*2
- add r2, r2, #8*32*2
+ ; restore r0 by removing the last offset from the last
+ ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
+ sub r0, r0, #24*8*2
+ ; restore r1 by removing the last offset from the last
+ ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2
+ ; advance by 8 columns => 8*2
+ sub r1, r1, #7*32*2 - 8*2
+ ; advance by 8 lines (8*32*2)
+ ; go back by the two pairs from the loop (32*2)
+ add r3, r3, #8*32*2 - 32*2
; bands loop processing
- add r4, r4, #1
- cmp r4, #3
- BLE idct32_bands_loop
+ subs r4, r4, #1
+ bne idct32_bands_loop
- pop {r4}
- bx lr
- ENDP ; |idct32_transpose_and_transform|
+ ; parameters for second pass
+ ; the input of pass2 is the result of pass1. we have to remove the offset
+ ; of 32 columns induced by the above idct32_bands_loop
+ sub r3, r1, #32*2
+ ; r1 = pass2[32 * 32]
+ add r1, sp, #2048
-;void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);
-;
-; r0 uint8_t *dest
-; r1 int16_t *out
-; r2 int dest_stride)
-
-|idct32_combine_add| PROC
-
- mov r12, r0 ; dest pointer used for stores
- sub r2, r2, #32 ; adjust the stride (remove the post-increments)
- mov r3, #0 ; initialize loop counter
-
-idct32_combine_add_loop
- ; load out[j * 32 + 0-31]
- vld1.s16 {q12}, [r1]!
- vld1.s16 {q13}, [r1]!
- vld1.s16 {q14}, [r1]!
- vld1.s16 {q15}, [r1]!
- ; load dest[j * dest_stride + 0-31]
- vld1.s16 {q6}, [r0]!
- vld1.s16 {q7}, [r0]!
- ; ROUND_POWER_OF_TWO
- vrshr.s16 q12, q12, #6
- vrshr.s16 q13, q13, #6
- vrshr.s16 q14, q14, #6
- vrshr.s16 q15, q15, #6
- ; add to dest[j * dest_stride + 0-31]
- vaddw.u8 q12, q12, d12
- vaddw.u8 q13, q13, d13
- vaddw.u8 q14, q14, d14
- vaddw.u8 q15, q15, d15
- ; clip pixel
- vqmovun.s16 d12, q12
- vqmovun.s16 d13, q13
- vqmovun.s16 d14, q14
- vqmovun.s16 d15, q15
- ; store back into dest[j * dest_stride + 0-31]
- vst1.16 {q6}, [r12]!
- vst1.16 {q7}, [r12]!
- ; increment pointers by adjusted stride (not necessary for r1/out)
- add r0, r0, r2
- add r12, r12, r2
- ; loop processing
- add r3, r3, #1
- cmp r3, #31
- BLE idct32_combine_add_loop
+ ; pass loop processing
+ add r5, r5, #1
+ B idct32_pass_loop
- bx lr
- ENDP ; |idct32_transpose|
+idct32_bands_end_2nd_pass
+ STORE_COMBINE_CENTER_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
+ ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
+ ;output[30 * 32] = step1b[1][i] - step1b[30][i];
+ ;output[31 * 32] = step1b[0][i] - step1b[31][i];
+ LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[2] = step1b[2][i] + step1b[13][i];
+ ;step1[3] = step1b[3][i] + step1b[12][i];
+ ;step1[12] = step1b[3][i] - step1b[12][i];
+ ;step1[13] = step1b[2][i] - step1b[13][i];
+ LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
+ vadd.s16 q2, q10, q1
+ vadd.s16 q3, q11, q0
+ vsub.s16 q4, q11, q0
+ vsub.s16 q5, q10, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[12 * 32] = step1b[12][i] + step1b[19][i];
+ ;output[13 * 32] = step1b[13][i] + step1b[18][i];
+ ;output[18 * 32] = step1b[13][i] - step1b[18][i];
+ ;output[19 * 32] = step1b[12][i] - step1b[19][i];
+ LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_COMBINE_CENTER_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
+ ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
+ ;output[28 * 32] = step1b[3][i] - step1b[28][i];
+ ;output[29 * 32] = step1b[2][i] - step1b[29][i];
+ LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[4] = step1b[4][i] + step1b[11][i];
+ ;step1[5] = step1b[5][i] + step1b[10][i];
+ ;step1[10] = step1b[5][i] - step1b[10][i];
+ ;step1[11] = step1b[4][i] - step1b[11][i];
+ LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
+ vadd.s16 q2, q12, q1
+ vadd.s16 q3, q13, q0
+ vsub.s16 q4, q13, q0
+ vsub.s16 q5, q12, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[10 * 32] = step1b[10][i] + step1b[21][i];
+ ;output[11 * 32] = step1b[11][i] + step1b[20][i];
+ ;output[20 * 32] = step1b[11][i] - step1b[20][i];
+ ;output[21 * 32] = step1b[10][i] - step1b[21][i];
+ LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_COMBINE_CENTER_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
+ ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
+ ;output[26 * 32] = step1b[5][i] - step1b[26][i];
+ ;output[27 * 32] = step1b[4][i] - step1b[27][i];
+ LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[6] = step1b[6][i] + step1b[9][i];
+ ;step1[7] = step1b[7][i] + step1b[8][i];
+ ;step1[8] = step1b[7][i] - step1b[8][i];
+ ;step1[9] = step1b[6][i] - step1b[9][i];
+ LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
+ vadd.s16 q2, q14, q1
+ vadd.s16 q3, q15, q0
+ vsub.s16 q4, q15, q0
+ vsub.s16 q5, q14, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
+ ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
+ ;output[22 * 32] = step1b[9][i] - step1b[22][i];
+ ;output[23 * 32] = step1b[8][i] - step1b[23][i];
+ LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_COMBINE_CENTER_RESULTS_LAST
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
+ ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
+ ;output[24 * 32] = step1b[7][i] - step1b[24][i];
+ ;output[25 * 32] = step1b[6][i] - step1b[25][i];
+ LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_COMBINE_EXTREME_RESULTS_LAST
+ ; --------------------------------------------------------------------------
+ ; restore pointers to their initial indices for next band pass by
+ ; removing/adding dest_stride * 8. The actual increment by eight
+ ; is taken care of within the _LAST macros.
+ add r6, r6, r2, lsl #3
+ add r9, r9, r2, lsl #3
+ sub r7, r7, r2, lsl #3
+ sub r10, r10, r2, lsl #3
+
+ ; restore r0 by removing the last offset from the last
+ ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
+ sub r0, r0, #24*8*2
+ ; restore r1 by removing the last offset from the last
+ ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
+ ; advance by 8 columns => 8*2
+ sub r1, r1, #25*32*2 - 8*2
+ ; advance by 8 lines (8*32*2)
+ ; go back by the two pairs from the loop (32*2)
+ add r3, r3, #8*32*2 - 32*2
+ ; bands loop processing
+ subs r4, r4, #1
+ bne idct32_bands_loop
+
+ ; stack operation
+ add sp, sp, #512+2048+2048
+ vpop {d8-d15}
+ pop {r4-r11}
+ bx lr
+ ENDP ; |vp9_idct32x32_1024_add_neon|
END
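
All four STORE_COMBINE_* macros above perform the same reconstruction step and differ only in which registers and dest pointers they touch and whether they post-increment for the next band. In scalar terms, the vrshr/vaddw/vqmovun sequence amounts to the following sketch (ROUND_POWER_OF_TWO and clip_pixel as in vp9_common.h):

    /* One row of combine-add: round the transform output (it carries a
     * scale factor of 64, hence the shift by 6), add the prediction,
     * then saturate to [0, 255] as vqmovun.s16 does. */
    static void combine_add_row_sketch(uint8_t *dest, const int16_t *out,
                                       int width) {
      int i;
      for (i = 0; i < width; ++i)
        dest[i] = clip_pixel(ROUND_POWER_OF_TWO(out[i], 6) + dest[i]);
    }
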
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
index 869ee5f..0d4a721 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
@@ -8,21 +8,21 @@
;
- EXPORT |vp9_short_idct4x4_1_add_neon|
+ EXPORT |vp9_idct4x4_1_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct4x4_1_add_neon| PROC
+|vp9_idct4x4_1_add_neon| PROC
ldrsh r0, [r0]
; generate cospi_16_64 = 11585
@@ -63,6 +63,6 @@
vst1.32 {d7[1]}, [r12]
bx lr
- ENDP ; |vp9_short_idct4x4_1_add_neon|
+ ENDP ; |vp9_idct4x4_1_add_neon|
END
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
index 640fb93..00283fc 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
@@ -8,7 +8,7 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_idct4x4_add_neon|
+ EXPORT |vp9_idct4x4_16_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -16,13 +16,13 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
AREA Block, CODE, READONLY ; name this block of code
-;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct4x4_add_neon| PROC
+|vp9_idct4x4_16_add_neon| PROC
; The 2D transform is done with two passes which are actually pretty
; similar. We first transform the rows. This is done by transposing
@@ -185,6 +185,6 @@
vst1.32 {d26[1]}, [r1], r2
vst1.32 {d26[0]}, [r1] ; no post-increment
bx lr
- ENDP ; |vp9_short_idct4x4_add_neon|
+ ENDP ; |vp9_idct4x4_16_add_neon|
END
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
index 923804f..421d202 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
@@ -8,21 +8,21 @@
;
- EXPORT |vp9_short_idct8x8_1_add_neon|
+ EXPORT |vp9_idct8x8_1_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct8x8_1_add_neon| PROC
+|vp9_idct8x8_1_add_neon| PROC
ldrsh r0, [r0]
; generate cospi_16_64 = 11585
@@ -83,6 +83,6 @@
vst1.64 {d31}, [r12], r2
bx lr
- ENDP ; |vp9_short_idct8x8_1_add_neon|
+ ENDP ; |vp9_idct8x8_1_add_neon|
END
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
index a744f59..5476400 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
@@ -8,8 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_idct8x8_add_neon|
- EXPORT |vp9_short_idct10_8x8_add_neon|
+ EXPORT |vp9_idct8x8_64_add_neon|
+ EXPORT |vp9_idct8x8_10_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -198,13 +198,13 @@
MEND
AREA Block, CODE, READONLY ; name this block of code
-;void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct8x8_add_neon| PROC
+|vp9_idct8x8_64_add_neon| PROC
push {r4-r9}
vpush {d8-d15}
vld1.s16 {q8,q9}, [r0]!
@@ -308,15 +308,15 @@
vpop {d8-d15}
pop {r4-r9}
bx lr
- ENDP ; |vp9_short_idct8x8_add_neon|
+ ENDP ; |vp9_idct8x8_64_add_neon|
-;void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct10_8x8_add_neon| PROC
+|vp9_idct8x8_10_add_neon| PROC
push {r4-r9}
vpush {d8-d15}
vld1.s16 {q8,q9}, [r0]!
@@ -514,6 +514,6 @@
vpop {d8-d15}
pop {r4-r9}
bx lr
- ENDP ; |vp9_short_idct10_8x8_add_neon|
+ ENDP ; |vp9_idct8x8_10_add_neon|
END
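
The 8x8 renames follow the same convention: _64 is the full block, _10 the shortcut for nearly empty blocks. The decoder's choice between them (presumably in vp9_idct_blk.c, which appears in the diffstat) looks roughly like the sketch below; the exact threshold is inferred from the _10 suffix, not quoted from source:

    /* Hypothetical eob-based dispatch; the threshold is an assumption. */
    if (eob <= 10)
      vp9_idct8x8_10_add(qcoeff, dst, stride);  /* only low-frequency coeffs */
    else
      vp9_idct8x8_64_add(qcoeff, dst, stride);  /* full 8x8 block */
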
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
index 963ef35..2f326e2 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
@@ -8,7 +8,7 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_iht4x4_add_neon|
+ EXPORT |vp9_iht4x4_16_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -139,7 +139,7 @@
MEND
AREA Block, CODE, READONLY ; name this block of code
-;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride, int tx_type)
;
; r0 int16_t input
@@ -147,7 +147,7 @@
; r2 int dest_stride
; r3 int tx_type)
; This function will only handle tx_type of 1,2,3.
-|vp9_short_iht4x4_add_neon| PROC
+|vp9_iht4x4_16_add_neon| PROC
; load the inputs into d16-d19
vld1.s16 {q8,q9}, [r0]!
@@ -175,7 +175,7 @@ iadst_idct
; then transform columns
IADST4x4_1D
- b end_vp9_short_iht4x4_add_neon
+ b end_vp9_iht4x4_16_add_neon
idct_iadst
; generate constants
@@ -191,7 +191,7 @@ idct_iadst
; then transform columns
IDCT4x4_1D
- b end_vp9_short_iht4x4_add_neon
+ b end_vp9_iht4x4_16_add_neon
iadst_iadst
; generate constants
@@ -206,7 +206,7 @@ iadst_iadst
; then transform columns
IADST4x4_1D
-end_vp9_short_iht4x4_add_neon
+end_vp9_iht4x4_16_add_neon
; ROUND_POWER_OF_TWO(temp_out[j], 4)
vrshr.s16 q8, q8, #4
vrshr.s16 q9, q9, #4
@@ -232,6 +232,6 @@ end_vp9_short_iht4x4_add_neon
vst1.32 {d26[1]}, [r1], r2
vst1.32 {d26[0]}, [r1] ; no post-increment
bx lr
- ENDP ; |vp9_short_iht4x4_add_neon|
+ ENDP ; |vp9_iht4x4_16_add_neon|
END
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
index bab9cb4..93d3af3 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
@@ -8,7 +8,7 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_iht8x8_add_neon|
+ EXPORT |vp9_iht8x8_64_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -559,7 +559,7 @@
AREA Block, CODE, READONLY ; name this block of code
-;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride, int tx_type)
;
; r0 int16_t input
@@ -567,7 +567,7 @@
; r2 int dest_stride
; r3 int tx_type)
; This function will only handle tx_type of 1,2,3.
-|vp9_short_iht8x8_add_neon| PROC
+|vp9_iht8x8_64_add_neon| PROC
; load the inputs into d16-d19
vld1.s16 {q8,q9}, [r0]!
@@ -602,7 +602,7 @@ iadst_idct
; then transform columns
IADST8X8_1D
- b end_vp9_short_iht8x8_add_neon
+ b end_vp9_iht8x8_64_add_neon
idct_iadst
; generate IADST constants
@@ -620,7 +620,7 @@ idct_iadst
; then transform columns
IDCT8x8_1D
- b end_vp9_short_iht8x8_add_neon
+ b end_vp9_iht8x8_64_add_neon
iadst_iadst
; generate IADST constants
@@ -635,7 +635,7 @@ iadst_iadst
; then transform columns
IADST8X8_1D
-end_vp9_short_iht8x8_add_neon
+end_vp9_iht8x8_64_add_neon
pop {r0-r10}
; ROUND_POWER_OF_TWO(temp_out[j], 5)
@@ -691,6 +691,6 @@ end_vp9_short_iht8x8_add_neon
vst1.64 {d6}, [r0], r2
vst1.64 {d7}, [r0], r2
bx lr
- ENDP ; |vp9_short_iht8x8_add_neon|
+ ENDP ; |vp9_iht8x8_64_add_neon|
END
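
Both IHT kernels branch on tx_type right after loading the input, which is what the iadst_idct, idct_iadst, and iadst_iadst labels above correspond to. A sketch of the mapping (the enum names are the usual VP9 ones and are an assumption here; tx_type 0, DCT_DCT, never reaches these functions and is served by the plain IDCT kernels):

    switch (tx_type) {
      case 1: /* ADST_DCT   -> iadst_idct:  IDCT one way, IADST the other */
      case 2: /* DCT_ADST   -> idct_iadst:  the opposite pairing          */
      case 3: /* ADST_ADST  -> iadst_iadst: IADST in both directions      */
        break;
      default: /* DCT_DCT is handled by vp9_idct8x8_64_add_neon et al.    */
        break;
    }
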
diff --git a/libvpx/vp9/common/generic/vp9_systemdependent.c b/libvpx/vp9/common/generic/vp9_systemdependent.c
index f144721..536febb 100644
--- a/libvpx/vp9/common/generic/vp9_systemdependent.c
+++ b/libvpx/vp9/common/generic/vp9_systemdependent.c
@@ -10,7 +10,7 @@
#include "./vpx_config.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
#include "vp9/common/vp9_onyxc_int.h"
void vp9_machine_specific_config(VP9_COMMON *cm) {
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h b/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h
new file mode 100644
index 0000000..644264f
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_COMMON_DSPR2_H_
+#define VP9_COMMON_VP9_COMMON_DSPR2_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 512
+extern uint8_t *vp9_ff_cropTbl;
+
+#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) ({ \
+ \
+ int32_t tmp, out; \
+ int dct_cost_rounding = DCT_CONST_ROUNDING; \
+ int in = input; \
+ \
+ __asm__ __volatile__ ( \
+ /* out = dct_const_round_shift(input_dc * cospi_16_64); */ \
+ "mtlo %[dct_cost_rounding], $ac1 \n\t"\
+ "mthi $zero, $ac1 \n\t"\
+ "madd $ac1, %[in], %[cospi_16_64] \n\t"\
+ "extp %[tmp], $ac1, 31 \n\t"\
+ \
+ /* out = dct_const_round_shift(out * cospi_16_64); */ \
+ "mtlo %[dct_cost_rounding], $ac2 \n\t"\
+ "mthi $zero, $ac2 \n\t"\
+ "madd $ac2, %[tmp], %[cospi_16_64] \n\t"\
+ "extp %[out], $ac2, 31 \n\t"\
+ \
+ : [tmp] "=&r" (tmp), [out] "=r" (out) \
+ : [in] "r" (in), \
+ [dct_cost_rounding] "r" (dct_cost_rounding), \
+ [cospi_16_64] "r" (cospi_16_64) \
+ ); \
+ out; })
+
+static INLINE void vp9_prefetch_load(const unsigned char *src) {
+ __asm__ __volatile__ (
+ "pref 0, 0(%[src]) \n\t"
+ :
+ : [src] "r" (src)
+ );
+}
+
+/* prefetch data for store */
+static INLINE void vp9_prefetch_store(unsigned char *dst) {
+ __asm__ __volatile__ (
+ "pref 1, 0(%[dst]) \n\t"
+ :
+ : [dst] "r" (dst)
+ );
+}
+
+static INLINE void vp9_prefetch_load_streamed(const unsigned char *src) {
+ __asm__ __volatile__ (
+ "pref 4, 0(%[src]) \n\t"
+ :
+ : [src] "r" (src)
+ );
+}
+
+/* prefetch data for store */
+static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) {
+ __asm__ __volatile__ (
+ "pref 5, 0(%[dst]) \n\t"
+ :
+ : [dst] "r" (dst)
+ );
+}
+
+void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride);
+
+void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter,
+ int w, int h);
+
+void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+#endif // #if HAVE_DSPR2
+#endif // VP9_COMMON_VP9_COMMON_DSPR2_H_
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
new file mode 100644
index 0000000..91d62bc
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
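+ /* Pack the two non-zero taps of the bilinear filter (filter_y[3] and
+    filter_y[4]) into one word so each dpa.w.ph applies both taps at once. */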
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride), [cm] "r" (cm),
+ [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+ vp9_prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride), [cm] "r" (cm),
+ [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
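+ /* Only unscaled vertical filtering (y_step_q4 == 16) takes the DSPR2 fast
+    path; any other step size falls back to the C implementation. */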
+ if (16 == y_step_q4) {
+ uint32_t pos = 38;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_bi_avg_vert_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, w, h);
+ break;
+ case 64:
+ vp9_prefetch_store(dst + 32);
+ convolve_bi_avg_vert_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, h);
+ break;
+ default:
+ vp9_convolve8_avg_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_avg_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+}
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
new file mode 100644
index 0000000..148b20f
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
@@ -0,0 +1,833 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3;
+ uint32_t tn1, tn2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
+ "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
+ "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
+
+ /* clamp */
+ "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
+ "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */
+ "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
+
+ "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
+ "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
+
+ "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */
+ "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t p1, p2, p3, p4;
+ uint32_t st0, st1;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+ "lbu %[Temp2], 0(%[dst]) \n\t"
+ "lbu %[tp4], 2(%[dst]) \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+ "addqh_r.w %[tp4], %[tp4], %[st1] \n\t"
+ "sb %[Temp2], 0(%[dst]) \n\t"
+ "sb %[tp4], 2(%[dst]) \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "lbu %[Temp2], 4(%[dst]) \n\t"
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[Temp2], 4(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp1], 6(%[dst]) \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ "lbu %[tp2], 1(%[dst]) \n\t"
+ "lbu %[tp3], 3(%[dst]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp4], 5(%[dst]) \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp2], 1(%[dst]) \n\t"
+ "sb %[tp1], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 7(%[dst]) \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "addqh_r.w %[tp3], %[tp3], %[p4] \n\t"
+
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[tp4], %[tp4], %[p2] \n\t"
+
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[p1] \n\t"
+
+ /* store bytes */
+ "sb %[tp3], 3(%[dst]) \n\t"
+ "sb %[tp4], 5(%[dst]) \n\t"
+ "sb %[tp1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [st0] "=&r" (st0), [st1] "=&r" (st1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [n1] "=&r" (n1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+ vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
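+ /* As in the vertical case, only the unscaled path (x_step_q4 == 16) is
+    accelerated; other step sizes fall back to vp9_convolve8_avg_horiz_c. */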
+ if (16 == x_step_q4) {
+ uint32_t pos = 38;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_bi_avg_horiz_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ case 8:
+ convolve_bi_avg_horiz_8_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ case 16:
+ convolve_bi_avg_horiz_16_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h, 1);
+ break;
+ case 32:
+ convolve_bi_avg_horiz_16_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h, 2);
+ break;
+ case 64:
+ vp9_prefetch_load(src + 64);
+ vp9_prefetch_store(dst + 32);
+
+ convolve_bi_avg_horiz_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ default:
+ vp9_convolve8_avg_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_avg_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+}
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
new file mode 100644
index 0000000..92644f2
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
@@ -0,0 +1,784 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
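+/* The _transposed variants below filter horizontally but write each output
+   pixel down a column: dst advances by dst_stride per output pixel and by
+   one byte per input row. */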
+static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint8_t *dst_ptr;
+ int32_t Temp1, Temp2;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ dst_ptr = dst;
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp2](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [dst_ptr] "+r" (dst_ptr)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint8_t *dst_ptr;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4;
+ uint8_t *odd_dst;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+
+ dst_ptr = dst;
+ odd_dst = (dst_ptr + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "extp %[p3], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[Temp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[Temp1], %[p3](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[Temp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[tp3], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp3], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p2], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p1], 0(%[odd_dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a), [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload1], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p5], %[qload1] \n\t"
+ "ulw %[qload2], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += 1;
+ }
+}
+
+static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload1], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p5], %[qload1] \n\t"
+ "ulw %[qload2], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += 1;
+ }
+}
+
+void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter, int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int sum = 0;
+
+ sum += src[x] * filter[3];
+ sum += src[x + 1] * filter[4];
+
+ dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
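
The scalar fallback above spells out the bilinear semantics used throughout this file: only taps 3 and 4 of the 8-tap filter array contribute, and the output is written transposed (dst advances by one byte per source row and by dst_stride per output pixel) so the following vertical pass can run over contiguous rows. A self-contained sketch of the per-pixel arithmetic, with clip_pixel(), ROUND_POWER_OF_TWO and FILTER_BITS re-declared here purely for illustration (in libvpx they come from the common headers):

#include <stdint.h>

#define FILTER_BITS 7
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static uint8_t clip_pixel_sketch(int val) {
  return (val > 255) ? 255 : (val < 0) ? 0 : (uint8_t)val;
}

/* One bilinear output pixel: a 2-tap filter over src[x] and src[x + 1]. */
static uint8_t bilinear_pixel(const uint8_t *src, int x,
                              const int16_t *filter) {
  const int sum = src[x] * filter[3] + src[x + 1] * filter[4];
  return clip_pixel_sketch(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
}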
+
+void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter,
+ int w, int h) {
+ uint32_t pos = 38;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+
+ switch (w) {
+ case 4:
+ convolve_bi_horiz_4_transposed_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter, h);
+ break;
+ case 8:
+ convolve_bi_horiz_8_transposed_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter, h);
+ break;
+ case 16:
+ case 32:
+ convolve_bi_horiz_16_transposed_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter, h,
+ (w/16));
+ break;
+ case 64:
+ vp9_prefetch_load(src + 32);
+ convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter, h);
+ break;
+ default:
+ convolve_bi_horiz_transposed(src, src_stride,
+ dst, dst_stride,
+ filter, w, h);
+ break;
+ }
+}
+#endif
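
A note on the wrdsp/extp pairing used by every kernel in these files: wrdsp sets the DSPControl extract position to 38, and each later extp ..., 31 then reads 32 bits of the accumulator ending at bit 7, which amounts to an arithmetic right shift by FILTER_BITS. Combined with the rounding constant 64 seeded into the accumulator by mtlo/mthi and the clamp performed by indexing vp9_ff_cropTbl through lbux, each accumulator round-trip computes clip_pixel(ROUND_POWER_OF_TWO(sum, 7)). A scalar model of that sequence, based on my reading of the MIPS DSP ASE semantics:

#include <stdint.h>

/* Model of: mtlo 64; dpa.w.ph acc, ...; extp t, acc, 31; lbux st, t(cm) */
static uint8_t acc_round_clamp(int64_t sum_of_products) {
  const int64_t acc = sum_of_products + 64;  /* mtlo seeds the rounding bias */
  const int t = (int)(acc >> 7);             /* extp: bits 38..7 with pos=38 */
  return (t < 0) ? 0 : (t > 255) ? 255 : (uint8_t)t;  /* cropTbl clamp */
}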
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
new file mode 100644
index 0000000..1debdb4
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
@@ -0,0 +1,713 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp2](%[cm]) \n\t"
+ "lbux %[p2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst]) \n\t"
+ "sb %[p1], 1(%[dst]) \n\t"
+ "sb %[tp2], 2(%[dst]) \n\t"
+ "sb %[p2], 3(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
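
The filter45 word computed at the top of this function reinterprets the two int16_t taps filter_x0[3..4] as one 32-bit operand, so that a single dpa.w.ph accumulates both tap products for a pixel at once; preceu.ph.qbr/qbl supply the matching packed pixel halfwords. A scalar model of one such packed multiply-accumulate, assuming the little-endian layout these DSPr2 builds use (tap 3 in the low halfword):

#include <stdint.h>

/* One dpa.w.ph: two packed pixels times two packed taps, accumulated. */
static int64_t dpa_w_ph_model(int64_t acc, uint16_t px_lo, uint16_t px_hi,
                              uint32_t filter45) {
  const int16_t tap3 = (int16_t)(filter45 & 0xffff);  /* filter_x0[3] */
  const int16_t tap4 = (int16_t)(filter45 >> 16);     /* filter_x0[4] */
  return acc + (int64_t)px_lo * tap3 + (int64_t)px_hi * tap4;
}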
+
+static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4;
+ uint32_t st0, st1;
+ const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st0], 0(%[dst]) \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 2(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[st0], 4(%[dst]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[st1], 1(%[dst]) \n\t"
+ "sb %[st0], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 3(%[dst]) \n\t"
+ "sb %[p2], 5(%[dst]) \n\t"
+ "sb %[p1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+ [st0] "=&r" (st0), [st1] "=&r" (st1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
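
The balign instructions above save a second round of unaligned loads for the odd pixels: the word starting at src + 1 is spliced out of the two aligned words already held in registers. Assuming the DSP ASE definition of balign rt, rs, 3 on a little-endian target, rt = (rt << 24) | (rs >> 8), a C model:

#include <stdint.h>

/* balign hi, lo, 3: the unaligned word one byte into the window formed
 * by two consecutive little-endian words (lo at addr, hi at addr + 4). */
static uint32_t balign3_model(uint32_t hi_word, uint32_t lo_word) {
  return (hi_word << 24) | (lo_word >> 8);
}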
+
+static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
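
The 16-wide kernel above runs in two phases within one asm block: the eight even outputs come from words loaded at src + 0/4/8/12, the eight odd outputs from words reloaded at src + 1/5/9/13, and each store is interleaved with the next pixel's multiply-accumulate so the $ac1..$ac3 accumulators stay busy. The net output schedule, modelled in plain C (bilinear_pixel is the hypothetical helper sketched earlier):

/* Per 16-pixel chunk: all even offsets first, then the odd ones. */
static void bi_horiz_16_model(const uint8_t *src, uint8_t *dst,
                              const int16_t *filter) {
  int i;
  for (i = 0; i < 16; i += 2)  /* "even 1".."even 8" in the asm comments */
    dst[i] = bilinear_pixel(src, i, filter);
  for (i = 1; i < 16; i += 2)  /* "odd 1".."odd 8" in the asm comments */
    dst[i] = bilinear_pixel(src, i, filter);
}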
+
+static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+ vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (16 == x_step_q4) {
+ uint32_t pos = 38;
+
+ vp9_prefetch_load((const uint8_t *)filter_x);
+
+    /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ case 8:
+ convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ case 16:
+ convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h, 1);
+ break;
+ case 32:
+ convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h, 2);
+ break;
+ case 64:
+ vp9_prefetch_load(src + 64);
+ vp9_prefetch_store(dst + 32);
+
+ convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ default:
+ vp9_convolve8_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+}
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
new file mode 100644
index 0000000..bf01f11
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_vert_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride),
+ [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
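
In this vertical kernel each ulw fetches four horizontally adjacent pixels from one row; precrq.ph.w and append then re-pair the unpacked halfwords so each packed operand holds the two vertical neighbours of a single output pixel, letting one dpa.w.ph apply both taps. Scalar equivalent of one output pixel, reusing the rounding/clamp model sketched earlier:

/* One vertical bilinear pixel: taps 3 and 4 over two consecutive rows. */
static uint8_t bi_vert_pixel(const uint8_t *src, int src_stride, int x,
                             const int16_t *filter_y) {
  const int sum = src[x] * filter_y[3] + src[x + src_stride] * filter_y[4];
  const int t = (sum + 64) >> 7;  /* mtlo bias + extp shift (FILTER_BITS) */
  return (t < 0) ? 0 : (t > 255) ? 255 : (uint8_t)t;
}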
+
+static void convolve_bi_vert_64_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride),
+ [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (16 == y_step_q4) {
+ uint32_t pos = 38;
+
+    /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+      case 4:
+      case 8:
+      case 16:
+      case 32:
+ convolve_bi_vert_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, w, h);
+ break;
+      case 64:
+ vp9_prefetch_store(dst + 32);
+ convolve_bi_vert_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, h);
+ break;
+ default:
+ vp9_convolve8_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+}
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
new file mode 100644
index 0000000..ab18490
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
@@ -0,0 +1,695 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_avg_vert_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
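
Functionally the block above is a full 8-tap vertical filter: the four packed vectorNb words cover taps 0..7 across eight source rows, starting three rows above the output because of the src -= 3 * src_stride adjustment, and the filtered result is blended with the byte already in dst by addqh_r.w, a rounded halving add. A scalar model of one output pixel (src here already points at the first tap row):

/* 8-tap vertical filter, averaged with the pixel already in dst. */
static uint8_t avg_vert_pixel(const uint8_t *src, int src_stride, int x,
                              const int16_t *filter_y, uint8_t dst_old) {
  int k, sum = 64;                        /* rounding bias, as in mtlo */
  for (k = 0; k < 8; ++k)
    sum += src[x + k * src_stride] * filter_y[k];
  sum >>= 7;                              /* extp shift (FILTER_BITS) */
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;               /* vp9_ff_cropTbl clamp */
  return (uint8_t)((sum + dst_old + 1) >> 1);  /* addqh_r.w average */
}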
+
+static void convolve_avg_vert_64_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+ vp9_prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (((const int32_t *)filter_y)[1] == 0x800000) {
+ vp9_convolve_avg(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else if (((const int32_t *)filter_y)[0] == 0) {
+ vp9_convolve2_avg_vert_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else {
+ if (16 == y_step_q4) {
+ uint32_t pos = 38;
+
+      /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_avg_vert_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, w, h);
+ break;
+ case 64:
+ vp9_prefetch_store(dst + 32);
+ convolve_avg_vert_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, h);
+ break;
+ default:
+ vp9_convolve8_avg_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_avg_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+ }
+}
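
The two packed-word tests at the top of this dispatcher read the int16_t filter as int32_t words, pairing taps (0,1), (2,3), (4,5), (6,7). On a little-endian target, word 1 == 0x800000 means taps 2 and 3 are {0, 128}, i.e. the unit "copy" filter, so a plain destination average suffices; word 0 == 0 means taps 0 and 1 are both zero, which routes to the short 2-tap path. The same predicates written tap-by-tap, as a hedged sketch:

/* Tap-level equivalents of the packed-word tests (little-endian layout). */
static int is_copy_filter(const int16_t *filter) {
  return filter[2] == 0 && filter[3] == 128;  /* word 1 == 0x800000 */
}

static int is_two_tap_filter(const int16_t *filter) {
  return filter[0] == 0 && filter[1] == 0;    /* word 0 == 0 */
}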
+
+void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ /* Fixed size intermediate buffer places limits on parameters. */
+ DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
+ int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
+
+ assert(w <= 64);
+ assert(h <= 64);
+
+ if (intermediate_height < h)
+ intermediate_height = h;
+
+ if (x_step_q4 != 16 || y_step_q4 != 16)
+ return vp9_convolve8_avg_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+
+ vp9_convolve8_horiz(src - (src_stride * 3), src_stride,
+ temp, 64,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, intermediate_height);
+
+ vp9_convolve8_avg_vert(temp + 64 * 3, 64,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+}
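
The function above is the standard separable two-pass scheme: a horizontal pass into a 64-byte-stride scratch buffer, then a vertical averaging pass over it. The horizontal pass starts three rows above the output (src - src_stride * 3) and produces intermediate_height rows so the 8-tap vertical filter always has its three-above/four-below support, and the vertical pass skips the three lead-in rows again via temp + 64 * 3. The row budget in isolation:

/* Rows the scratch buffer must hold ahead of an 8-tap vertical pass. */
static int intermediate_rows(int h, int y_step_q4) {
  int rows = ((h * y_step_q4) >> 4) + 7;  /* +7 extra rows for 8 taps */
  return (rows < h) ? h : rows;           /* same lower bound as above */
}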
+
+void vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+ uint32_t tp1, tp2, tn1;
+ uint32_t tp3, tp4, tn2;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ /* 1 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+
+ : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
+ [tp2] "=&r" (tp2)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 8:
+ /* 2 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 16:
+ /* 4 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 8(%[src]) \n\t"
+ "ulw %[tp2], 8(%[dst]) \n\t"
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 12(%[src]) \n\t"
+ "ulw %[tp4], 12(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 32:
+ /* 8 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 8(%[src]) \n\t"
+ "ulw %[tp2], 8(%[dst]) \n\t"
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 12(%[src]) \n\t"
+ "ulw %[tp4], 12(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 16(%[src]) \n\t"
+ "ulw %[tp2], 16(%[dst]) \n\t"
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 20(%[src]) \n\t"
+ "ulw %[tp4], 20(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 24(%[src]) \n\t"
+ "ulw %[tp2], 24(%[dst]) \n\t"
+ "sw %[tn1], 16(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 20(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 28(%[src]) \n\t"
+ "ulw %[tp4], 28(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 24(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 28(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ case 64:
+ vp9_prefetch_load(src + 64);
+ vp9_prefetch_store(dst + 32);
+
+ /* 16 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_load(src + src_stride + 64);
+ vp9_prefetch_store(dst + dst_stride);
+ vp9_prefetch_store(dst + dst_stride + 32);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 0(%[dst]) \n\t"
+ "ulw %[tp3], 4(%[src]) \n\t"
+ "ulw %[tp4], 4(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 8(%[src]) \n\t"
+ "ulw %[tp2], 8(%[dst]) \n\t"
+ "sw %[tn1], 0(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 4(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 12(%[src]) \n\t"
+ "ulw %[tp4], 12(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 16(%[src]) \n\t"
+ "ulw %[tp2], 16(%[dst]) \n\t"
+ "sw %[tn1], 8(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 12(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 20(%[src]) \n\t"
+ "ulw %[tp4], 20(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 24(%[src]) \n\t"
+ "ulw %[tp2], 24(%[dst]) \n\t"
+ "sw %[tn1], 16(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 20(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 28(%[src]) \n\t"
+ "ulw %[tp4], 28(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 32(%[src]) \n\t"
+ "ulw %[tp2], 32(%[dst]) \n\t"
+ "sw %[tn1], 24(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 28(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 36(%[src]) \n\t"
+ "ulw %[tp4], 36(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 40(%[src]) \n\t"
+ "ulw %[tp2], 40(%[dst]) \n\t"
+ "sw %[tn1], 32(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 36(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 44(%[src]) \n\t"
+ "ulw %[tp4], 44(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 48(%[src]) \n\t"
+ "ulw %[tp2], 48(%[dst]) \n\t"
+ "sw %[tn1], 40(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 44(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 52(%[src]) \n\t"
+ "ulw %[tp4], 52(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "ulw %[tp1], 56(%[src]) \n\t"
+ "ulw %[tp2], 56(%[dst]) \n\t"
+ "sw %[tn1], 48(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 52(%[dst]) \n\t" /* store */
+ "ulw %[tp3], 60(%[src]) \n\t"
+ "ulw %[tp4], 60(%[dst]) \n\t"
+ "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
+ "sw %[tn1], 56(%[dst]) \n\t" /* store */
+ "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
+ "sw %[tn2], 60(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ default:
+ for (y = h; y > 0; --y) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = (dst[x] + src[x] + 1) >> 1;
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+}
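
Each adduh_r.qb above is the SIMD form of the scalar tail in the default case: it averages four packed bytes at once with round-to-nearest, (a + b + 1) >> 1 per lane. A C model of one such instruction:

#include <stdint.h>

/* Per-byte rounded halving add, as adduh_r.qb performs on a 32-bit word. */
static uint32_t adduh_r_qb_model(uint32_t a, uint32_t b) {
  uint32_t result = 0;
  int lane;
  for (lane = 0; lane < 4; ++lane) {
    const uint32_t av = (a >> (8 * lane)) & 0xff;
    const uint32_t bv = (b >> (8 * lane)) & 0xff;
    result |= ((av + bv + 1) >> 1) << (8 * lane);
  }
  return result;
}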
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
new file mode 100644
index 0000000..69da1cf
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
@@ -0,0 +1,1038 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4;
+ uint32_t n1, n2, n3, n4;
+ uint32_t tn1, tn2;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "preceu.ph.qbl %[n4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[n1], %[tn1] \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
+ "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
+ "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
+
+ /* clamp and average */
+ "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
+ "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */
+ "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
+
+ "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
+ "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
+
+ "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */
+ "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
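+
+/* Editor's note: a hypothetical scalar model of the function above, not part
+ * of the original patch. It assumes clip_pixel() from vp9_common.h and that
+ * the caller has already stepped src back by 3 taps, as
+ * vp9_convolve8_avg_horiz_dspr2 below does: */
+#if 0
+static void convolve_avg_horiz_4_ref(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int16_t *filter, int32_t h) {
+  int x, y, k;
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < 4; ++x) {
+      int sum = 64;  /* rounding term, matches mtlo %[vector4a] */
+
+      for (k = 0; k < 8; ++k)
+        sum += src[x + k] * filter[k];
+
+      /* crop-table lookup (lbux) then rounded average (addqh_r.w) */
+      dst[x] = (dst[x] + clip_pixel(sum >> 7) + 1) >> 1;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+#endif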
+
+static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4, n1;
+ uint32_t tn1, tn2, tn3;
+ uint32_t st0, st1;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+ "lbu %[Temp2], 0(%[dst]) \n\t"
+ "lbu %[tn3], 2(%[dst]) \n\t"
+
+ /* even 2. pixel */
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "ulw %[tn1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[tn1] \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+ "addqh_r.w %[tn3], %[tn3], %[st1] \n\t"
+ "sb %[Temp2], 0(%[dst]) \n\t"
+ "sb %[tn3], 2(%[dst]) \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "balign %[tn3], %[tn1], 3 \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "lbu %[Temp2], 4(%[dst]) \n\t"
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[Temp2], 4(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tn2] \n\t"
+ "preceu.ph.qbl %[p4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp1], 6(%[dst]) \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn1] \n\t"
+ "preceu.ph.qbl %[n1], %[tn1] \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ "lbu %[tp2], 1(%[dst]) \n\t"
+ "lbu %[tn2], 3(%[dst]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "preceu.ph.qbr %[p2], %[tn3] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
+ "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tn3], 5(%[dst]) \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp2], 1(%[dst]) \n\t"
+ "sb %[tp1], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbu %[tn1], 7(%[dst]) \n\t"
+
+ /* clamp and average */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "addqh_r.w %[tn2], %[tn2], %[p4] \n\t"
+
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[tn3], %[tn3], %[p2] \n\t"
+
+ "lbux %[n1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[tn1], %[tn1], %[n1] \n\t"
+
+ /* store bytes */
+ "sb %[tn2], 3(%[dst]) \n\t"
+ "sb %[tn3], 5(%[dst]) \n\t"
+ "sb %[tn1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
+ [st0] "=&r" (st0), [st1] "=&r" (st1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [n1] "=&r" (n1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
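+
+/* Editor's note (not part of the original patch): the asm above produces the
+ * eight outputs in two interleaved phases -- even pixels 0,2,4,6 from the
+ * aligned loads at 0/4/8/12(src) and odd pixels 1,3,5,7 from the loads at
+ * 1/5/9/13(src) -- so each preceu.ph.qbr/qbl pair feeds two taps at once.
+ * Both phases apply the same 8-tap filter; only the source offset differs. */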
+
+static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter12] "r" (filter12), [filter34] "r" (filter34),
+ [filter56] "r" (filter56), [filter78] "r" (filter78),
+ [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
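+
+/* Editor's note: a hypothetical usage note, not part of the original patch.
+ * count is the number of 16-pixel column groups per row; the dispatcher
+ * below passes 1 for w == 16 and 2 for w == 32: */
+#if 0
+convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
+                            filter_x, h, 1);  /* w == 16 */
+convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
+                            filter_x, h, 2);  /* w == 32 */
+#endif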
+
+static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+ vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter12] "r" (filter12), [filter34] "r" (filter34),
+ [filter56] "r" (filter56), [filter78] "r" (filter78),
+ [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
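+
+/* Editor's note (not part of the original patch): this is the 16-wide kernel
+ * above fixed at four column groups (w == 64), with the extra prefetches a
+ * 64-byte row needs. */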
+
+void vp9_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (((const int32_t *)filter_x)[1] == 0x800000) {
+ vp9_convolve_avg(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else if (((const int32_t *)filter_x)[0] == 0) {
+ vp9_convolve2_avg_horiz_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else {
+ if (16 == x_step_q4) {
+ uint32_t pos = 38;
+
+ src -= 3;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_avg_horiz_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ case 8:
+ convolve_avg_horiz_8_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ case 16:
+ convolve_avg_horiz_16_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h, 1);
+ break;
+ case 32:
+ convolve_avg_horiz_16_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h, 2);
+ break;
+ case 64:
+ vp9_prefetch_load(src + 64);
+ vp9_prefetch_store(dst + 32);
+
+ convolve_avg_horiz_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ default:
+ vp9_convolve8_avg_horiz_c(src + 3, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_avg_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+ }
+}
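+
+/* Editor's note: a hypothetical reading of the packed-word compares above,
+ * not part of the original patch. Each int32 load packs two little-endian
+ * int16 taps, so
+ *   ((const int32_t *)filter_x)[1] == 0x800000  <=>  taps 2,3 == 0,128,
+ * the unit "copy" filter, for which plain averaging suffices, and
+ *   ((const int32_t *)filter_x)[0] == 0         <=>  taps 0,1 == 0,0,
+ * for which the 2-tap kernel is enough. */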
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
new file mode 100644
index 0000000..0ef9dd5
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
@@ -0,0 +1,1284 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+uint8_t vp9_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
+uint8_t *vp9_ff_cropTbl;
+
+void vp9_dsputil_static_init(void) {
+ int i;
+
+ for (i = 0; i < 256; i++) vp9_ff_cropTbl_a[i + CROP_WIDTH] = i;
+
+ for (i = 0; i < CROP_WIDTH; i++) {
+ vp9_ff_cropTbl_a[i] = 0;
+ vp9_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
+ }
+
+ vp9_ff_cropTbl = &vp9_ff_cropTbl_a[CROP_WIDTH];
+}
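+
+/* Editor's note (not part of the original patch): after this init,
+ * vp9_ff_cropTbl[v] equals clip_pixel(v) for any v in
+ * [-CROP_WIDTH, 256 + CROP_WIDTH), so the asm kernels clamp with a single
+ * indexed byte load (lbux) instead of two compares and branches. */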
+
+static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint8_t *dst_ptr;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4;
+ uint32_t tn1, tn2;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ dst_ptr = dst;
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tn2] \n\t"
+ "preceu.ph.qbl %[p4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[p2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tn1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+ [dst_ptr] "+r" (dst_ptr)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
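+
+/* Editor's note: a hypothetical scalar model of the transposed store above,
+ * not part of the original patch (assumes clip_pixel() from vp9_common.h):
+ */
+#if 0
+static void convolve_horiz_4_transposed_ref(const uint8_t *src,
+                                            int32_t src_stride, uint8_t *dst,
+                                            int32_t dst_stride,
+                                            const int16_t *filter, int32_t h) {
+  int x, y, k;
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < 4; ++x) {
+      int sum = 64;
+
+      for (k = 0; k < 8; ++k)
+        sum += src[y * src_stride + x + k] * filter[k];
+
+      /* output row x, column y: convolve and transpose in one pass */
+      dst[x * dst_stride + y] = clip_pixel(sum >> 7);
+    }
+  }
+}
+#endif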
+
+static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint8_t *dst_ptr;
+ uint32_t vector4a = 64;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4, n1;
+ uint8_t *odd_dst;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+
+ dst_ptr = dst;
+ odd_dst = (dst_ptr + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp2], 0(%[src]) \n\t"
+ "ulw %[tp1], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp1] \n\t"
+ "preceu.ph.qbl %[p4], %[tp1] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "preceu.ph.qbr %[p1], %[tp3] \n\t"
+ "preceu.ph.qbl %[n1], %[tp3] \n\t"
+ "ulw %[tp2], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
+ "lbux %[tp3], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
+ "extp %[p3], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[Temp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "sb %[tp3], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "ulw %[tp1], 1(%[src]) \n\t"
+ "ulw %[tp3], 5(%[src]) \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[tp2], %[p3](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "ulw %[tp2], 9(%[src]) \n\t"
+
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n1], %[tp2] \n\t"
+ "ulw %[Temp1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[tp3], %[Temp2](%[cm]) \n\t"
+ "preceu.ph.qbr %[p2], %[Temp1] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp3], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[n1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p2], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[n1], 0(%[odd_dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [n1] "=&r" (n1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a), [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
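+
+/* Editor's note (not part of the original patch): the asm above walks two
+ * transposed output columns at once -- even results go down dst_ptr, odd
+ * results down odd_dst = dst_ptr + dst_stride, and both step by
+ * dst_pitch_2 = 2 * dst_stride per stored byte. */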
+
+static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "ulw %[qload2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+ : [filter12] "r" (filter12), [filter34] "r" (filter34),
+ [filter56] "r" (filter56), [filter78] "r" (filter78),
+ [vector_64] "r" (vector_64), [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+
+ dst_ptr += 1;
+ }
+}
+
+static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
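+  /* the output is stored transposed: even-indexed pixels land on every
+     second row starting at dst, odd-indexed pixels one row lower via
+     odd_dst, with both pointers stepping by dst_pitch_2 */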
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
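+  /* each int32 read from the filter packs two adjacent int16 taps, so a
+     single dpa.w.ph applies two taps per instruction; this relies on the
+     8-tap filter array being 4-byte aligned */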
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "ulw %[qload2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+ : [filter12] "r" (filter12), [filter34] "r" (filter34),
+ [filter56] "r" (filter56), [filter78] "r" (filter78),
+ [vector_64] "r" (vector_64), [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+
+ dst_ptr += 1;
+ }
+}
+
+void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter, int w, int h) {
+ int x, y, k;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int sum = 0;
+
+ for (k = 0; k < 8; ++k)
+ sum += src[x + k] * filter[k];
+
+ dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
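+
+/* The scalar fallback above also documents the arithmetic every DSPR2
+ * kernel in this file reproduces: an 8-tap multiply-accumulate, rounded
+ * and scaled by FILTER_BITS (7), then clamped to [0, 255]. As a sanity
+ * check, assuming the usual VP9 invariant that the taps sum to 128: for
+ * a flat input src[x + k] == v the sum is 128 * v, and
+ * clip_pixel(ROUND_POWER_OF_TWO(128 * v, 7)) == v, so flat regions pass
+ * through unchanged. */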
+
+void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x * dst_stride] = src[x];
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
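+  /* Two-pass filtering: the horizontal pass writes its output transposed
+     into temp (row stride intermediate_height), so the vertical pass can
+     reuse the same transposed-horizontal kernels and transpose the data
+     back into dst as it stores. */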
+ DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
+ int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
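+  /* the 8-tap vertical filter needs 7 extra rows of horizontally
+     filtered data (3 above and 4 below each output row), hence the +7 */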
+ uint32_t pos = 38;
+
+  /* bit position for extract from acc: pos 38 with extp's size of 31
+     selects accumulator bits 38..7, a right shift by FILTER_BITS (7);
+     combined with the 64 preloaded into each accumulator this yields
+     ROUND_POWER_OF_TWO(sum, FILTER_BITS) */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ if (intermediate_height < h)
+ intermediate_height = h;
+
+ if (x_step_q4 != 16 || y_step_q4 != 16)
+ return vp9_convolve8_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+
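+  /* The int16 taps are also read as int32 pairs: element [1] holds taps
+     2 and 3, so 0x800000 (taps {0, 128} read little-endian) means tap 3
+     carries the whole 128 gain and the filter is an identity, while
+     element [0] == 0 means taps 0 and 1 are zero, i.e. the bilinear
+     layout served by the 2-tap vp9_convolve2_dspr2 path below. */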
+ if ((((const int32_t *)filter_x)[1] == 0x800000)
+ && (((const int32_t *)filter_y)[1] == 0x800000))
+ return vp9_convolve_copy(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+
+  /* first pass: filter horizontally into the transposed temp buffer
+     (reduces to a plain copy when filter_x is the identity filter) */
+ if (filter_x[3] == 0x80) {
+ copy_horiz_transposed(src - src_stride * 3, src_stride,
+ temp, intermediate_height,
+ w, intermediate_height);
+ } else if (((const int32_t *)filter_x)[0] == 0) {
+ vp9_convolve2_dspr2(src - src_stride * 3, src_stride,
+ temp, intermediate_height,
+ filter_x,
+ w, intermediate_height);
+ } else {
+ src -= (src_stride * 3 + 3);
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+
+ switch (w) {
+ case 4:
+ convolve_horiz_4_transposed_dspr2(src, src_stride,
+ temp, intermediate_height,
+ filter_x, intermediate_height);
+ break;
+ case 8:
+ convolve_horiz_8_transposed_dspr2(src, src_stride,
+ temp, intermediate_height,
+ filter_x, intermediate_height);
+ break;
+ case 16:
+ case 32:
+ convolve_horiz_16_transposed_dspr2(src, src_stride,
+ temp, intermediate_height,
+ filter_x, intermediate_height,
+                                           (w / 16));
+ break;
+ case 64:
+ vp9_prefetch_load(src + 32);
+ convolve_horiz_64_transposed_dspr2(src, src_stride,
+ temp, intermediate_height,
+ filter_x, intermediate_height);
+ break;
+ default:
+ convolve_horiz_transposed(src, src_stride,
+ temp, intermediate_height,
+ filter_x, w, intermediate_height);
+ break;
+ }
+ }
+
+  /* second pass: filter the transposed temp buffer to produce dst,
+     transposing back (again a plain copy for an identity filter_y) */
+ if (filter_y[3] == 0x80) {
+ copy_horiz_transposed(temp + 3, intermediate_height,
+ dst, dst_stride,
+ h, w);
+ } else if (((const int32_t *)filter_y)[0] == 0) {
+ vp9_convolve2_dspr2(temp + 3, intermediate_height,
+ dst, dst_stride,
+ filter_y,
+ h, w);
+ } else {
+ switch (h) {
+ case 4:
+ convolve_horiz_4_transposed_dspr2(temp, intermediate_height,
+ dst, dst_stride,
+ filter_y, w);
+ break;
+ case 8:
+ convolve_horiz_8_transposed_dspr2(temp, intermediate_height,
+ dst, dst_stride,
+ filter_y, w);
+ break;
+ case 16:
+ case 32:
+ convolve_horiz_16_transposed_dspr2(temp, intermediate_height,
+ dst, dst_stride,
+                                           filter_y, w, (h / 16));
+ break;
+ case 64:
+ convolve_horiz_64_transposed_dspr2(temp, intermediate_height,
+ dst, dst_stride,
+ filter_y, w);
+ break;
+ default:
+ convolve_horiz_transposed(temp, intermediate_height,
+ dst, dst_stride,
+ filter_y, h, w);
+ break;
+ }
+ }
+}
+
+void vp9_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+ vp9_prefetch_store(dst);
+
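+  /* loads below use ulw (unaligned), but stores use sw, so the
+     specialized widths assume a 4-byte-aligned dst */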
+ switch (w) {
+ case 4:
+ {
+ uint32_t tp1;
+
+ /* 1 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], (%[src]) \n\t"
+ "sw %[tp1], (%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ break;
+ case 8:
+ {
+ uint32_t tp1, tp2;
+
+ /* 2 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ break;
+ case 16:
+ {
+ uint32_t tp1, tp2, tp3, tp4;
+
+ /* 4 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ break;
+ case 32:
+ {
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t tp5, tp6, tp7, tp8;
+
+ /* 8 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+ "ulw %[tp5], 16(%[src]) \n\t"
+ "ulw %[tp6], 20(%[src]) \n\t"
+ "ulw %[tp7], 24(%[src]) \n\t"
+ "ulw %[tp8], 28(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+ "sw %[tp5], 16(%[dst]) \n\t" /* store */
+ "sw %[tp6], 20(%[dst]) \n\t" /* store */
+ "sw %[tp7], 24(%[dst]) \n\t" /* store */
+ "sw %[tp8], 28(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
+ [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ break;
+ case 64:
+ {
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t tp5, tp6, tp7, tp8;
+
+ vp9_prefetch_load(src + 64);
+ vp9_prefetch_store(dst + 32);
+
+ /* 16 word storage */
+ for (y = h; y--; ) {
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_load(src + src_stride + 64);
+ vp9_prefetch_store(dst + dst_stride);
+ vp9_prefetch_store(dst + dst_stride + 32);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+ "ulw %[tp5], 16(%[src]) \n\t"
+ "ulw %[tp6], 20(%[src]) \n\t"
+ "ulw %[tp7], 24(%[src]) \n\t"
+ "ulw %[tp8], 28(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+ "sw %[tp5], 16(%[dst]) \n\t" /* store */
+ "sw %[tp6], 20(%[dst]) \n\t" /* store */
+ "sw %[tp7], 24(%[dst]) \n\t" /* store */
+ "sw %[tp8], 28(%[dst]) \n\t" /* store */
+
+ "ulw %[tp1], 32(%[src]) \n\t"
+ "ulw %[tp2], 36(%[src]) \n\t"
+ "ulw %[tp3], 40(%[src]) \n\t"
+ "ulw %[tp4], 44(%[src]) \n\t"
+ "ulw %[tp5], 48(%[src]) \n\t"
+ "ulw %[tp6], 52(%[src]) \n\t"
+ "ulw %[tp7], 56(%[src]) \n\t"
+ "ulw %[tp8], 60(%[src]) \n\t"
+
+ "sw %[tp1], 32(%[dst]) \n\t" /* store */
+ "sw %[tp2], 36(%[dst]) \n\t" /* store */
+ "sw %[tp3], 40(%[dst]) \n\t" /* store */
+ "sw %[tp4], 44(%[dst]) \n\t" /* store */
+ "sw %[tp5], 48(%[dst]) \n\t" /* store */
+ "sw %[tp6], 52(%[dst]) \n\t" /* store */
+ "sw %[tp7], 56(%[dst]) \n\t" /* store */
+ "sw %[tp8], 60(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
+ [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ break;
+ default:
+ for (y = h; y--; ) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = src[x];
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+}
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
new file mode 100644
index 0000000..0303896
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
@@ -0,0 +1,923 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_horiz_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4;
+ uint32_t n1, n2, n3, n4;
+ uint32_t tn1, tn2;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "preceu.ph.qbl %[n4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[n1], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst]) \n\t"
+ "sb %[tn1], 1(%[dst]) \n\t"
+ "sb %[tp2], 2(%[dst]) \n\t"
+ "sb %[n2], 3(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_horiz_8_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4, n1;
+ uint32_t tn1, tn2, tn3;
+ uint32_t st0, st1;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "ulw %[tn1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[tn1] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st0], 0(%[dst]) \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+
+ "balign %[tn3], %[tn1], 3 \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 2(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tn2] \n\t"
+ "preceu.ph.qbl %[p4], %[tn2] \n\t"
+ "sb %[st0], 4(%[dst]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn1] \n\t"
+ "preceu.ph.qbl %[n1], %[tn1] \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "preceu.ph.qbr %[p2], %[tn3] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[st1], 1(%[dst]) \n\t"
+ "sb %[st0], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[n1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 3(%[dst]) \n\t"
+ "sb %[p2], 5(%[dst]) \n\t"
+ "sb %[n1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
+ [st0] "=&r" (st0), [st1] "=&r" (st1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [n1] "=&r" (n1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter12] "r" (filter12), [filter34] "r" (filter34),
+ [filter56] "r" (filter56), [filter78] "r" (filter78),
+ [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst),
+ [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+ vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter12] "r" (filter12), [filter34] "r" (filter34),
+ [filter56] "r" (filter56), [filter78] "r" (filter78),
+ [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst),
+ [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void vp9_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
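+  /* tap pair {2,3} == {0, 128} marks the identity filter (plain copy);
+     a zero first tap pair marks a bilinear-style filter with only the
+     middle taps non-zero, served by the cheaper 2-tap path */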
+ if (((const int32_t *)filter_x)[1] == 0x800000) {
+ vp9_convolve_copy(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else if (((const int32_t *)filter_x)[0] == 0) {
+ vp9_convolve2_horiz_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else {
+ if (16 == x_step_q4) {
+ uint32_t pos = 38;
+
+ vp9_prefetch_load((const uint8_t *)filter_x);
+ src -= 3;
+
+      /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_horiz_4_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ case 8:
+ convolve_horiz_8_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ case 16:
+ convolve_horiz_16_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h, 1);
+ break;
+ case 32:
+ convolve_horiz_16_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h, 2);
+ break;
+ case 64:
+ vp9_prefetch_load(src + 64);
+ vp9_prefetch_store(dst + 32);
+
+ convolve_horiz_64_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ default:
+ vp9_convolve8_horiz_c(src + 3, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+ }
+}
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
new file mode 100644
index 0000000..0930bb3
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_vert_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
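+  /* start three rows up: the 8-tap filter reads source rows y - 3 to
+     y + 4 for output row y */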
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
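+              /* each accumulator computes one of four adjacent output
+                 columns; the precrq.ph.w / append pairs below pack two
+                 vertically adjacent samples of one column into a
+                 halfword pair for dpa.w.ph */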
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [n1] "=&r" (n1), [n2] "=&r" (n2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
+ [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
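+/* Same inner loop as convolve_vert_4_dspr2, specialized for w == 64 so
+ * the width loop bound is a constant and a second destination cache line
+ * can be prefetched per row. */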
+static void convolve_vert_64_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+ vp9_prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [n1] "=&r" (n1), [n2] "=&r" (n2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+ [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
+ [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
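+  /* filter_y holds eight int16 taps; reading it as int32 words pairs the
+   * taps up.  On this little-endian target, 0x00800000 in the second word
+   * means tap 3 == 128 (unity gain in Q7) and tap 2 == 0, which for the
+   * filters VP9 uses indicates the identity filter, so a plain copy
+   * suffices.  A zero first word (taps 0 and 1 both zero) indicates a
+   * short filter such as bilinear, handled by the cheaper 2-tap path. */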
+ if (((const int32_t *)filter_y)[1] == 0x800000) {
+ vp9_convolve_copy(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else if (((const int32_t *)filter_y)[0] == 0) {
+ vp9_convolve2_vert_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ } else {
+ if (16 == y_step_q4) {
+ uint32_t pos = 38;
+
+      /* bit position for extp extracts from the accumulators: with
+       * pos = 38 each extp returns (acc + 64) >> 7, the FILTER_BITS
+       * rounding shift (vector4a preloads the 64 into each acc) */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4 :
+ case 8 :
+ case 16 :
+ case 32 :
+ convolve_vert_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, w, h);
+ break;
+ case 64 :
+ vp9_prefetch_store(dst + 32);
+ convolve_vert_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, h);
+ break;
+ default:
+ vp9_convolve8_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+ }
+}
+
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
new file mode 100644
index 0000000..1b2f550
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
@@ -0,0 +1,1315 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
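+/* 1-D 16-point IDCT applied to no_rows rows of input.  Results are
+ * written transposed -- out[16 * c + row] holds output coefficient c of
+ * the given row -- so the column pass can read each column as a
+ * contiguous block of 16 int16 values. */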
+static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output,
+ uint32_t no_rows) {
+ int i;
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int step1_10, step1_11, step1_12, step1_13;
+ int step2_0, step2_1, step2_2, step2_3;
+ int step2_8, step2_9, step2_10, step2_11;
+ int step2_12, step2_13, step2_14, step2_15;
+ int load1, load2, load3, load4, load5, load6, load7, load8;
+ int result1, result2, result3, result4;
+ const int const_2_power_13 = 8192;
+
+ for (i = no_rows; i--; ) {
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 16));
+
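+    /* The asm blocks below split the 16-point butterfly: the first and
+     * fourth blocks build the even half (step1_0..step1_7), the second
+     * and third build the odd half (step2_8..step2_15), the fifth derives
+     * step1_10..step1_13, and the last two recombine and store the 16
+     * outputs. */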
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 16(%[input]) \n\t"
+ "lh %[load3], 8(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[step2_0], $ac1, 31 \n\t"
+ "extp %[step2_1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[step2_2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[step2_3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[step2_0], %[step2_3] \n\t"
+ "add %[step1_1], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
+ [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 2(%[input]) \n\t"
+ "lh %[load6], 30(%[input]) \n\t"
+ "lh %[load7], 18(%[input]) \n\t"
+ "lh %[load8], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_2_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_14_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_18_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_14_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_2_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_30_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "sub %[load5], %[result1], %[result2] \n\t"
+ "sub %[load6], %[result4], %[result3] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load6], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load5], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load5], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[result1], %[result2] \n\t"
+ "add %[step2_15], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
+ [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 22(%[input]) \n\t"
+ "lh %[load3], 26(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_26_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_22_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac2, %[load4], %[cospi_6_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[result2], %[result1] \n\t"
+ "sub %[load2], %[result4], %[result3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[result1], %[result2] \n\t"
+ "add %[step2_12], %[result4], %[result3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 4(%[input]) \n\t"
+ "lh %[load6], 28(%[input]) \n\t"
+ "lh %[load7], 20(%[input]) \n\t"
+ "lh %[load8], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_4_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_12_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_20_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_12_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_4_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_28_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[result4], %[result3] \n\t"
+ "sub %[load5], %[load5], %[result1] \n\t"
+ "add %[load5], %[load5], %[result2] \n\t"
+
+ "sub %[load6], %[result1], %[result2] \n\t"
+ "sub %[load6], %[load6], %[result3] \n\t"
+ "add %[load6], %[load6], %[result4] \n\t"
+
+ "madd $ac1, %[load5], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[result1], %[result2] \n\t"
+ "add %[step1_7], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "sub %[load5], %[step2_14], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+
+ "madd $ac0, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_14], %[step2_13] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "add %[load6], %[load6], %[step2_9] \n\t"
+
+ "madd $ac1, %[load6], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[step2_15], %[step2_12] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "add %[load5], %[load5], %[step2_11] \n\t"
+
+ "madd $ac2, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_15], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_11] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_10], $ac0, 31 \n\t"
+ "extp %[step1_13], $ac1, 31 \n\t"
+ "extp %[step1_11], $ac2, 31 \n\t"
+ "extp %[step1_12], $ac3, 31 \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
+ [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
+ [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+ [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
+ [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "add %[load5], %[step1_0], %[step1_7] \n\t"
+ "add %[load5], %[load5], %[step2_12] \n\t"
+ "add %[load5], %[load5], %[step2_15] \n\t"
+ "add %[load6], %[step1_1], %[step1_6] \n\t"
+ "add %[load6], %[load6], %[step2_13] \n\t"
+ "add %[load6], %[load6], %[step2_14] \n\t"
+ "sh %[load5], 0(%[output]) \n\t"
+ "sh %[load6], 32(%[output]) \n\t"
+ "sub %[load5], %[step1_1], %[step1_6] \n\t"
+ "add %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+ "sub %[load6], %[step1_0], %[step1_7] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+ "add %[load6], %[load6], %[step2_11] \n\t"
+ "sh %[load5], 192(%[output]) \n\t"
+ "sh %[load6], 224(%[output]) \n\t"
+ "sub %[load5], %[step1_0], %[step1_7] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "sub %[load5], %[load5], %[step2_11] \n\t"
+ "sub %[load6], %[step1_1], %[step1_6] \n\t"
+ "sub %[load6], %[load6], %[step2_9] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "sh %[load5], 256(%[output]) \n\t"
+ "sh %[load6], 288(%[output]) \n\t"
+ "add %[load5], %[step1_1], %[step1_6] \n\t"
+ "sub %[load5], %[load5], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_14] \n\t"
+ "add %[load6], %[step1_0], %[step1_7] \n\t"
+ "sub %[load6], %[load6], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_15] \n\t"
+ "sh %[load5], 448(%[output]) \n\t"
+ "sh %[load6], 480(%[output]) \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6)
+ : [output] "r" (output),
+ [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+ [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+ [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
+ [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
+ [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
+ [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
+ );
+
+ __asm__ __volatile__ (
+ "add %[load5], %[step1_2], %[step1_5] \n\t"
+ "add %[load5], %[load5], %[step1_13] \n\t"
+ "add %[load6], %[step1_3], %[step1_4] \n\t"
+ "add %[load6], %[load6], %[step1_12] \n\t"
+ "sh %[load5], 64(%[output]) \n\t"
+ "sh %[load6], 96(%[output]) \n\t"
+ "sub %[load5], %[step1_3], %[step1_4] \n\t"
+ "add %[load5], %[load5], %[step1_11] \n\t"
+ "sub %[load6], %[step1_2], %[step1_5] \n\t"
+ "add %[load6], %[load6], %[step1_10] \n\t"
+ "sh %[load5], 128(%[output]) \n\t"
+ "sh %[load6], 160(%[output]) \n\t"
+ "sub %[load5], %[step1_2], %[step1_5] \n\t"
+ "sub %[load5], %[load5], %[step1_10] \n\t"
+ "sub %[load6], %[step1_3], %[step1_4] \n\t"
+ "sub %[load6], %[load6], %[step1_11] \n\t"
+ "sh %[load5], 320(%[output]) \n\t"
+ "sh %[load6], 352(%[output]) \n\t"
+ "add %[load5], %[step1_3], %[step1_4] \n\t"
+ "sub %[load5], %[load5], %[step1_12] \n\t"
+ "add %[load6], %[step1_2], %[step1_5] \n\t"
+ "sub %[load6], %[load6], %[step1_13] \n\t"
+ "sh %[load5], 384(%[output]) \n\t"
+ "sh %[load6], 416(%[output]) \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6)
+ : [output] "r" (output),
+ [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+ [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+ [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+ [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
+ );
+
+ input += 16;
+ output += 1;
+ }
+}
+
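+/* Column pass: applies the same 16-point IDCT down each of the 16
+ * columns, then rounds with (x + 32) >> 6, adds the result to the
+ * destination pixels and clips through the crop table. */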
+static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int i;
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int step1_8, step1_9, step1_10, step1_11;
+ int step1_12, step1_13, step1_14, step1_15;
+ int step2_0, step2_1, step2_2, step2_3;
+ int step2_8, step2_9, step2_10, step2_11;
+ int step2_12, step2_13, step2_14, step2_15;
+ int load1, load2, load3, load4, load5, load6, load7, load8;
+ int result1, result2, result3, result4;
+ const int const_2_power_13 = 8192;
+ uint8_t *dest_pix;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 16; ++i) {
+ dest_pix = (dest + i);
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 16(%[input]) \n\t"
+ "lh %[load3], 8(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[step2_0], $ac1, 31 \n\t"
+ "extp %[step2_1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[step2_2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[step2_3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[step2_0], %[step2_3] \n\t"
+ "add %[step1_1], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
+ [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 2(%[input]) \n\t"
+ "lh %[load6], 30(%[input]) \n\t"
+ "lh %[load7], 18(%[input]) \n\t"
+ "lh %[load8], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_2_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_14_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_18_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_14_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_2_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_30_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "sub %[load5], %[result1], %[result2] \n\t"
+ "sub %[load6], %[result4], %[result3] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load6], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load5], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load5], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[result1], %[result2] \n\t"
+ "add %[step2_15], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
+ [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 22(%[input]) \n\t"
+ "lh %[load3], 26(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_26_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_22_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac2, %[load4], %[cospi_6_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[result2], %[result1] \n\t"
+ "sub %[load2], %[result4], %[result3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[result1], %[result2] \n\t"
+ "add %[step2_12], %[result4], %[result3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 4(%[input]) \n\t"
+ "lh %[load6], 28(%[input]) \n\t"
+ "lh %[load7], 20(%[input]) \n\t"
+ "lh %[load8], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_4_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_12_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_20_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_12_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_4_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_28_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[result4], %[result3] \n\t"
+ "sub %[load5], %[load5], %[result1] \n\t"
+ "add %[load5], %[load5], %[result2] \n\t"
+
+ "sub %[load6], %[result1], %[result2] \n\t"
+ "sub %[load6], %[load6], %[result3] \n\t"
+ "add %[load6], %[load6], %[result4] \n\t"
+
+ "madd $ac1, %[load5], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+
+ "add %[step1_4], %[result1], %[result2] \n\t"
+ "add %[step1_7], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "sub %[load5], %[step2_14], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+
+ "madd $ac0, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_14], %[step2_13] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "add %[load6], %[load6], %[step2_9] \n\t"
+
+ "madd $ac1, %[load6], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[step2_15], %[step2_12] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "add %[load5], %[load5], %[step2_11] \n\t"
+
+ "madd $ac2, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_15], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_11] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_10], $ac0, 31 \n\t"
+ "extp %[step1_13], $ac1, 31 \n\t"
+ "extp %[step1_11], $ac2, 31 \n\t"
+ "extp %[step1_12], $ac3, 31 \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
+ [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
+ [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+ [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
+ [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
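+    /* the remaining stage-1 terms are simple sums, so they are computed
+     * in plain C rather than in the asm above */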
+ step1_8 = step2_8 + step2_11;
+ step1_9 = step2_9 + step2_10;
+ step1_14 = step2_13 + step2_14;
+ step1_15 = step2_12 + step2_15;
+
+ __asm__ __volatile__ (
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_0], %[step1_7] \n\t"
+ "add %[load5], %[load5], %[step1_15] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_1], %[step1_6] \n\t"
+ "add %[load6], %[load6], %[step1_14] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_2], %[step1_5] \n\t"
+ "add %[load5], %[load5], %[step1_13] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_3], %[step1_4] \n\t"
+ "add %[load6], %[load6], %[step1_12] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_3], %[step1_4] \n\t"
+ "add %[load5], %[load5], %[step1_11] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_2], %[step1_5] \n\t"
+ "add %[load6], %[load6], %[step1_10] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "sub %[load5], %[step1_1], %[step1_6] \n\t"
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[load5], %[step1_9] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_0], %[step1_7] \n\t"
+ "add %[load6], %[load6], %[step1_8] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_0], %[step1_7] \n\t"
+ "sub %[load5], %[load5], %[step1_8] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_1], %[step1_6] \n\t"
+ "sub %[load6], %[load6], %[step1_9] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_2], %[step1_5] \n\t"
+ "sub %[load5], %[load5], %[step1_10] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_3], %[step1_4] \n\t"
+ "sub %[load6], %[load6], %[step1_11] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_3], %[step1_4] \n\t"
+ "sub %[load5], %[load5], %[step1_12] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_2], %[step1_5] \n\t"
+ "sub %[load6], %[load6], %[step1_13] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_1], %[step1_6] \n\t"
+ "sub %[load5], %[load5], %[step1_14] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_0], %[step1_7] \n\t"
+ "sub %[load6], %[load6], %[step1_15] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),
+ [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+ [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+ [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+ [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+ [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
+ [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+ [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
+ [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
+ );
+
+ input += 16;
+ }
+}
+
+void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ uint32_t pos = 45;
+
+  /* bit position for extp extracts from the accumulators: with pos = 45
+   * each extp returns (acc + 8192) >> 14, the DCT_CONST_BITS rounding
+   * shift (const_2_power_13 preloads the 8192) */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // First transform rows
+ idct16_1d_rows_dspr2(input, out, 16);
+
+ // Then transform columns and add to dest
+ idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
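+/* 16-point inverse ADST, kept in plain C: four stages of butterflies
+ * with dct_const_round_shift applied wherever a cospi multiply occurs. */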
+static void iadst16_1d(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+
+ int x0 = input[15];
+ int x1 = input[0];
+ int x2 = input[13];
+ int x3 = input[2];
+ int x4 = input[11];
+ int x5 = input[4];
+ int x6 = input[9];
+ int x7 = input[6];
+ int x8 = input[7];
+ int x9 = input[8];
+ int x10 = input[5];
+ int x11 = input[10];
+ int x12 = input[3];
+ int x13 = input[12];
+ int x14 = input[1];
+ int x15 = input[14];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+ | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+ output[0] = output[1] = output[2] = output[3] = output[4]
+ = output[5] = output[6] = output[7] = output[8]
+ = output[9] = output[10] = output[11] = output[12]
+ = output[13] = output[14] = output[15] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = dct_const_round_shift(s0 + s8);
+ x1 = dct_const_round_shift(s1 + s9);
+ x2 = dct_const_round_shift(s2 + s10);
+ x3 = dct_const_round_shift(s3 + s11);
+ x4 = dct_const_round_shift(s4 + s12);
+ x5 = dct_const_round_shift(s5 + s13);
+ x6 = dct_const_round_shift(s6 + s14);
+ x7 = dct_const_round_shift(s7 + s15);
+ x8 = dct_const_round_shift(s0 - s8);
+ x9 = dct_const_round_shift(s1 - s9);
+ x10 = dct_const_round_shift(s2 - s10);
+ x11 = dct_const_round_shift(s3 - s11);
+ x12 = dct_const_round_shift(s4 - s12);
+ x13 = dct_const_round_shift(s5 - s13);
+ x14 = dct_const_round_shift(s6 - s14);
+ x15 = dct_const_round_shift(s7 - s15);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = s0 + s4;
+ x1 = s1 + s5;
+ x2 = s2 + s6;
+ x3 = s3 + s7;
+ x4 = s0 - s4;
+ x5 = s1 - s5;
+ x6 = s2 - s6;
+ x7 = s3 - s7;
+ x8 = dct_const_round_shift(s8 + s12);
+ x9 = dct_const_round_shift(s9 + s13);
+ x10 = dct_const_round_shift(s10 + s14);
+ x11 = dct_const_round_shift(s11 + s15);
+ x12 = dct_const_round_shift(s8 - s12);
+ x13 = dct_const_round_shift(s9 - s13);
+ x14 = dct_const_round_shift(s10 - s14);
+ x15 = dct_const_round_shift(s11 - s15);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = dct_const_round_shift(s4 + s6);
+ x5 = dct_const_round_shift(s5 + s7);
+ x6 = dct_const_round_shift(s4 - s6);
+ x7 = dct_const_round_shift(s5 - s7);
+ x8 = s8 + s10;
+ x9 = s9 + s11;
+ x10 = s8 - s10;
+ x11 = s9 - s11;
+ x12 = dct_const_round_shift(s12 + s14);
+ x13 = dct_const_round_shift(s13 + s15);
+ x14 = dct_const_round_shift(s12 - s14);
+ x15 = dct_const_round_shift(s13 - s15);
+
+ // stage 4
+ s2 = (- cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (- x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (- x10 + x11);
+ s14 = (- cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = dct_const_round_shift(s2);
+ x3 = dct_const_round_shift(s3);
+ x6 = dct_const_round_shift(s6);
+ x7 = dct_const_round_shift(s7);
+ x10 = dct_const_round_shift(s10);
+ x11 = dct_const_round_shift(s11);
+ x14 = dct_const_round_shift(s14);
+ x15 = dct_const_round_shift(s15);
+
+ output[0] = x0;
+ output[1] = -x8;
+ output[2] = x12;
+ output[3] = -x4;
+ output[4] = x6;
+ output[5] = x14;
+ output[6] = x10;
+ output[7] = x2;
+ output[8] = x3;
+ output[9] = x11;
+ output[10] = x15;
+ output[11] = x7;
+ output[12] = x5;
+ output[13] = -x13;
+ output[14] = x9;
+ output[15] = -x1;
+}
+
+void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+ int pitch, int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ int16_t *outptr = out;
+ int16_t temp_out[16];
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ idct16_1d_rows_dspr2(input, outptr, 16);
+ idct16_1d_cols_add_blk_dspr2(out, dest, pitch);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ idct16_1d_rows_dspr2(input, outptr, 16);
+
+ outptr = out;
+
+ for (i = 0; i < 16; ++i) {
+ iadst16_1d(outptr, temp_out);
+
+ for (j = 0; j < 16; ++j)
+ dest[j * pitch + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * pitch + i]);
+ outptr += 16;
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ {
+ int16_t temp_in[16 * 16];
+
+ for (i = 0; i < 16; ++i) {
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 16));
+
+ iadst16_1d(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
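+        /* iadst16_1d wrote its results row by row; transpose so the
+         * column pass can read each column contiguously */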
+ for (i = 0; i < 16; ++i)
+ for (j = 0; j < 16; ++j)
+ temp_in[j * 16 + i] = out[i * 16 + j];
+
+ idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch);
+ }
+ break;
+ case ADST_ADST: // ADST in both directions
+ {
+ int16_t temp_in[16];
+
+ for (i = 0; i < 16; ++i) {
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 16));
+
+ iadst16_1d(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ iadst16_1d(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ dest[j * pitch + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * pitch + i]);
+ }
+ }
+ break;
+ default:
+      printf("vp9_iht16x16_256_add_dspr2 : Invalid tx_type\n");
+ break;
+ }
+}
+
+void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ int16_t *outptr = out;
+ uint32_t i;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+  // First transform rows. Since all non-zero dct coefficients are in the
+  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
+ idct16_1d_rows_dspr2(input, outptr, 4);
+
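+  /* rows 4..15 of the transposed intermediate were never written by the
+   * 4-row pass above, so clear them; each sw zeroes two int16 values and
+   * the 16 stores sweep one pair of rows across all 16 coefficients */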
+ outptr += 4;
+ for (i = 0; i < 6; ++i) {
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 64(%[outptr]) \n\t"
+ "sw $zero, 96(%[outptr]) \n\t"
+ "sw $zero, 128(%[outptr]) \n\t"
+ "sw $zero, 160(%[outptr]) \n\t"
+ "sw $zero, 192(%[outptr]) \n\t"
+ "sw $zero, 224(%[outptr]) \n\t"
+ "sw $zero, 256(%[outptr]) \n\t"
+ "sw $zero, 288(%[outptr]) \n\t"
+ "sw $zero, 320(%[outptr]) \n\t"
+ "sw $zero, 352(%[outptr]) \n\t"
+ "sw $zero, 384(%[outptr]) \n\t"
+ "sw $zero, 416(%[outptr]) \n\t"
+ "sw $zero, 448(%[outptr]) \n\t"
+ "sw $zero, 480(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r" (outptr)
+ );
+
+ outptr += 2;
+ }
+
+ // Then transform columns
+ idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ uint32_t pos = 45;
+ int32_t out;
+ int32_t r;
+ int32_t a1, absa1;
+ int32_t vector_a1;
+ int32_t t1, t2, t3, t4;
+ int32_t vector_1, vector_2, vector_3, vector_4;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 32 \n\t"
+ "sra %[a1], %[out], 6 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
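+  /* a1 is the single DC contribution shared by every pixel: replicate
+   * |a1| into all four byte lanes and use saturating quad-byte adds or
+   * subtracts to update four pixels per word */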
+ if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 16; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 16; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
new file mode 100644
index 0000000..5e92db3
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
@@ -0,0 +1,1073 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
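+/* Column pass of the 32x32 IDCT.  The input holds the row-transformed
+ * coefficients transposed, so each column is a contiguous block of 32
+ * int16 values.  dest_pix starts at the top of the output column while
+ * dest_pix1 starts at the bottom (dest + 31 * dest_stride), letting the
+ * symmetric final-stage outputs be stored from both ends. */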
+void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+ int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+ int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;
+ int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26;
+ int16_t step1_27, step1_28, step1_29, step1_30, step1_31;
+ int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+ int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+ int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+ int16_t step2_28, step2_29, step2_30, step2_31;
+ int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+ int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+ int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27;
+ int16_t step3_28, step3_29, step3_30, step3_31;
+ int temp0, temp1, temp2, temp3;
+ int load1, load2, load3, load4;
+ int result1, result2;
+ int i, temp21;
+ uint8_t *dest_pix, *dest_pix1;
+ const int const_2_power_13 = 8192;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 32; ++i) {
+ dest_pix = dest + i;
+ dest_pix1 = dest + i + 31 * dest_stride;
+
+ __asm__ __volatile__ (
+ "lh %[load1], 2(%[input]) \n\t"
+ "lh %[load2], 62(%[input]) \n\t"
+ "lh %[load3], 34(%[input]) \n\t"
+ "lh %[load4], 30(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_31_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_1_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_1_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_31_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_15_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_17_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_17_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_15_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_17], $ac1, 31 \n\t"
+ "extp %[step1_30], $ac3, 31 \n\t"
+ "add %[step1_16], %[temp0], %[temp1] \n\t"
+ "add %[step1_31], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
+ [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),
+ [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 18(%[input]) \n\t"
+ "lh %[load2], 46(%[input]) \n\t"
+ "lh %[load3], 50(%[input]) \n\t"
+ "lh %[load4], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_23_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_9_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_9_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_23_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_7_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_25_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_25_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_7_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_18], $ac1, 31 \n\t"
+ "extp %[step1_29], $ac3, 31 \n\t"
+ "add %[step1_19], %[temp0], %[temp1] \n\t"
+ "add %[step1_28], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
+ [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),
+ [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
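+    /* input[5]/input[27] and input[21]/input[11]
+       -> step1_20, step1_21, step1_26, step1_27 */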
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 54(%[input]) \n\t"
+ "lh %[load3], 42(%[input]) \n\t"
+ "lh %[load4], 22(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_27_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_5_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_5_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_27_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_11_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_21_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_21_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_11_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "madd $ac1, %[load2], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_12_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_20_64] \n\t"
+
+ "extp %[step1_21], $ac1, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
+ "add %[step1_20], %[temp0], %[temp1] \n\t"
+ "add %[step1_27], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
+ [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),
+ [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
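+    /* input[13]/input[19] and input[29]/input[3]
+       -> step1_22, step1_23, step1_24, step1_25 */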
+ __asm__ __volatile__ (
+ "lh %[load1], 26(%[input]) \n\t"
+ "lh %[load2], 38(%[input]) \n\t"
+ "lh %[load3], 58(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_19_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_13_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_13_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_19_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_3_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_29_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_29_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_3_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+ "msub $ac1, %[load1], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_20_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_12_64] \n\t"
+ "extp %[step1_22], $ac1, 31 \n\t"
+ "extp %[step1_25], $ac3, 31 \n\t"
+ "add %[step1_23], %[temp0], %[temp1] \n\t"
+ "add %[step1_24], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
+ [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),
+ [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
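+    /* even half: input[2]/input[30] and input[18]/input[14]
+       -> step2_8, step2_9, step2_14, step2_15 */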
+ __asm__ __volatile__ (
+ "lh %[load1], 4(%[input]) \n\t"
+ "lh %[load2], 60(%[input]) \n\t"
+ "lh %[load3], 36(%[input]) \n\t"
+ "lh %[load4], 28(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_2_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_2_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_30_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_14_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_18_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_14_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+ "msub $ac1, %[load1], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_8_64] \n\t"
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[temp0], %[temp1] \n\t"
+ "add %[step2_15], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
+ [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
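+    /* input[10]/input[22] and input[26]/input[6]
+       -> step2_10, step2_11, step2_12, step2_13 */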
+ __asm__ __volatile__ (
+ "lh %[load1], 20(%[input]) \n\t"
+ "lh %[load2], 44(%[input]) \n\t"
+ "lh %[load3], 52(%[input]) \n\t"
+ "lh %[load4], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_22_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_26_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_6_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[temp0], %[temp1] \n\t"
+ "add %[step2_12], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
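+    /* combine step2_8..15 into step3_8..15; the four accumulators hold
+       the cospi_16_64 rotations that produce step3_10..step3_13 */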
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "sub %[temp0], %[step2_14], %[step2_13] \n\t"
+ "sub %[temp0], %[temp0], %[step2_9] \n\t"
+ "add %[temp0], %[temp0], %[step2_10] \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp1], %[step2_14], %[step2_13] \n\t"
+ "add %[temp1], %[temp1], %[step2_9] \n\t"
+ "sub %[temp1], %[temp1], %[step2_10] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "sub %[temp0], %[step2_15], %[step2_12] \n\t"
+ "sub %[temp0], %[temp0], %[step2_8] \n\t"
+ "add %[temp0], %[temp0], %[step2_11] \n\t"
+ "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sub %[temp1], %[step2_15], %[step2_12] \n\t"
+ "add %[temp1], %[temp1], %[step2_8] \n\t"
+ "sub %[temp1], %[temp1], %[step2_11] \n\t"
+ "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
+
+ "add %[step3_8], %[step2_8], %[step2_11] \n\t"
+ "add %[step3_9], %[step2_9], %[step2_10] \n\t"
+ "add %[step3_14], %[step2_13], %[step2_14] \n\t"
+ "add %[step3_15], %[step2_12], %[step2_15] \n\t"
+ "extp %[step3_10], $ac0, 31 \n\t"
+ "extp %[step3_13], $ac1, 31 \n\t"
+ "extp %[step3_11], $ac2, 31 \n\t"
+ "extp %[step3_12], $ac3, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
+ [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
+ [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
+ [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8),
+ [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+ [step2_11] "r" (step2_11), [step2_12] "r" (step2_12),
+ [step2_13] "r" (step2_13), [step2_14] "r" (step2_14),
+ [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64)
+ );
+
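+    /* for the remaining rotations, one output of each pair is computed on
+       a DSP accumulator and its partner with the equivalent C expression */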
+ step2_18 = step1_17 - step1_18;
+ step2_29 = step1_30 - step1_29;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
+ "extp %[step3_18], $ac0, 31 \n\t"
+
+ : [step3_18] "=r" (step3_18)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
+ step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_19 = step1_16 - step1_19;
+ step2_28 = step1_31 - step1_28;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
+ "extp %[step3_19], $ac0, 31 \n\t"
+
+ : [step3_19] "=r" (step3_19)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
+ step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_16 = step1_16 + step1_19;
+ step3_17 = step1_17 + step1_18;
+ step3_30 = step1_29 + step1_30;
+ step3_31 = step1_28 + step1_31;
+
+ step2_20 = step1_23 - step1_20;
+ step2_27 = step1_24 - step1_27;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
+ "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
+ "extp %[step3_20], $ac0, 31 \n\t"
+
+ : [step3_20] "=r" (step3_20)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
+ step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_21 = step1_22 - step1_21;
+ step2_26 = step1_25 - step1_26;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
+ "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
+ "extp %[step3_21], $ac1, 31 \n\t"
+
+ : [step3_21] "=r" (step3_21)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
+ step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_22 = step1_21 + step1_22;
+ step3_23 = step1_20 + step1_23;
+ step3_24 = step1_24 + step1_27;
+ step3_25 = step1_25 + step1_26;
+
+ step2_16 = step3_16 + step3_23;
+ step2_17 = step3_17 + step3_22;
+ step2_18 = step3_18 + step3_21;
+ step2_19 = step3_19 + step3_20;
+ step2_20 = step3_19 - step3_20;
+ step2_21 = step3_18 - step3_21;
+ step2_22 = step3_17 - step3_22;
+ step2_23 = step3_16 - step3_23;
+
+ step2_24 = step3_31 - step3_24;
+ step2_25 = step3_30 - step3_25;
+ step2_26 = step3_29 - step3_26;
+ step2_27 = step3_28 - step3_27;
+ step2_28 = step3_28 + step3_27;
+ step2_29 = step3_29 + step3_26;
+ step2_30 = step3_30 + step3_25;
+ step2_31 = step3_31 + step3_24;
+
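+    /* even quarter: input[0]/input[16] and input[8]/input[24]
+       -> step1_0, step1_1, step1_2, step1_3 */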
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 32(%[input]) \n\t"
+ "lh %[load3], 16(%[input]) \n\t"
+ "lh %[load4], 48(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[temp2], $ac3, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[temp3], $ac1, 31 \n\t"
+ "add %[step1_0], %[temp0], %[temp3] \n\t"
+ "add %[step1_1], %[temp1], %[temp2] \n\t"
+ "sub %[step1_2], %[temp1], %[temp2] \n\t"
+ "sub %[step1_3], %[temp0], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
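+    /* input[4]/input[28] and input[20]/input[12] -> step1_4..step1_7 */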
+ __asm__ __volatile__ (
+ "lh %[load1], 8(%[input]) \n\t"
+ "lh %[load2], 56(%[input]) \n\t"
+ "lh %[load3], 40(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_12_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_20_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_12_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load1], %[load1], %[temp0] \n\t"
+ "add %[load1], %[load1], %[temp1] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[load2], %[temp2] \n\t"
+ "add %[load2], %[load2], %[temp3] \n\t"
+ "madd $ac1, %[load1], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[temp0], %[temp1] \n\t"
+ "add %[step1_7], %[temp3], %[temp2] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step2_0 = step1_0 + step1_7;
+ step2_1 = step1_1 + step1_6;
+ step2_2 = step1_2 + step1_5;
+ step2_3 = step1_3 + step1_4;
+ step2_4 = step1_3 - step1_4;
+ step2_5 = step1_2 - step1_5;
+ step2_6 = step1_1 - step1_6;
+ step2_7 = step1_0 - step1_7;
+
+    // stage 7: combine the even half (step2_0..7) with the odd half
+    // (step3_8..15)
+ step1_0 = step2_0 + step3_15;
+ step1_1 = step2_1 + step3_14;
+ step1_2 = step2_2 + step3_13;
+ step1_3 = step2_3 + step3_12;
+ step1_4 = step2_4 + step3_11;
+ step1_5 = step2_5 + step3_10;
+ step1_6 = step2_6 + step3_9;
+ step1_7 = step2_7 + step3_8;
+ step1_8 = step2_7 - step3_8;
+ step1_9 = step2_6 - step3_9;
+ step1_10 = step2_5 - step3_10;
+ step1_11 = step2_4 - step3_11;
+ step1_12 = step2_3 - step3_12;
+ step1_13 = step2_2 - step3_13;
+ step1_14 = step2_1 - step3_14;
+ step1_15 = step2_0 - step3_15;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_27], %[step2_20] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_20], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20),
+ [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_20 + step2_27) * cospi_16_64;
+ step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_26], %[step2_21] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_21], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26),
+ [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_21 + step2_26) * cospi_16_64;
+ step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_25], %[step2_22] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_22], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25),
+ [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_22 + step2_25) * cospi_16_64;
+ step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_24], %[step2_23] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_23], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24),
+ [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_23 + step2_24) * cospi_16_64;
+ step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
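+    /* reconstruction, rows 0..3 of this column: (sum + 32) >> 6 is added
+       to the destination pixel and clamped through cm */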
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_0], %[step2_31] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_1], %[step2_30] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_2], %[step2_29] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_3], %[step2_28] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+ [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+ [step2_28] "r" (step2_28), [step2_29] "r" (step2_29),
+ [step2_30] "r" (step2_30), [step2_31] "r" (step2_31)
+ );
+
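+    /* the bottom half mirrors the top: step3_12..15 are reused as scratch
+       for the already-rounded samples written upward from row 31 through
+       dest_pix1 */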
+ step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_4], %[step1_27] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_5], %[step1_26] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_6], %[step1_25] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_7], %[step1_24] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+ [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+ [step1_24] "r" (step1_24), [step1_25] "r" (step1_25),
+ [step1_26] "r" (step1_26), [step1_27] "r" (step1_27)
+ );
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_8], %[step1_23] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_9], %[step1_22] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_10], %[step1_21] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_11], %[step1_20] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
+ [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+ [step1_20] "r" (step1_20), [step1_21] "r" (step1_21),
+ [step1_22] "r" (step1_22), [step1_23] "r" (step1_23)
+ );
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_12], %[step2_19] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_13], %[step2_18] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_14], %[step2_17] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_15], %[step2_16] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
+ [step1_14] "r" (step1_14), [step1_15] "r" (step1_15),
+ [step2_16] "r" (step2_16), [step2_17] "r" (step2_17),
+ [step2_18] "r" (step2_18), [step2_19] "r" (step2_19)
+ );
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ input += 32;
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
new file mode 100644
index 0000000..d3aee73
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
@@ -0,0 +1,1013 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) {
+ int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+ int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+ int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
+ int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
+ int16_t step1_28, step1_29, step1_30, step1_31;
+ int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+ int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+ int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+ int16_t step2_28, step2_29, step2_30, step2_31;
+ int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+ int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+ int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
+ int16_t step3_29, step3_30, step3_31;
+ int temp0, temp1, temp2, temp3;
+ int load1, load2, load3, load4;
+ int result1, result2;
+ int temp21;
+ int i;
+ const int const_2_power_13 = 8192;
+ const int32_t *input_int;
+
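+  /* one 32-coefficient row per iteration; results are stored transposed
+     (consecutive outputs 32 * sizeof(int16_t) == 64 bytes apart, with
+     output advancing by one element per row) */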
+ for (i = 32; i--; ) {
+ input_int = (const int32_t *)input;
+
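+    /* all-zero row (tested two coefficients per int32 load): zero the
+       transposed output column and skip the transform */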
+ if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
+ input_int[4] | input_int[5] | input_int[6] | input_int[7] |
+ input_int[8] | input_int[9] | input_int[10] | input_int[11] |
+ input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
+ input += 32;
+
+ __asm__ __volatile__ (
+ "sh $zero, 0(%[output]) \n\t"
+ "sh $zero, 64(%[output]) \n\t"
+ "sh $zero, 128(%[output]) \n\t"
+ "sh $zero, 192(%[output]) \n\t"
+ "sh $zero, 256(%[output]) \n\t"
+ "sh $zero, 320(%[output]) \n\t"
+ "sh $zero, 384(%[output]) \n\t"
+ "sh $zero, 448(%[output]) \n\t"
+ "sh $zero, 512(%[output]) \n\t"
+ "sh $zero, 576(%[output]) \n\t"
+ "sh $zero, 640(%[output]) \n\t"
+ "sh $zero, 704(%[output]) \n\t"
+ "sh $zero, 768(%[output]) \n\t"
+ "sh $zero, 832(%[output]) \n\t"
+ "sh $zero, 896(%[output]) \n\t"
+ "sh $zero, 960(%[output]) \n\t"
+ "sh $zero, 1024(%[output]) \n\t"
+ "sh $zero, 1088(%[output]) \n\t"
+ "sh $zero, 1152(%[output]) \n\t"
+ "sh $zero, 1216(%[output]) \n\t"
+ "sh $zero, 1280(%[output]) \n\t"
+ "sh $zero, 1344(%[output]) \n\t"
+ "sh $zero, 1408(%[output]) \n\t"
+ "sh $zero, 1472(%[output]) \n\t"
+ "sh $zero, 1536(%[output]) \n\t"
+ "sh $zero, 1600(%[output]) \n\t"
+ "sh $zero, 1664(%[output]) \n\t"
+ "sh $zero, 1728(%[output]) \n\t"
+ "sh $zero, 1792(%[output]) \n\t"
+ "sh $zero, 1856(%[output]) \n\t"
+ "sh $zero, 1920(%[output]) \n\t"
+ "sh $zero, 1984(%[output]) \n\t"
+
+ :
+ : [output] "r" (output)
+ );
+
+ output += 1;
+
+ continue;
+ }
+
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 32));
+ vp9_prefetch_load((const uint8_t *)(input + 48));
+
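+    /* odd half, as in the column pass: input[1]/input[31] and
+       input[17]/input[15] -> step1_16, step1_17, step1_30, step1_31 */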
+ __asm__ __volatile__ (
+ "lh %[load1], 2(%[input]) \n\t"
+ "lh %[load2], 62(%[input]) \n\t"
+ "lh %[load3], 34(%[input]) \n\t"
+ "lh %[load4], 30(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_31_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_1_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_1_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_31_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_15_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_17_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_17_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_15_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_17], $ac1, 31 \n\t"
+ "extp %[step1_30], $ac3, 31 \n\t"
+ "add %[step1_16], %[temp0], %[temp1] \n\t"
+ "add %[step1_31], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
+ [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),
+ [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 18(%[input]) \n\t"
+ "lh %[load2], 46(%[input]) \n\t"
+ "lh %[load3], 50(%[input]) \n\t"
+ "lh %[load4], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_23_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_9_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_9_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_23_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_7_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_25_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_25_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_7_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_18], $ac1, 31 \n\t"
+ "extp %[step1_29], $ac3, 31 \n\t"
+ "add %[step1_19], %[temp0], %[temp1] \n\t"
+ "add %[step1_28], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
+ [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),
+ [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 54(%[input]) \n\t"
+ "lh %[load3], 42(%[input]) \n\t"
+ "lh %[load4], 22(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_27_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_5_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_5_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_27_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_11_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_21_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_21_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_11_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "madd $ac1, %[load2], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_12_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_20_64] \n\t"
+
+ "extp %[step1_21], $ac1, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
+ "add %[step1_20], %[temp0], %[temp1] \n\t"
+ "add %[step1_27], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
+ [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),
+ [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 26(%[input]) \n\t"
+ "lh %[load2], 38(%[input]) \n\t"
+ "lh %[load3], 58(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_19_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_13_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_13_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_19_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_3_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_29_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_29_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_3_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_20_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_12_64] \n\t"
+
+ "extp %[step1_22], $ac1, 31 \n\t"
+ "extp %[step1_25], $ac3, 31 \n\t"
+ "add %[step1_23], %[temp0], %[temp1] \n\t"
+ "add %[step1_24], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
+ [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),
+ [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 4(%[input]) \n\t"
+ "lh %[load2], 60(%[input]) \n\t"
+ "lh %[load3], 36(%[input]) \n\t"
+ "lh %[load4], 28(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_2_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_2_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_30_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_14_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_18_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_14_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[temp0], %[temp1] \n\t"
+ "add %[step2_15], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
+ [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 20(%[input]) \n\t"
+ "lh %[load2], 44(%[input]) \n\t"
+ "lh %[load3], 52(%[input]) \n\t"
+ "lh %[load4], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_22_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_26_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_6_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[temp0], %[temp1] \n\t"
+ "add %[step2_12], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "sub %[temp0], %[step2_14], %[step2_13] \n\t"
+ "sub %[temp0], %[temp0], %[step2_9] \n\t"
+ "add %[temp0], %[temp0], %[step2_10] \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp1], %[step2_14], %[step2_13] \n\t"
+ "add %[temp1], %[temp1], %[step2_9] \n\t"
+ "sub %[temp1], %[temp1], %[step2_10] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "sub %[temp0], %[step2_15], %[step2_12] \n\t"
+ "sub %[temp0], %[temp0], %[step2_8] \n\t"
+ "add %[temp0], %[temp0], %[step2_11] \n\t"
+ "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sub %[temp1], %[step2_15], %[step2_12] \n\t"
+ "add %[temp1], %[temp1], %[step2_8] \n\t"
+ "sub %[temp1], %[temp1], %[step2_11] \n\t"
+ "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
+
+ "add %[step3_8], %[step2_8], %[step2_11] \n\t"
+ "add %[step3_9], %[step2_9], %[step2_10] \n\t"
+ "add %[step3_14], %[step2_13], %[step2_14] \n\t"
+ "add %[step3_15], %[step2_12], %[step2_15] \n\t"
+
+ "extp %[step3_10], $ac0, 31 \n\t"
+ "extp %[step3_13], $ac1, 31 \n\t"
+ "extp %[step3_11], $ac2, 31 \n\t"
+ "extp %[step3_12], $ac3, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
+ [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
+ [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
+ [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
+ [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
+ [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
+ [step2_14] "r" (step2_14), [step2_15] "r" (step2_15),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step2_18 = step1_17 - step1_18;
+ step2_29 = step1_30 - step1_29;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
+ "extp %[step3_18], $ac0, 31 \n\t"
+
+ : [step3_18] "=r" (step3_18)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
+ step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_19 = step1_16 - step1_19;
+ step2_28 = step1_31 - step1_28;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
+ "extp %[step3_19], $ac0, 31 \n\t"
+
+ : [step3_19] "=r" (step3_19)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
+ step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_16 = step1_16 + step1_19;
+ step3_17 = step1_17 + step1_18;
+ step3_30 = step1_29 + step1_30;
+ step3_31 = step1_28 + step1_31;
+
+ step2_20 = step1_23 - step1_20;
+ step2_27 = step1_24 - step1_27;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
+ "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
+ "extp %[step3_20], $ac0, 31 \n\t"
+
+ : [step3_20] "=r" (step3_20)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
+ step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_21 = step1_22 - step1_21;
+ step2_26 = step1_25 - step1_26;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
+ "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
+ "extp %[step3_21], $ac1, 31 \n\t"
+
+ : [step3_21] "=r" (step3_21)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
+ step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_22 = step1_21 + step1_22;
+ step3_23 = step1_20 + step1_23;
+ step3_24 = step1_24 + step1_27;
+ step3_25 = step1_25 + step1_26;
+
+ step2_16 = step3_16 + step3_23;
+ step2_17 = step3_17 + step3_22;
+ step2_18 = step3_18 + step3_21;
+ step2_19 = step3_19 + step3_20;
+ step2_20 = step3_19 - step3_20;
+ step2_21 = step3_18 - step3_21;
+ step2_22 = step3_17 - step3_22;
+ step2_23 = step3_16 - step3_23;
+
+ step2_24 = step3_31 - step3_24;
+ step2_25 = step3_30 - step3_25;
+ step2_26 = step3_29 - step3_26;
+ step2_27 = step3_28 - step3_27;
+ step2_28 = step3_28 + step3_27;
+ step2_29 = step3_29 + step3_26;
+ step2_30 = step3_30 + step3_25;
+ step2_31 = step3_31 + step3_24;
+
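+    /* even quarter: input[0]/input[16] and input[8]/input[24]
+       -> step1_0, step1_1, step1_2, step1_3 */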
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 32(%[input]) \n\t"
+ "lh %[load3], 16(%[input]) \n\t"
+ "lh %[load4], 48(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[temp2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[temp3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[temp0], %[temp3] \n\t"
+ "add %[step1_1], %[temp1], %[temp2] \n\t"
+ "sub %[step1_2], %[temp1], %[temp2] \n\t"
+ "sub %[step1_3], %[temp0], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_16_64] "r" (cospi_16_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 8(%[input]) \n\t"
+ "lh %[load2], 56(%[input]) \n\t"
+ "lh %[load3], 40(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_12_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_20_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_12_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load1], %[load1], %[temp0] \n\t"
+ "add %[load1], %[load1], %[temp1] \n\t"
+
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[load2], %[temp2] \n\t"
+ "add %[load2], %[load2], %[temp3] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[temp0], %[temp1] \n\t"
+ "add %[step1_7], %[temp3], %[temp2] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step2_0 = step1_0 + step1_7;
+ step2_1 = step1_1 + step1_6;
+ step2_2 = step1_2 + step1_5;
+ step2_3 = step1_3 + step1_4;
+ step2_4 = step1_3 - step1_4;
+ step2_5 = step1_2 - step1_5;
+ step2_6 = step1_1 - step1_6;
+ step2_7 = step1_0 - step1_7;
+
+ step1_0 = step2_0 + step3_15;
+ step1_1 = step2_1 + step3_14;
+ step1_2 = step2_2 + step3_13;
+ step1_3 = step2_3 + step3_12;
+ step1_4 = step2_4 + step3_11;
+ step1_5 = step2_5 + step3_10;
+ step1_6 = step2_6 + step3_9;
+ step1_7 = step2_7 + step3_8;
+ step1_8 = step2_7 - step3_8;
+ step1_9 = step2_6 - step3_9;
+ step1_10 = step2_5 - step3_10;
+ step1_11 = step2_4 - step3_11;
+ step1_12 = step2_3 - step3_12;
+ step1_13 = step2_2 - step3_13;
+ step1_14 = step2_1 - step3_14;
+ step1_15 = step2_0 - step3_15;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_27], %[step2_20] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_20], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_20 + step2_27) * cospi_16_64;
+ step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_26], %[step2_21] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_21], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_26] "r" (step2_26), [step2_21] "r" (step2_21),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_21 + step2_26) * cospi_16_64;
+ step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_25], %[step2_22] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_22], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_25] "r" (step2_25), [step2_22] "r" (step2_22),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_22 + step2_25) * cospi_16_64;
+ step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_24], %[step2_23] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_23], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_24] "r" (step2_24), [step2_23] "r" (step2_23),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_23 + step2_24) * cospi_16_64;
+ step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ // final stage
+ output[0 * 32] = step1_0 + step2_31;
+ output[1 * 32] = step1_1 + step2_30;
+ output[2 * 32] = step1_2 + step2_29;
+ output[3 * 32] = step1_3 + step2_28;
+ output[4 * 32] = step1_4 + step1_27;
+ output[5 * 32] = step1_5 + step1_26;
+ output[6 * 32] = step1_6 + step1_25;
+ output[7 * 32] = step1_7 + step1_24;
+ output[8 * 32] = step1_8 + step1_23;
+ output[9 * 32] = step1_9 + step1_22;
+ output[10 * 32] = step1_10 + step1_21;
+ output[11 * 32] = step1_11 + step1_20;
+ output[12 * 32] = step1_12 + step2_19;
+ output[13 * 32] = step1_13 + step2_18;
+ output[14 * 32] = step1_14 + step2_17;
+ output[15 * 32] = step1_15 + step2_16;
+ output[16 * 32] = step1_15 - step2_16;
+ output[17 * 32] = step1_14 - step2_17;
+ output[18 * 32] = step1_13 - step2_18;
+ output[19 * 32] = step1_12 - step2_19;
+ output[20 * 32] = step1_11 - step1_20;
+ output[21 * 32] = step1_10 - step1_21;
+ output[22 * 32] = step1_9 - step1_22;
+ output[23 * 32] = step1_8 - step1_23;
+ output[24 * 32] = step1_7 - step1_24;
+ output[25 * 32] = step1_6 - step1_25;
+ output[26 * 32] = step1_5 - step1_26;
+ output[27 * 32] = step1_4 - step1_27;
+ output[28 * 32] = step1_3 - step2_28;
+ output[29 * 32] = step1_2 - step2_29;
+ output[30 * 32] = step1_1 - step2_30;
+ output[31 * 32] = step1_0 - step2_31;
+
+ input += 32;
+ output += 1;
+ }
+}
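+
+/* Editor's note (illustrative sketch, not part of the original patch):
+ * each mtlo/mthi + madd/msub + extp sequence above evaluates the scalar
+ * helper below.  Preloading the accumulator with 2^13 supplies the
+ * rounding term, and "extp rt, $ac, 31" with the wrdsp extract position
+ * set to 45 pulls out accumulator bits 45..14, i.e. an arithmetic right
+ * shift by DCT_CONST_BITS (14).  DCT_CONST_ROUNDING and DCT_CONST_BITS
+ * are defined in vp9/common/vp9_idct.h. */
+static INLINE int dspr2_round_shift_ref(int64_t sum_of_products) {
+  return (int)((sum_of_products + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
+}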
+
+void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // Rows
+ idct32_1d_rows_dspr2(input, outptr);
+
+ // Columns
+ vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ int r, out;
+ int32_t a1, absa1;
+ int32_t vector_a1;
+ int32_t t1, t2, t3, t4;
+ int32_t vector_1, vector_2, vector_3, vector_4;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 32 \n\t"
+ "sra %[a1], %[out], 6 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
+ if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
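+
+/* Editor's note (illustrative sketch, not part of the original patch):
+ * replv.qb broadcasts a1 into all four bytes of a word, and
+ * addu_s.qb/subu_s.qb then apply the DC offset to four pixels at a time
+ * with unsigned saturation.  A scalar equivalent using clip_pixel() from
+ * vp9_common.h would be: */
+static INLINE void add_dc_ref(uint8_t *dest, int stride, int a1, int size) {
+  int r, c;
+  for (r = 0; r < size; ++r) {
+    for (c = 0; c < size; ++c)
+      dest[c] = clip_pixel(dest[c] + a1);  /* saturates to [0, 255] */
+    dest += stride;
+  }
+}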
+#endif // #if HAVE_DSPR2
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
new file mode 100644
index 0000000..5b7aa5e
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) {
+ int16_t step_0, step_1, step_2, step_3;
+ int Temp0, Temp1, Temp2, Temp3;
+ const int const_2_power_13 = 8192;
+ int i;
+
+ for (i = 4; i--; ) {
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[2]) * cospi_16_64;
+ step_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[2]) * cospi_16_64;
+ step_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 4(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "extp %[step_0], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "extp %[step_1], $ac1, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ /*
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ step_2 = dct_const_round_shift(temp1);
+ */
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "extp %[step_2], $ac0, 31 \n\t"
+
+ /*
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step_3 = dct_const_round_shift(temp2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[step_3], $ac1, 31 \n\t"
+
+ /*
+ output[0] = step_0 + step_3;
+ output[4] = step_1 + step_2;
+ output[8] = step_1 - step_2;
+ output[12] = step_0 - step_3;
+ */
+ "add %[Temp0], %[step_0], %[step_3] \n\t"
+ "sh %[Temp0], 0(%[output]) \n\t"
+
+ "add %[Temp1], %[step_1], %[step_2] \n\t"
+ "sh %[Temp1], 8(%[output]) \n\t"
+
+ "sub %[Temp2], %[step_1], %[step_2] \n\t"
+ "sh %[Temp2], 16(%[output]) \n\t"
+
+ "sub %[Temp3], %[step_0], %[step_3] \n\t"
+ "sh %[Temp3], 24(%[output]) \n\t"
+
+ : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
+ [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
+ [output] "+r" (output)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [input] "r" (input)
+ );
+
+ input += 4;
+ output += 1;
+ }
+}
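+
+/* Editor's note: the row pass above stores with halfword offsets
+ * 0/8/16/24 while advancing output by one element per row, so the
+ * intermediate array ends up transposed.  The column pass below can
+ * then read each column as four consecutive coefficients. */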
+
+static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int16_t step_0, step_1, step_2, step_3;
+ int Temp0, Temp1, Temp2, Temp3;
+ const int const_2_power_13 = 8192;
+ int i;
+ uint8_t *dest_pix;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 4; ++i) {
+ dest_pix = (dest + i);
+
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[2]) * cospi_16_64;
+ step_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[2]) * cospi_16_64;
+ step_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 4(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "extp %[step_0], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "extp %[step_1], $ac1, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ /*
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ step_2 = dct_const_round_shift(temp1);
+ */
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "extp %[step_2], $ac0, 31 \n\t"
+
+ /*
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step_3 = dct_const_round_shift(temp2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[step_3], $ac1, 31 \n\t"
+
+ /*
+ output[0] = step_0 + step_3;
+ output[4] = step_1 + step_2;
+ output[8] = step_1 - step_2;
+ output[12] = step_0 - step_3;
+ */
+ "add %[Temp0], %[step_0], %[step_3] \n\t"
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step_1], %[step_2] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step_1], %[step_2] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step_0], %[step_3] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+
+ : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
+ [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
+ [dest_pix] "+r" (dest_pix)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+ );
+
+ input += 4;
+ }
+}
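+
+/* Editor's note: the "lbux %[Temp2], %[Temp1](%[cm])" loads above clamp
+ * each reconstructed pixel through vp9_ff_cropTbl, a padded lookup
+ * table that maps out-of-range sums to 0 or 255; the effect matches
+ * clip_pixel(dest_pix[0] + ROUND_POWER_OF_TWO(step, 4)) without
+ * branching. */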
+
+void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // Rows
+ vp9_idct4_1d_rows_dspr2(input, outptr);
+
+ // Columns
+ vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int a1, absa1;
+ int r;
+ int32_t out;
+ int t2, vector_a1, vector_a;
+ uint32_t pos = 45;
+ int16_t input_dc = input[0];
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 8 \n\t"
+ "sra %[a1], %[out], 4 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
+ if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 4; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t2], 0(%[dest]) \n\t"
+ "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_a], 0(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 4; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t2], 0(%[dest]) \n\t"
+ "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_a], 0(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
+
+static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+ int x0, x1, x2, x3;
+
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = x0 - x2 + x3;
+
+ x0 = s0 + s3 + s5;
+ x1 = s1 - s4 - s6;
+ x2 = sinpi_3_9 * s7;
+ x3 = s2;
+
+ s0 = x0 + x3;
+ s1 = x1 + x3;
+ s2 = x2;
+ s3 = x0 + x1 - x3;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
+ output[0] = dct_const_round_shift(s0);
+ output[1] = dct_const_round_shift(s1);
+ output[2] = dct_const_round_shift(s2);
+ output[3] = dct_const_round_shift(s3);
+}
+
+void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+ int16_t *outptr = out;
+ int16_t temp_in[4 * 4], temp_out[4];
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ vp9_idct4_1d_rows_dspr2(input, outptr);
+ vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ vp9_idct4_1d_rows_dspr2(input, outptr);
+
+ outptr = out;
+
+ for (i = 0; i < 4; ++i) {
+ iadst4_1d_dspr2(outptr, temp_out);
+
+ for (j = 0; j < 4; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * dest_stride + i]);
+
+ outptr += 4;
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ for (i = 0; i < 4; ++i) {
+ iadst4_1d_dspr2(input, outptr);
+ input += 4;
+ outptr += 4;
+ }
+
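+      /* iadst4_1d_dspr2() writes its results row-major, but the DSPr2
+       * column routine expects the transposed layout produced by the
+       * DCT row pass, hence the explicit transpose below. */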
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) {
+ temp_in[i * 4 + j] = out[j * 4 + i];
+ }
+ }
+ vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+ break;
+ case ADST_ADST: // ADST in both directions
+ for (i = 0; i < 4; ++i) {
+ iadst4_1d_dspr2(input, outptr);
+ input += 4;
+ outptr += 4;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ iadst4_1d_dspr2(temp_in, temp_out);
+
+ for (j = 0; j < 4; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * dest_stride + i]);
+ }
+ break;
+ default:
+ printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n");
+ break;
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
new file mode 100644
index 0000000..93a0840
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -0,0 +1,745 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output,
+ uint32_t no_rows) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ const int const_2_power_13 = 8192;
+ int Temp0, Temp1, Temp2, Temp3, Temp4;
+ int i;
+
+ for (i = no_rows; i--; ) {
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[4]) * cospi_16_64;
+ step2_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[4]) * cospi_16_64;
+ step2_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 8(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "extp %[Temp4], $ac0, 31 \n\t"
+
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+ step2_2 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 4(%[input]) \n\t"
+ "lh %[Temp1], 12(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "extp %[Temp3], $ac0, 31 \n\t"
+
+ /*
+ step1_1 = step2_1 + step2_2;
+ step1_2 = step2_1 - step2_2;
+ */
+ "add %[step1_1], %[Temp2], %[Temp3] \n\t"
+ "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
+
+ /*
+ temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+ step2_3 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ /*
+ step1_0 = step2_0 + step2_3;
+ step1_3 = step2_0 - step2_3;
+ */
+ "add %[step1_0], %[Temp4], %[Temp1] \n\t"
+ "sub %[step1_3], %[Temp4], %[Temp1] \n\t"
+
+ /*
+ temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ step1_4 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp1], 14(%[input]) \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
+ "extp %[step1_4], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1_7 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
+ "extp %[step1_7], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ step1_5 = dct_const_round_shift(temp_1);
+ */
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
+ "extp %[step1_5], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1_6 = dct_const_round_shift(temp_2);
+ */
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+ temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+ */
+ "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
+ "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
+ "add %[Temp0], %[Temp0], %[step1_5] \n\t"
+ "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
+ "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
+ "add %[Temp1], %[Temp1], %[step1_7] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
+
+ /*
+ step1_4 = step1_4 + step1_5;
+ step1_7 = step1_6 + step1_7;
+ */
+ "add %[step1_4], %[step1_4], %[step1_5] \n\t"
+ "add %[step1_7], %[step1_7], %[step1_6] \n\t"
+
+ "extp %[step1_5], $ac0, 31 \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ "add %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "sh %[Temp0], 0(%[output]) \n\t"
+ "add %[Temp1], %[step1_1], %[step1_6] \n\t"
+ "sh %[Temp1], 16(%[output]) \n\t"
+ "add %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "sh %[Temp0], 32(%[output]) \n\t"
+ "add %[Temp1], %[step1_3], %[step1_4] \n\t"
+ "sh %[Temp1], 48(%[output]) \n\t"
+
+ "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "sh %[Temp0], 64(%[output]) \n\t"
+ "sub %[Temp1], %[step1_2], %[step1_5] \n\t"
+ "sh %[Temp1], 80(%[output]) \n\t"
+ "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "sh %[Temp0], 96(%[output]) \n\t"
+ "sub %[Temp1], %[step1_0], %[step1_7] \n\t"
+ "sh %[Temp1], 112(%[output]) \n\t"
+
+ : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+ [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+ [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+ [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+ [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [Temp4] "=&r" (Temp4)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [output] "r" (output), [input] "r" (input)
+ );
+
+ input += 8;
+ output += 1;
+ }
+}
+
+static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int Temp0, Temp1, Temp2, Temp3;
+ int i;
+ const int const_2_power_13 = 8192;
+ uint8_t *dest_pix;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 8; ++i) {
+ dest_pix = (dest + i);
+
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[4]) * cospi_16_64;
+ step2_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[4]) * cospi_16_64;
+ step2_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 8(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "extp %[step1_6], $ac0, 31 \n\t"
+
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+ step2_2 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 4(%[input]) \n\t"
+ "lh %[Temp1], 12(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "extp %[Temp3], $ac0, 31 \n\t"
+
+ /*
+ step1_1 = step2_1 + step2_2;
+ step1_2 = step2_1 - step2_2;
+ */
+ "add %[step1_1], %[Temp2], %[Temp3] \n\t"
+ "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
+
+ /*
+ temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+ step2_3 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ /*
+ step1_0 = step2_0 + step2_3;
+ step1_3 = step2_0 - step2_3;
+ */
+ "add %[step1_0], %[step1_6], %[Temp1] \n\t"
+ "sub %[step1_3], %[step1_6], %[Temp1] \n\t"
+
+ /*
+ temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ step1_4 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp1], 14(%[input]) \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
+ "extp %[step1_4], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1_7 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
+ "extp %[step1_7], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ step1_5 = dct_const_round_shift(temp_1);
+ */
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
+ "extp %[step1_5], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1_6 = dct_const_round_shift(temp_2);
+ */
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+ temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+ */
+ "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
+ "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
+ "add %[Temp0], %[Temp0], %[step1_5] \n\t"
+ "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
+ "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
+ "add %[Temp1], %[Temp1], %[step1_7] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
+
+ /*
+ step1_4 = step1_4 + step1_5;
+ step1_7 = step1_6 + step1_7;
+ */
+ "add %[step1_4], %[step1_4], %[step1_5] \n\t"
+ "add %[step1_7], %[step1_7], %[step1_6] \n\t"
+
+ "extp %[step1_5], $ac0, 31 \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /* add block */
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+
+ : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+ [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+ [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+ [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+ [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dest_pix] "+r" (dest_pix)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+ );
+
+ input += 8;
+ }
+}
+
+void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // First transform rows
+ idct8_1d_rows_dspr2(input, outptr, 8);
+
+ // Then transform columns and add to dest
+ idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+ int x0, x1, x2, x3, x4, x5, x6, x7;
+
+ x0 = input[7];
+ x1 = input[0];
+ x2 = input[5];
+ x3 = input[2];
+ x4 = input[3];
+ x5 = input[4];
+ x6 = input[1];
+ x7 = input[6];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ output[0] = output[1] = output[2] = output[3] = output[4]
+ = output[5] = output[6] = output[7] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+ x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
+ x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
+ x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
+ x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
+ x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
+ x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
+ x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
+
+ // stage 3
+ s2 = cospi_16_64 * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (x6 - x7);
+
+ x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
+ x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
+
+ output[0] = x0;
+ output[1] = -x4;
+ output[2] = x6;
+ output[3] = -x2;
+ output[4] = x3;
+ output[5] = -x7;
+ output[6] = x5;
+ output[7] = -x1;
+}
+
+void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ int16_t temp_in[8 * 8], temp_out[8];
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ idct8_1d_rows_dspr2(input, outptr, 8);
+ idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ idct8_1d_rows_dspr2(input, outptr, 8);
+
+ for (i = 0; i < 8; ++i) {
+ iadst8_1d_dspr2(&out[i * 8], temp_out);
+
+ for (j = 0; j < 8; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * dest_stride + i]);
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ for (i = 0; i < 8; ++i) {
+ iadst8_1d_dspr2(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) {
+ temp_in[i * 8 + j] = out[j * 8 + i];
+ }
+ }
+ idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+ break;
+ case ADST_ADST: // ADST in both directions
+ for (i = 0; i < 8; ++i) {
+ iadst8_1d_dspr2(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+
+ iadst8_1d_dspr2(temp_in, temp_out);
+
+ for (j = 0; j < 8; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * dest_stride + i]);
+ }
+ break;
+ default:
+ printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n");
+ break;
+ }
+}
+
+void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // First transform rows
+ idct8_1d_rows_dspr2(input, outptr, 4);
+
+ outptr += 4;
+
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 4(%[outptr]) \n\t"
+ "sw $zero, 16(%[outptr]) \n\t"
+ "sw $zero, 20(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 36(%[outptr]) \n\t"
+ "sw $zero, 48(%[outptr]) \n\t"
+ "sw $zero, 52(%[outptr]) \n\t"
+ "sw $zero, 64(%[outptr]) \n\t"
+ "sw $zero, 68(%[outptr]) \n\t"
+ "sw $zero, 80(%[outptr]) \n\t"
+ "sw $zero, 84(%[outptr]) \n\t"
+ "sw $zero, 96(%[outptr]) \n\t"
+ "sw $zero, 100(%[outptr]) \n\t"
+ "sw $zero, 112(%[outptr]) \n\t"
+ "sw $zero, 116(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r" (outptr)
+ );
+
+ // Then transform columns and add to dest
+ idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
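+
+/* Editor's note: the "_10" variant assumes the at most ten nonzero
+ * coefficients all lie in the top-left 4x4 of the 8x8 block, so only
+ * four rows are transformed (no_rows == 4) and the remaining entries
+ * of the transposed intermediate are cleared by the "sw $zero" stores
+ * above before the full column pass runs. */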
+
+void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ uint32_t pos = 45;
+ int32_t out;
+ int32_t r;
+ int32_t a1, absa1;
+ int32_t t1, t2, vector_a1, vector_1, vector_2;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 16 \n\t"
+ "sra %[a1], %[out], 5 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
+ if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 8; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 8; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [dest] "+r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c
index 864e27e..0d65651 100644
--- a/libvpx/vp9/common/vp9_alloccommon.c
+++ b/libvpx/vp9/common/vp9_alloccommon.c
@@ -41,30 +41,25 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) {
vpx_free(cm->mip);
vpx_free(cm->prev_mip);
- vpx_free(cm->above_seg_context);
vpx_free(cm->last_frame_seg_map);
vpx_free(cm->mi_grid_base);
vpx_free(cm->prev_mi_grid_base);
- vpx_free(cm->above_context[0]);
- for (i = 0; i < MAX_MB_PLANE; i++)
- cm->above_context[i] = 0;
cm->mip = NULL;
cm->prev_mip = NULL;
- cm->above_seg_context = NULL;
cm->last_frame_seg_map = NULL;
cm->mi_grid_base = NULL;
cm->prev_mi_grid_base = NULL;
}
static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
- cm->mb_cols = (aligned_width + 8) >> 4;
- cm->mb_rows = (aligned_height + 8) >> 4;
- cm->MBs = cm->mb_rows * cm->mb_cols;
-
cm->mi_cols = aligned_width >> MI_SIZE_LOG2;
cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE;
+
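+  // A macroblock spans 2x2 mi units (8x8 each), so round up to count
+  // any partial macroblock at the right/bottom edge.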
+ cm->mb_cols = (cm->mi_cols + 1) >> 1;
+ cm->mb_rows = (cm->mi_rows + 1) >> 1;
+ cm->MBs = cm->mb_rows * cm->mb_cols;
}
static void setup_mi(VP9_COMMON *cm) {
@@ -85,7 +80,7 @@ static void setup_mi(VP9_COMMON *cm) {
}
int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
- int i, mi_cols;
+ int i;
const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
@@ -140,21 +135,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
setup_mi(cm);
- // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling
- // information is exposed at this level
- mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
-
- // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
- // block where mi unit size is 8x8.
- cm->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE *
- (2 * mi_cols), 1);
- if (!cm->above_context[0])
- goto fail;
-
- cm->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
- if (!cm->above_seg_context)
- goto fail;
-
// Create the segmentation map structure and set to 0.
cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
if (!cm->last_frame_seg_map)
@@ -170,13 +150,8 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
void vp9_create_common(VP9_COMMON *cm) {
vp9_machine_specific_config(cm);
- vp9_init_mbmode_probs(cm);
-
cm->tx_mode = ONLY_4X4;
cm->comp_pred_mode = HYBRID_PREDICTION;
-
- // Initialize reference frame sign bias structure to defaults
- vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias));
}
void vp9_remove_common(VP9_COMMON *cm) {
@@ -184,24 +159,19 @@ void vp9_remove_common(VP9_COMMON *cm) {
}
void vp9_initialize_common() {
+ vp9_init_neighbors();
vp9_coef_tree_initialize();
vp9_entropy_mode_init();
vp9_entropy_mv_init();
}
void vp9_update_frame_size(VP9_COMMON *cm) {
- int i, mi_cols;
const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, MI_SIZE_LOG2);
const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, MI_SIZE_LOG2);
set_mb_mi(cm, aligned_width, aligned_height);
setup_mi(cm);
- mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
- for (i = 1; i < MAX_MB_PLANE; i++)
- cm->above_context[i] =
- cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;
-
// Initialize the previous frame segment map to 0.
if (cm->last_frame_seg_map)
vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h
index c8d677f..d0d4852 100644
--- a/libvpx/vp9/common/vp9_blockd.h
+++ b/libvpx/vp9/common/vp9_blockd.h
@@ -20,6 +20,7 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_common_data.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_mv.h"
#include "vp9/common/vp9_scale.h"
#include "vp9/common/vp9_seg_common.h"
@@ -52,18 +53,10 @@ static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
typedef enum {
KEY_FRAME = 0,
INTER_FRAME = 1,
- NUM_FRAME_TYPES,
+ FRAME_TYPES,
} FRAME_TYPE;
typedef enum {
- EIGHTTAP = 0,
- EIGHTTAP_SMOOTH = 1,
- EIGHTTAP_SHARP = 2,
- BILINEAR = 3,
- SWITCHABLE = 4 /* should be the last one */
-} INTERPOLATIONFILTERTYPE;
-
-typedef enum {
DC_PRED, // Average of above and left pixels
V_PRED, // Vertical
H_PRED, // Horizontal
@@ -81,10 +74,6 @@ typedef enum {
MB_MODE_COUNT
} MB_PREDICTION_MODE;
-static INLINE int is_intra_mode(MB_PREDICTION_MODE mode) {
- return mode <= TM_PRED;
-}
-
static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) {
return mode >= NEARESTMV && mode <= NEWMV;
}
@@ -101,10 +90,10 @@ static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) {
modes for the Y blocks to the left and above us; for interframes, there
is a single probability table. */
-union b_mode_info {
+typedef struct {
MB_PREDICTION_MODE as_mode;
int_mv as_mv[2]; // first, second inter predictor motion vectors
-};
+} b_mode_info;
typedef enum {
NONE = -1,
@@ -137,7 +126,7 @@ typedef struct {
TX_SIZE tx_size;
int_mv mv[2]; // for each reference frame used
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
- int_mv best_mv, best_second_mv;
+ int_mv best_mv[2];
uint8_t mode_context[MAX_REF_FRAMES];
@@ -147,14 +136,14 @@ typedef struct {
// Flags used for prediction status of various bit-stream signals
unsigned char seg_id_predicted;
- INTERPOLATIONFILTERTYPE interp_filter;
+ INTERPOLATION_TYPE interp_filter;
BLOCK_SIZE sb_type;
} MB_MODE_INFO;
typedef struct {
MB_MODE_INFO mbmi;
- union b_mode_info bmi[4];
+ b_mode_info bmi[4];
} MODE_INFO;
static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
@@ -203,22 +192,15 @@ typedef struct macroblockd {
struct scale_factors scale_factor[2];
MODE_INFO *last_mi;
- MODE_INFO *this_mi;
int mode_info_stride;
- MODE_INFO *mic_stream_ptr;
-
// A NULL indicates that the 8x8 is not part of the image
MODE_INFO **mi_8x8;
MODE_INFO **prev_mi_8x8;
+ MODE_INFO *mi_stream;
int up_available;
int left_available;
- int right_available;
-
- // partition contexts
- PARTITION_CONTEXT *above_seg_context;
- PARTITION_CONTEXT *left_seg_context;
/* Distance of MB away from frame edges */
int mb_to_left_edge;
@@ -228,14 +210,10 @@ typedef struct macroblockd {
int lossless;
/* Inverse transform function pointers. */
- void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
- void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride);
- void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
+ void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
struct subpix_fn_table subpix;
- int allow_high_precision_mv;
-
int corrupted;
unsigned char sb_index; // index of 32x32 block inside the 64x64 block
@@ -245,71 +223,15 @@ typedef struct macroblockd {
int q_index;
-} MACROBLOCKD;
+ /* Y,U,V,(A) */
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
-static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
- switch (subsize) {
- case BLOCK_64X64:
- case BLOCK_64X32:
- case BLOCK_32X64:
- case BLOCK_32X32:
- return &xd->sb_index;
- case BLOCK_32X16:
- case BLOCK_16X32:
- case BLOCK_16X16:
- return &xd->mb_index;
- case BLOCK_16X8:
- case BLOCK_8X16:
- case BLOCK_8X8:
- return &xd->b_index;
- case BLOCK_8X4:
- case BLOCK_4X8:
- case BLOCK_4X4:
- return &xd->ab_index;
- default:
- assert(0);
- return NULL;
- }
-}
-
-static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type,
- BLOCK_SIZE sb_size) {
- const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
- const int bwl = b_width_log2(sb_type);
- const int bhl = b_height_log2(sb_type);
- const int boffset = b_width_log2(BLOCK_64X64) - bsl;
- const char pcval0 = ~(0xe << boffset);
- const char pcval1 = ~(0xf << boffset);
- const char pcvalue[2] = {pcval0, pcval1};
-
- assert(MAX(bwl, bhl) <= bsl);
-
- // update the partition context at the end notes. set partition bits
- // of block sizes larger than the current one to be one, and partition
- // bits of smaller block sizes to be zero.
- vpx_memset(xd->above_seg_context, pcvalue[bwl == bsl], bs);
- vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs);
-}
-
-static INLINE int partition_plane_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type) {
- int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
- int above = 0, left = 0, i;
- int boffset = mi_width_log2(BLOCK_64X64) - bsl;
-
- assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
- assert(bsl >= 0);
- assert(boffset >= 0);
-
- for (i = 0; i < bs; i++)
- above |= (xd->above_seg_context[i] & (1 << boffset));
- for (i = 0; i < bs; i++)
- left |= (xd->left_seg_context[i] & (1 << boffset));
+ PARTITION_CONTEXT *above_seg_context;
+ PARTITION_CONTEXT left_seg_context[8];
+} MACROBLOCKD;
- above = (above > 0);
- left = (left > 0);
- return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
-}
static BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) {
const BLOCK_SIZE subsize = subsize_lookup[partition][bsize];
@@ -321,7 +243,7 @@ extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT];
static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
const MACROBLOCKD *xd, int ib) {
- const MODE_INFO *const mi = xd->this_mi;
+ const MODE_INFO *const mi = xd->mi_8x8[0];
const MB_MODE_INFO *const mbmi = &mi->mbmi;
if (plane_type != PLANE_TYPE_Y_WITH_DC ||
@@ -336,13 +258,13 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
return plane_type == PLANE_TYPE_Y_WITH_DC ?
- mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT;
+ mode2txfm_map[xd->mi_8x8[0]->mbmi.mode] : DCT_DCT;
}
static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
return plane_type == PLANE_TYPE_Y_WITH_DC ?
- mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT;
+ mode2txfm_map[xd->mi_8x8[0]->mbmi.mode] : DCT_DCT;
}
static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) {
@@ -391,7 +313,7 @@ static INLINE void foreach_transformed_block_in_plane(
const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
foreach_transformed_block_visitor visit, void *arg) {
const struct macroblockd_plane *const pd = &xd->plane[plane];
- const MB_MODE_INFO* mbmi = &xd->this_mi->mbmi;
+ const MB_MODE_INFO* mbmi = &xd->mi_8x8[0]->mbmi;
  // block and transform sizes, as log2 of the number of 4x4 blocks ("*_b"):
  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8.
  // the transform size varies per plane; look it up in a common way.
@@ -495,7 +417,7 @@ static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
*y = (raster_mb >> tx_cols_log2) << tx_size;
}
-static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
+static void extend_for_intra(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize,
int plane, int block, TX_SIZE tx_size) {
struct macroblockd_plane *const pd = &xd->plane[plane];
uint8_t *const buf = pd->dst.buf;
@@ -520,19 +442,22 @@ static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
}
if (xd->mb_to_bottom_edge < 0) {
- const int bh = 4 << b_height_log2(plane_bsize);
- const int umv_border_start = bh + (xd->mb_to_bottom_edge >>
- (3 + pd->subsampling_y));
- int i;
- const uint8_t c = buf[(umv_border_start - 1) * stride + x];
- uint8_t *d = &buf[umv_border_start * stride + x];
-
- if (y + bh > umv_border_start)
- for (i = 0; i < bh; ++i, d += stride)
- *d = c;
+ if (xd->left_available || x >= 0) {
+ const int bh = 4 << b_height_log2(plane_bsize);
+ const int umv_border_start =
+ bh + (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y));
+
+ if (y + bh > umv_border_start) {
+ const uint8_t c = buf[(umv_border_start - 1) * stride + x];
+ uint8_t *d = &buf[umv_border_start * stride + x];
+ int i;
+ for (i = 0; i < bh; ++i, d += stride)
+ *d = c;
+ }
+ }
}
}
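The restructured branch above fills the rows that fall below the visible frame by replicating the last in-frame row downward, so intra prediction can safely read past the bottom edge; the new left_available/x guard simply skips columns with no valid source pixel. A minimal self-contained sketch of the replication step (the function name and parameters are illustrative only):

#include <stdint.h>

/* For one column x, copy the pixel from the last row inside the frame
 * (border_start - 1) into each of the rows below the edge. */
static void replicate_below_edge(uint8_t *buf, int stride, int x,
                                 int border_start, int rows) {
  const uint8_t c = buf[(border_start - 1) * stride + x];
  uint8_t *d = &buf[border_start * stride + x];
  int i;
  for (i = 0; i < rows; ++i, d += stride)
    *d = c;
}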
-static void set_contexts_on_border(MACROBLOCKD *xd,
+static void set_contexts_on_border(const MACROBLOCKD *xd,
struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize,
int tx_size_in_blocks, int has_eob,
@@ -570,7 +495,7 @@ static void set_contexts_on_border(MACROBLOCKD *xd,
L[pt] = 0;
}
-static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd,
+static void set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
int has_eob, int aoff, int loff) {
ENTROPY_CONTEXT *const A = pd->above_context + aoff;
@@ -586,7 +511,7 @@ static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd,
}
}
-static int get_tx_eob(struct segmentation *seg, int segment_id,
+static int get_tx_eob(const struct segmentation *seg, int segment_id,
TX_SIZE tx_size) {
const int eob_max = 16 << (tx_size << 1);
return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
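For reference, eob_max == 16 << (tx_size << 1) is just the full coefficient count of each transform: 16 for TX_4X4 (0), 64 for TX_8X8 (1), 256 for TX_16X16 (2), and 1024 for TX_32X32 (3); segments with SEG_LVL_SKIP active force an end-of-block of 0 instead.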
diff --git a/libvpx/vp9/common/vp9_common.h b/libvpx/vp9/common/vp9_common.h
index 1796906..36d1cdf 100644
--- a/libvpx/vp9/common/vp9_common.h
+++ b/libvpx/vp9/common/vp9_common.h
@@ -40,8 +40,8 @@
vpx_memcpy(dest, src, n * sizeof(*src)); \
}
-#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest));
-#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest));
+#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest))
+#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest))
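Dropping the trailing semicolons from these macros fixes a classic pitfall: the old definitions expanded an invocation like vp9_zero(x); into a statement followed by an empty statement, which breaks brace-less if/else. A small sketch, using the two macros defined above, of code that only compiles with the fixed definitions:

/* With the old trailing-semicolon macros this failed to compile
 * ("else without a previous if"): the extra empty statement left after
 * expanding vp9_zero(*a); terminated the if early. */
static void reset_one(int cond, int *a, int *b, int n) {
  if (cond)
    vp9_zero(*a);
  else
    vp9_zero_array(b, n);
}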
static INLINE uint8_t clip_pixel(int val) {
return (val > 255) ? 255u : (val < 0) ? 0u : val;
@@ -84,9 +84,11 @@ static int get_unsigned_bits(unsigned int num_values) {
} while (0)
#endif
-#define SYNC_CODE_0 0x49
-#define SYNC_CODE_1 0x83
-#define SYNC_CODE_2 0x42
+#define VP9_SYNC_CODE_0 0x49
+#define VP9_SYNC_CODE_1 0x83
+#define VP9_SYNC_CODE_2 0x42
+
+#define VP9_FRAME_MARKER 0x2
#endif // VP9_COMMON_VP9_COMMON_H_
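The renamed constants gain a VP9_ namespace prefix, and VP9_FRAME_MARKER names the 2-bit marker value that opens every uncompressed frame header. A hedged sketch of how the three sync bytes (0x49 0x83 0x42) might be checked at the byte level (the real decoder reads them through its bit reader):

#include <stdint.h>

#define VP9_SYNC_CODE_0 0x49
#define VP9_SYNC_CODE_1 0x83
#define VP9_SYNC_CODE_2 0x42

/* Returns 1 if p points at the three-byte VP9 sync code. */
static int is_vp9_sync_code(const uint8_t *p) {
  return p[0] == VP9_SYNC_CODE_0 && p[1] == VP9_SYNC_CODE_1 &&
         p[2] == VP9_SYNC_CODE_2;
}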
diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c
index dc41efd..f858900 100644
--- a/libvpx/vp9/common/vp9_common_data.c
+++ b/libvpx/vp9/common/vp9_common_data.c
@@ -115,6 +115,16 @@ const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES] = {
TX_16X16, TX_16X16, TX_16X16, TX_32X32
};
+const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
+ TX_4X4, // ONLY_4X4
+ TX_8X8, // ALLOW_8X8
+ TX_16X16, // ALLOW_16X16
+ TX_32X32, // ALLOW_32X32
+ TX_32X32, // TX_MODE_SELECT
+};
+
+
+
const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
@@ -133,3 +143,4 @@ const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
{{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}},
};
+
diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h
index 3822bfc..c1f6405 100644
--- a/libvpx/vp9/common/vp9_common_data.h
+++ b/libvpx/vp9/common/vp9_common_data.h
@@ -27,6 +27,7 @@ extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES];
extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES];
+extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES];
extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
-#endif // VP9_COMMON_VP9_COMMON_DATA_H
+#endif  // VP9_COMMON_VP9_COMMON_DATA_H_
diff --git a/libvpx/vp9/common/vp9_convolve.c b/libvpx/vp9/common/vp9_convolve.c
index 94231a1..a2d864c 100644
--- a/libvpx/vp9/common/vp9_convolve.c
+++ b/libvpx/vp9/common/vp9_convolve.c
@@ -7,13 +7,13 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vp9/common/vp9_convolve.h"
#include <assert.h>
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_convolve.h"
#include "vp9/common/vp9_filter.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
@@ -35,7 +35,7 @@ static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
for (y = 0; y < h; ++y) {
/* Initial phase offset */
- int x_q4 = (filter_x0 - filter_x_base) / taps;
+ int x_q4 = (int)(filter_x0 - filter_x_base) / taps;
for (x = 0; x < w; ++x) {
/* Per-pixel src offset */
@@ -76,7 +76,7 @@ static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
for (y = 0; y < h; ++y) {
/* Initial phase offset */
- int x_q4 = (filter_x0 - filter_x_base) / taps;
+ int x_q4 = (int)(filter_x0 - filter_x_base) / taps;
for (x = 0; x < w; ++x) {
/* Per-pixel src offset */
@@ -118,7 +118,7 @@ static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
for (x = 0; x < w; ++x) {
/* Initial phase offset */
- int y_q4 = (filter_y0 - filter_y_base) / taps;
+ int y_q4 = (int)(filter_y0 - filter_y_base) / taps;
for (y = 0; y < h; ++y) {
/* Per-pixel src offset */
@@ -160,7 +160,7 @@ static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
for (x = 0; x < w; ++x) {
/* Initial phase offset */
- int y_q4 = (filter_y0 - filter_y_base) / taps;
+ int y_q4 = (int)(filter_y0 - filter_y_base) / taps;
for (y = 0; y < h; ++y) {
/* Per-pixel src offset */
@@ -282,7 +282,7 @@ void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
int r;
for (r = h; r > 0; --r) {
- memcpy(dst, src, w);
+ vpx_memcpy(dst, src, w);
src += src_stride;
dst += dst_stride;
}
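Two small fixes run through this file: memcpy is routed through vpx_memcpy like the rest of the codebase, and the phase computations gain an (int) cast because filter_x0 - filter_x_base is a ptrdiff_t (64-bit on LP64 targets) while x_q4 and y_q4 are plain ints. A minimal sketch of what the cast does (the contiguous 8-tap [phase][taps] layout is assumed, as in the code above):

#include <stddef.h>
#include <stdint.h>

/* The subpel filters are rows of a contiguous [phase][taps] table, so
 * the initial phase is (row pointer - table base) / taps.  The cast
 * narrows the ptrdiff_t difference to int before the division, matching
 * the type the x_q4/y_q4 accumulators store. */
static int initial_phase_q4(const int16_t *filter, const int16_t *base,
                            int taps) {
  return (int)(filter - base) / taps;
}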
diff --git a/libvpx/vp9/common/vp9_convolve.h b/libvpx/vp9/common/vp9_convolve.h
index 13220e9..29d4990 100644
--- a/libvpx/vp9/common/vp9_convolve.h
+++ b/libvpx/vp9/common/vp9_convolve.h
@@ -7,23 +7,16 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VP9_COMMON_CONVOLVE_H_
-#define VP9_COMMON_CONVOLVE_H_
+#ifndef VP9_COMMON_VP9_CONVOLVE_H_
+#define VP9_COMMON_VP9_CONVOLVE_H_
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
-#define FILTER_BITS 7
-
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h);
-struct subpix_fn_table {
- const int16_t (*filter_x)[8];
- const int16_t (*filter_y)[8];
-};
-
-#endif // VP9_COMMON_CONVOLVE_H_
+#endif // VP9_COMMON_VP9_CONVOLVE_H_
diff --git a/libvpx/vp9/common/vp9_debugmodes.c b/libvpx/vp9/common/vp9_debugmodes.c
index 79f769e..355ac1a 100644
--- a/libvpx/vp9/common/vp9_debugmodes.c
+++ b/libvpx/vp9/common/vp9_debugmodes.c
@@ -63,9 +63,9 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) {
print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
- log_frame_info(cm, "Vectors ",mvs);
+ log_frame_info(cm, "Vectors ", mvs);
for (mi_row = 0; mi_row < rows; mi_row++) {
- fprintf(mvs,"V ");
+ fprintf(mvs, "V ");
for (mi_col = 0; mi_col < cols; mi_col++) {
fprintf(mvs, "%4d:%4d ", mi_8x8[mi_index]->mbmi.mv[0].as_mv.row,
mi_8x8[mi_index]->mbmi.mv[0].as_mv.col);
diff --git a/libvpx/vp9/common/vp9_default_coef_probs.h b/libvpx/vp9/common/vp9_default_coef_probs.h
index 185fced..3b512be 100644
--- a/libvpx/vp9/common/vp9_default_coef_probs.h
+++ b/libvpx/vp9/common/vp9_default_coef_probs.h
@@ -7,6 +7,8 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef VP9_COMMON_DEFAULT_COEF_PROBS_H_
+#define VP9_COMMON_DEFAULT_COEF_PROBS_H_
/* Generated file, included by vp9_entropy.c */
static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = {
@@ -694,3 +696,4 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = {
}
};
+#endif // VP9_COMMON_DEFAULT_COEF_PROBS_H_
diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c
index 32d9e0c..d3a867c 100644
--- a/libvpx/vp9/common/vp9_entropy.c
+++ b/libvpx/vp9/common/vp9_entropy.c
@@ -52,156 +52,11 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
};
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = {
- 0, 4, 1, 5,
- 8, 2, 12, 9,
- 3, 6, 13, 10,
- 7, 14, 11, 15,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = {
- 0, 4, 8, 1,
- 12, 5, 9, 2,
- 13, 6, 10, 3,
- 7, 14, 11, 15,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
- 0, 1, 4, 2,
- 5, 3, 6, 8,
- 9, 7, 12, 10,
- 13, 11, 14, 15,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
- 0, 8, 1, 16, 9, 2, 17, 24,
- 10, 3, 18, 25, 32, 11, 4, 26,
- 33, 19, 40, 12, 34, 27, 5, 41,
- 20, 48, 13, 35, 42, 28, 21, 6,
- 49, 56, 36, 43, 29, 7, 14, 50,
- 57, 44, 22, 37, 15, 51, 58, 30,
- 45, 23, 52, 59, 38, 31, 60, 53,
- 46, 39, 61, 54, 47, 62, 55, 63,
-};
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
- 0, 8, 16, 1, 24, 9, 32, 17,
- 2, 40, 25, 10, 33, 18, 48, 3,
- 26, 41, 11, 56, 19, 34, 4, 49,
- 27, 42, 12, 35, 20, 57, 50, 28,
- 5, 43, 13, 36, 58, 51, 21, 44,
- 6, 29, 59, 37, 14, 52, 22, 7,
- 45, 60, 30, 15, 38, 53, 23, 46,
- 31, 61, 39, 54, 47, 62, 55, 63,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
- 0, 1, 2, 8, 9, 3, 16, 10,
- 4, 17, 11, 24, 5, 18, 25, 12,
- 19, 26, 32, 6, 13, 20, 33, 27,
- 7, 34, 40, 21, 28, 41, 14, 35,
- 48, 42, 29, 36, 49, 22, 43, 15,
- 56, 37, 50, 44, 30, 57, 23, 51,
- 58, 45, 38, 52, 31, 59, 53, 46,
- 60, 39, 61, 47, 54, 55, 62, 63,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
- 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
- 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
- 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
- 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146,
- 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25,
- 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119,
- 26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194,
- 180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59,
- 12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13,
- 226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169,
- 242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108,
- 77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140,
- 230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141,
- 63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142,
- 219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251,
- 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, 255,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
- 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
- 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
- 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
- 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85,
- 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179,
- 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24,
- 87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227,
- 88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167,
- 213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229,
- 74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59,
- 200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170,
- 60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202,
- 233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125,
- 62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79,
- 126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236,
- 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
- 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
- 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
- 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
- 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100,
- 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102,
- 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160,
- 89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176,
- 75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136,
- 165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166,
- 167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108,
- 197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170,
- 124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186,
- 156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110,
- 157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158,
- 188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175,
- 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
- 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100,
- 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197,
- 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, 41, 417, 199, 136,
- 262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105, 419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169, 295, 420, 106, 451,
- 481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421, 75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391, 453, 139, 44, 234,
- 484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108, 546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577, 486, 77, 204, 362,
- 608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173, 610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17, 111, 238, 48, 143,
- 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51, 83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424, 393, 300, 269, 176, 145,
- 52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301, 270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, 426, 395,
- 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737, 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241, 210, 179, 117, 86, 55, 738, 707,
- 614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491, 367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676, 645, 552, 521, 428, 397, 304,
- 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553, 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26, 864, 833, 802, 771, 740, 709,
- 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741, 710, 679, 617, 586, 555, 493,
- 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835, 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867, 743, 619, 495, 371, 247, 123,
- 896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680, 649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929, 898, 836, 805, 774, 712, 681,
- 650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154, 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, 651, 620, 589, 558, 527,
- 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590, 559, 497, 466, 435, 373,
- 342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715, 622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623, 499, 375, 251, 127,
- 900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560, 529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716, 685, 654, 592, 561,
- 530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903, 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, 407, 376, 345,
- 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718, 687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998, 967, 874, 843, 750,
- 719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503, 379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657, 564, 533, 440, 409,
- 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534, 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783, 752, 721, 690, 659,
- 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970, 939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381, 350, 319, 1002, 971,
- 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631, 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568, 537, 444, 413, 972,
- 941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414, 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477,
- 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571, 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479, 1007, 883, 759, 635, 511,
- 912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945, 914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915, 884, 853, 822, 791,
- 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823, 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607, 1011, 887, 763, 639,
- 916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825, 794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 1013, 982,
- 951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015, 891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798, 1016, 985, 954, 923,
- 892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863, 1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023,
-};
/* Array indices are identical to previously-existing CONTEXT_NODE indices */
-const vp9_tree_index vp9_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */
-{
+const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
-DCT_EOB_TOKEN, 2, /* 0 = EOB */
-ZERO_TOKEN, 4, /* 1 = ZERO */
-ONE_TOKEN, 6, /* 2 = ONE */
@@ -419,7 +274,7 @@ static void init_bit_trees() {
init_bit_tree(cat6, 14);
}
-const vp9_extra_bit vp9_extra_bits[12] = {
+const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = {
{ 0, 0, 0, 0},
{ 0, 0, 0, 1},
{ 0, 0, 0, 2},
@@ -443,159 +298,7 @@ void vp9_default_coef_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
}
-// Neighborhood 5-tuples for various scans and blocksizes,
-// in {top, left, topleft, topright, bottomleft} order
-// for each position in raster scan order.
-// -1 indicates the neighbor does not exist.
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
-
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
-
-static int find_in_scan(const int16_t *scan, int l, int idx) {
- int n, l2 = l * l;
- for (n = 0; n < l2; n++) {
- int rc = scan[n];
- if (rc == idx)
- return n;
- }
- assert(0);
- return -1;
-}
-static void init_scan_neighbors(const int16_t *scan,
- int16_t *iscan,
- int l, int16_t *neighbors) {
- int l2 = l * l;
- int n, i, j;
-
- // dc doesn't use this type of prediction
- neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
- neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
- iscan[0] = find_in_scan(scan, l, 0);
- for (n = 1; n < l2; n++) {
- int rc = scan[n];
- iscan[n] = find_in_scan(scan, l, n);
- i = rc / l;
- j = rc % l;
- if (i > 0 && j > 0) {
- // col/row scan is used for adst/dct, and generally means that
- // energy decreases to zero much faster in the dimension in
- // which ADST is used compared to the direction in which DCT
- // is used. Likewise, we find much higher correlation between
- // coefficients within the direction in which DCT is used.
- // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
- // as a context. If ADST or DCT is used in both directions, we
- // use the combination of the two as a context.
- int a = (i - 1) * l + j;
- int b = i * l + j - 1;
- if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||
- scan == vp9_col_scan_16x16) {
- // in the col/row scan cases (as well as left/top edge cases), we set
- // both contexts to the same value, so we can branchlessly do a+b+1>>1
- // which automatically becomes a if a == b
- neighbors[MAX_NEIGHBORS * n + 0] =
- neighbors[MAX_NEIGHBORS * n + 1] = a;
- } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||
- scan == vp9_row_scan_16x16) {
- neighbors[MAX_NEIGHBORS * n + 0] =
- neighbors[MAX_NEIGHBORS * n + 1] = b;
- } else {
- neighbors[MAX_NEIGHBORS * n + 0] = a;
- neighbors[MAX_NEIGHBORS * n + 1] = b;
- }
- } else if (i > 0) {
- neighbors[MAX_NEIGHBORS * n + 0] =
- neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j;
- } else {
- assert(j > 0);
- neighbors[MAX_NEIGHBORS * n + 0] =
- neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1;
- }
- assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n);
- }
- // one padding item so we don't have to add branches in code to handle
- // calls to get_coef_context() for the token after the final dc token
- neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
- neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
-}
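Concretely, with the removed vp9_default_scan_4x4: the coefficient at raster position rc == 5 (i == 1, j == 1) has a == (i - 1) * 4 + j == 1 (its above neighbor) and b == i * 4 + j - 1 == 4 (its left neighbor). The default 2-D scan keeps both as context neighbors; the row/col scans, used when ADST is applied in one direction, collapse the pair onto the DCT-direction neighbor so the later (a + b + 1) >> 1 average degenerates to that single coefficient.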
-
-void vp9_init_neighbors() {
- init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
- vp9_default_scan_4x4_neighbors);
- init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
- vp9_row_scan_4x4_neighbors);
- init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
- vp9_col_scan_4x4_neighbors);
- init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
- vp9_default_scan_8x8_neighbors);
- init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
- vp9_row_scan_8x8_neighbors);
- init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
- vp9_col_scan_8x8_neighbors);
- init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
- vp9_default_scan_16x16_neighbors);
- init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
- vp9_row_scan_16x16_neighbors);
- init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
- vp9_col_scan_16x16_neighbors);
- init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
- vp9_default_scan_32x32_neighbors);
-}
-
-const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) {
- if (scan == vp9_default_scan_4x4) {
- return vp9_default_scan_4x4_neighbors;
- } else if (scan == vp9_row_scan_4x4) {
- return vp9_row_scan_4x4_neighbors;
- } else if (scan == vp9_col_scan_4x4) {
- return vp9_col_scan_4x4_neighbors;
- } else if (scan == vp9_default_scan_8x8) {
- return vp9_default_scan_8x8_neighbors;
- } else if (scan == vp9_row_scan_8x8) {
- return vp9_row_scan_8x8_neighbors;
- } else if (scan == vp9_col_scan_8x8) {
- return vp9_col_scan_8x8_neighbors;
- } else if (scan == vp9_default_scan_16x16) {
- return vp9_default_scan_16x16_neighbors;
- } else if (scan == vp9_row_scan_16x16) {
- return vp9_row_scan_16x16_neighbors;
- } else if (scan == vp9_col_scan_16x16) {
- return vp9_col_scan_16x16_neighbors;
- } else {
- assert(scan == vp9_default_scan_32x32);
- return vp9_default_scan_32x32_neighbors;
- }
-}
-
void vp9_coef_tree_initialize() {
- vp9_init_neighbors();
init_bit_trees();
vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
}
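The size rewrites above (22 becomes TREE_SIZE(MAX_ENTROPY_TOKENS)) rely on TREE_SIZE(n) being the 2 * n - 2 indices a full binary tree with n leaves needs. These vp9_tree_index arrays encode such trees directly: a positive entry is the index of the next left/right node pair, and a non-positive entry is the negated leaf value. A minimal decode sketch under those assumptions (the element type and the bit source are stand-ins for vp9_treecoder.h and the real bool decoder):

#include <stdint.h>

typedef int16_t vp9_tree_index;  /* assumption: width per vp9_treecoder.h */

/* Walk a vp9-style token tree: start at node 0, consume one bit per
 * step to pick the left/right child, and stop when the entry goes
 * non-positive, returning the (negated) leaf value.  Note ZERO_TOKEN
 * is 0, so the stop condition is "not positive", not "negative". */
static int tree_read(const vp9_tree_index *tree,
                     int (*read_bit)(void *ctx), void *ctx) {
  vp9_tree_index i = 0;
  do {
    i = tree[i + read_bit(ctx)];
  } while (i > 0);
  return -i;
}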
@@ -612,16 +315,15 @@ void vp9_coef_tree_initialize() {
static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
unsigned int count_sat,
unsigned int update_factor) {
- FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+ const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[tx_size];
- vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size];
+ const vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size];
vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size];
unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
cm->counts.eob_branch[tx_size];
- int t, i, j, k, l;
+ int i, j, k, l, m;
unsigned int branch_ct[UNCONSTRAINED_NODES][2];
- vp9_prob coef_probs[UNCONSTRAINED_NODES];
for (i = 0; i < BLOCK_TYPES; ++i)
for (j = 0; j < REF_TYPES; ++j)
@@ -629,15 +331,14 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
if (l >= 3 && k == 0)
continue;
- vp9_tree_probs_from_distribution(vp9_coefmodel_tree, coef_probs,
- branch_ct, coef_counts[i][j][k][l],
- 0);
+ vp9_tree_probs_from_distribution(vp9_coefmodel_tree, branch_ct,
+ coef_counts[i][j][k][l], 0);
branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
- coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
- for (t = 0; t < UNCONSTRAINED_NODES; ++t)
- dst_coef_probs[i][j][k][l][t] = merge_probs(
- pre_coef_probs[i][j][k][l][t], coef_probs[t],
- branch_ct[t], count_sat, update_factor);
+ for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ dst_coef_probs[i][j][k][l][m] = merge_probs(
+ pre_coef_probs[i][j][k][l][m],
+ branch_ct[m],
+ count_sat, update_factor);
}
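After this change the per-token probability is derived inside merge_probs from the branch counts instead of being precomputed into a local coef_probs array. A hedged sketch of what a merge of this shape computes (the real merge_probs lives in vp9_treecoder.h and may differ in rounding and clamping details):

#include <stdint.h>

/* Blend the previous frame's probability toward one estimated from the
 * current frame's branch counts; the blend weight grows with the count
 * and saturates at count_sat, scaled by max_update_factor. */
static uint8_t merge_prob_sketch(uint8_t pre_prob,
                                 unsigned int ct0, unsigned int ct1,
                                 unsigned int count_sat,
                                 unsigned int max_update_factor) {
  const unsigned int den = ct0 + ct1;
  /* probability of the 0-branch from this frame, clamped to [1, 255] */
  unsigned int prob = den ? (256 * ct0 + den / 2) / den : 128;
  const unsigned int count = den < count_sat ? den : count_sat;
  const unsigned int factor = max_update_factor * count / count_sat;
  if (prob < 1) prob = 1;
  if (prob > 255) prob = 255;
  return (uint8_t)((pre_prob * (256 - factor) + prob * factor + 128) >> 8);
}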
}
@@ -645,7 +346,7 @@ void vp9_adapt_coef_probs(VP9_COMMON *cm) {
TX_SIZE t;
unsigned int count_sat, update_factor;
- if (cm->frame_type == KEY_FRAME || cm->intra_only) {
+ if (frame_is_intra_only(cm)) {
update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
count_sat = COEF_COUNT_SAT_KEY;
} else if (cm->last_frame_type == KEY_FRAME) {
diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h
index f138c09..c58e852 100644
--- a/libvpx/vp9/common/vp9_entropy.h
+++ b/libvpx/vp9/common/vp9_entropy.h
@@ -12,9 +12,13 @@
#define VP9_COMMON_VP9_ENTROPY_H_
#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_treecoder.h"
+
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/common/vp9_treecoder.h"
+
+#define DIFF_UPDATE_PROB 252
/* Coefficient token alphabet */
@@ -36,7 +40,10 @@
#define INTER_MODE_CONTEXTS 7
-extern const vp9_tree_index vp9_coef_tree[];
+extern DECLARE_ALIGNED(16, const uint8_t,
+ vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+
+extern const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)];
#define DCT_EOB_MODEL_TOKEN 3 /* EOB Extra Bits 0+0 */
extern const vp9_tree_index vp9_coefmodel_tree[];
@@ -44,13 +51,14 @@ extern const vp9_tree_index vp9_coefmodel_tree[];
extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
typedef struct {
- vp9_tree_p tree;
+ vp9_tree_index *tree;
const vp9_prob *prob;
int len;
int base_val;
} vp9_extra_bit;
-extern const vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */
+// indexed by token value
+extern const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS];
#define MAX_PROB 255
#define DCT_MAX_VALUE 16384
@@ -88,72 +96,14 @@ typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
[MAX_ENTROPY_TOKENS];
typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
[ENTROPY_NODES][2];
-typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
- [ENTROPY_NODES];
#define SUBEXP_PARAM 4 /* Subexponential code parameter */
#define MODULUS_PARAM 13 /* Modulus parameter */
struct VP9Common;
void vp9_default_coef_probs(struct VP9Common *cm);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
-#define MAX_NEIGHBORS 2
-
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
-
-void vp9_coef_tree_initialize(void);
+void vp9_coef_tree_initialize();
void vp9_adapt_coef_probs(struct VP9Common *cm);
static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
@@ -183,16 +133,6 @@ static int get_coef_band(const uint8_t * band_translate, int coef_index) {
? (COEF_BANDS-1) : band_translate[coef_index];
}
-static INLINE int get_coef_context(const int16_t *neighbors,
- uint8_t *token_cache,
- int c) {
- return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
- token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
-}
-
-const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan);
-
-
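The removed inline records the coefficient-context rule, presumably relocated along with the scan tables to the new vp9_scan.h header included above: the context for token c is (1 + token_cache[nb0] + token_cache[nb1]) >> 1, the rounded mean of the two neighbors' cached energy classes. For example, neighbors of classes 1 and 2 give (1 + 1 + 2) >> 1 == 2.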
// 128 lists of probabilities are stored for the following ONE node probs:
// 1, 3, 5, 7, ..., 253, 255
// In-between probabilities are interpolated linearly.
@@ -210,171 +150,62 @@ typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
[PREV_COEF_CONTEXTS]
[UNCONSTRAINED_NODES + 1];
-typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [UNCONSTRAINED_NODES][2];
void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
-static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_scan_4x4;
- case DCT_ADST:
- return vp9_col_scan_4x4;
- default:
- return vp9_default_scan_4x4;
- }
-}
+static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
+ const ENTROPY_CONTEXT *l) {
+ ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
-static INLINE void get_scan_nb_4x4(TX_TYPE tx_type,
- const int16_t **scan, const int16_t **nb) {
- switch (tx_type) {
- case ADST_DCT:
- *scan = vp9_row_scan_4x4;
- *nb = vp9_row_scan_4x4_neighbors;
- break;
- case DCT_ADST:
- *scan = vp9_col_scan_4x4;
- *nb = vp9_col_scan_4x4_neighbors;
- break;
- default:
- *scan = vp9_default_scan_4x4;
- *nb = vp9_default_scan_4x4_neighbors;
+ switch (tx_size) {
+ case TX_4X4:
+ above_ec = a[0] != 0;
+ left_ec = l[0] != 0;
break;
- }
-}
-
-static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_iscan_4x4;
- case DCT_ADST:
- return vp9_col_iscan_4x4;
- default:
- return vp9_default_iscan_4x4;
- }
-}
-
-static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_scan_8x8;
- case DCT_ADST:
- return vp9_col_scan_8x8;
- default:
- return vp9_default_scan_8x8;
- }
-}
-
-static INLINE void get_scan_nb_8x8(TX_TYPE tx_type,
- const int16_t **scan, const int16_t **nb) {
- switch (tx_type) {
- case ADST_DCT:
- *scan = vp9_row_scan_8x8;
- *nb = vp9_row_scan_8x8_neighbors;
+ case TX_8X8:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint16_t *)l;
break;
- case DCT_ADST:
- *scan = vp9_col_scan_8x8;
- *nb = vp9_col_scan_8x8_neighbors;
+ case TX_16X16:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint32_t *)l;
break;
- default:
- *scan = vp9_default_scan_8x8;
- *nb = vp9_default_scan_8x8_neighbors;
+ case TX_32X32:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint64_t *)l;
break;
- }
-}
-
-static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_iscan_8x8;
- case DCT_ADST:
- return vp9_col_iscan_8x8;
default:
- return vp9_default_iscan_8x8;
- }
-}
-
-static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_scan_16x16;
- case DCT_ADST:
- return vp9_col_scan_16x16;
- default:
- return vp9_default_scan_16x16;
+ assert(!"Invalid transform size.");
}
-}
-static INLINE void get_scan_nb_16x16(TX_TYPE tx_type,
- const int16_t **scan, const int16_t **nb) {
- switch (tx_type) {
- case ADST_DCT:
- *scan = vp9_row_scan_16x16;
- *nb = vp9_row_scan_16x16_neighbors;
- break;
- case DCT_ADST:
- *scan = vp9_col_scan_16x16;
- *nb = vp9_col_scan_16x16_neighbors;
- break;
- default:
- *scan = vp9_default_scan_16x16;
- *nb = vp9_default_scan_16x16_neighbors;
- break;
- }
+ return combine_entropy_contexts(above_ec, left_ec);
}
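The new get_entropy_context tests all context bytes covered by one transform edge with a single widened load (2, 4, or 8 bytes for TX_8X8 through TX_32X32), which relies on the context arrays being suitably aligned. A byte-wise sketch of what each case computes (ENTROPY_CONTEXT is assumed to be a char-sized type):

/* Load-free equivalent: an edge contributes a context of 1 iff any of
 * its 1 << tx_size context bytes is non-zero. */
static int edge_context(const signed char *ctx, int tx_size) {
  const int n = 1 << tx_size;  /* 1, 2, 4 or 8 bytes */
  int i, any = 0;
  for (i = 0; i < n; ++i)
    any |= ctx[i] != 0;
  return any;
}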
-static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_iscan_16x16;
- case DCT_ADST:
- return vp9_col_iscan_16x16;
- default:
- return vp9_default_iscan_16x16;
- }
+static const uint8_t *get_band_translate(TX_SIZE tx_size) {
+ return tx_size == TX_4X4 ? vp9_coefband_trans_4x4
+ : vp9_coefband_trans_8x8plus;
}
-static int get_entropy_context(const MACROBLOCKD *xd, TX_SIZE tx_size,
- PLANE_TYPE type, int block_idx,
- ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
- const int16_t **scan,
- const uint8_t **band_translate) {
- ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
-
+static void get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
+ PLANE_TYPE type, int block_idx,
+ const int16_t **scan, const int16_t **scan_nb) {
switch (tx_size) {
case TX_4X4:
- *scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx));
- *band_translate = vp9_coefband_trans_4x4;
- above_ec = A[0] != 0;
- left_ec = L[0] != 0;
+ get_scan_nb_4x4(get_tx_type_4x4(type, xd, block_idx), scan, scan_nb);
break;
case TX_8X8:
- *scan = get_scan_8x8(get_tx_type_8x8(type, xd));
- *band_translate = vp9_coefband_trans_8x8plus;
- above_ec = !!*(uint16_t *)A;
- left_ec = !!*(uint16_t *)L;
+ get_scan_nb_8x8(get_tx_type_8x8(type, xd), scan, scan_nb);
break;
case TX_16X16:
- *scan = get_scan_16x16(get_tx_type_16x16(type, xd));
- *band_translate = vp9_coefband_trans_8x8plus;
- above_ec = !!*(uint32_t *)A;
- left_ec = !!*(uint32_t *)L;
+ get_scan_nb_16x16(get_tx_type_16x16(type, xd), scan, scan_nb);
break;
case TX_32X32:
*scan = vp9_default_scan_32x32;
- *band_translate = vp9_coefband_trans_8x8plus;
- above_ec = !!*(uint64_t *)A;
- left_ec = !!*(uint64_t *)L;
+ *scan_nb = vp9_default_scan_32x32_neighbors;
break;
default:
assert(!"Invalid transform size.");
}
-
- return combine_entropy_contexts(above_ec, left_ec);
}
-enum { VP9_COEF_UPDATE_PROB = 252 };
-
#endif // VP9_COMMON_VP9_ENTROPY_H_
diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c
index 93c89b0..a963d55 100644
--- a/libvpx/vp9/common/vp9_entropymode.c
+++ b/libvpx/vp9/common/vp9_entropymode.c
@@ -14,204 +14,199 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_seg_common.h"
-const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES]
- [INTRA_MODES - 1] = {
- { 144, 11, 54, 157, 195, 130, 46, 58, 108 } /* y = dc */,
- { 118, 15, 123, 148, 131, 101, 44, 93, 131 } /* y = v */,
- { 113, 12, 23, 188, 226, 142, 26, 32, 125 } /* y = h */,
- { 120, 11, 50, 123, 163, 135, 64, 77, 103 } /* y = d45 */,
- { 113, 9, 36, 155, 111, 157, 32, 44, 161 } /* y = d135 */,
- { 116, 9, 55, 176, 76, 96, 37, 61, 149 } /* y = d117 */,
- { 115, 9, 28, 141, 161, 167, 21, 25, 193 } /* y = d153 */,
- { 120, 12, 32, 145, 195, 142, 32, 38, 86 } /* y = d207 */,
- { 116, 12, 64, 120, 140, 125, 49, 115, 121 } /* y = d63 */,
- { 102, 19, 66, 162, 182, 122, 35, 59, 128 } /* y = tm */
+const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
+ { // above = dc
+ { 137, 30, 42, 148, 151, 207, 70, 52, 91 }, // left = dc
+ { 92, 45, 102, 136, 116, 180, 74, 90, 100 }, // left = v
+ { 73, 32, 19, 187, 222, 215, 46, 34, 100 }, // left = h
+ { 91, 30, 32, 116, 121, 186, 93, 86, 94 }, // left = d45
+ { 72, 35, 36, 149, 68, 206, 68, 63, 105 }, // left = d135
+ { 73, 31, 28, 138, 57, 124, 55, 122, 151 }, // left = d117
+ { 67, 23, 21, 140, 126, 197, 40, 37, 171 }, // left = d153
+ { 86, 27, 28, 128, 154, 212, 45, 43, 53 }, // left = d207
+ { 74, 32, 27, 107, 86, 160, 63, 134, 102 }, // left = d63
+ { 59, 67, 44, 140, 161, 202, 78, 67, 119 } // left = tm
+ }, { // above = v
+ { 63, 36, 126, 146, 123, 158, 60, 90, 96 }, // left = dc
+ { 43, 46, 168, 134, 107, 128, 69, 142, 92 }, // left = v
+ { 44, 29, 68, 159, 201, 177, 50, 57, 77 }, // left = h
+ { 58, 38, 76, 114, 97, 172, 78, 133, 92 }, // left = d45
+ { 46, 41, 76, 140, 63, 184, 69, 112, 57 }, // left = d135
+ { 38, 32, 85, 140, 46, 112, 54, 151, 133 }, // left = d117
+ { 39, 27, 61, 131, 110, 175, 44, 75, 136 }, // left = d153
+ { 52, 30, 74, 113, 130, 175, 51, 64, 58 }, // left = d207
+ { 47, 35, 80, 100, 74, 143, 64, 163, 74 }, // left = d63
+ { 36, 61, 116, 114, 128, 162, 80, 125, 82 } // left = tm
+ }, { // above = h
+ { 82, 26, 26, 171, 208, 204, 44, 32, 105 }, // left = dc
+ { 55, 44, 68, 166, 179, 192, 57, 57, 108 }, // left = v
+ { 42, 26, 11, 199, 241, 228, 23, 15, 85 }, // left = h
+ { 68, 42, 19, 131, 160, 199, 55, 52, 83 }, // left = d45
+ { 58, 50, 25, 139, 115, 232, 39, 52, 118 }, // left = d135
+ { 50, 35, 33, 153, 104, 162, 64, 59, 131 }, // left = d117
+ { 44, 24, 16, 150, 177, 202, 33, 19, 156 }, // left = d153
+ { 55, 27, 12, 153, 203, 218, 26, 27, 49 }, // left = d207
+ { 53, 49, 21, 110, 116, 168, 59, 80, 76 }, // left = d63
+ { 38, 72, 19, 168, 203, 212, 50, 50, 107 } // left = tm
+ }, { // above = d45
+ { 103, 26, 36, 129, 132, 201, 83, 80, 93 }, // left = dc
+ { 59, 38, 83, 112, 103, 162, 98, 136, 90 }, // left = v
+ { 62, 30, 23, 158, 200, 207, 59, 57, 50 }, // left = h
+ { 67, 30, 29, 84, 86, 191, 102, 91, 59 }, // left = d45
+ { 60, 32, 33, 112, 71, 220, 64, 89, 104 }, // left = d135
+ { 53, 26, 34, 130, 56, 149, 84, 120, 103 }, // left = d117
+ { 53, 21, 23, 133, 109, 210, 56, 77, 172 }, // left = d153
+ { 77, 19, 29, 112, 142, 228, 55, 66, 36 }, // left = d207
+ { 61, 29, 29, 93, 97, 165, 83, 175, 162 }, // left = d63
+ { 47, 47, 43, 114, 137, 181, 100, 99, 95 } // left = tm
+ }, { // above = d135
+ { 69, 23, 29, 128, 83, 199, 46, 44, 101 }, // left = dc
+ { 53, 40, 55, 139, 69, 183, 61, 80, 110 }, // left = v
+ { 40, 29, 19, 161, 180, 207, 43, 24, 91 }, // left = h
+ { 60, 34, 19, 105, 61, 198, 53, 64, 89 }, // left = d45
+ { 52, 31, 22, 158, 40, 209, 58, 62, 89 }, // left = d135
+ { 44, 31, 29, 147, 46, 158, 56, 102, 198 }, // left = d117
+ { 35, 19, 12, 135, 87, 209, 41, 45, 167 }, // left = d153
+ { 55, 25, 21, 118, 95, 215, 38, 39, 66 }, // left = d207
+ { 51, 38, 25, 113, 58, 164, 70, 93, 97 }, // left = d63
+ { 47, 54, 34, 146, 108, 203, 72, 103, 151 } // left = tm
+ }, { // above = d117
+ { 64, 19, 37, 156, 66, 138, 49, 95, 133 }, // left = dc
+ { 46, 27, 80, 150, 55, 124, 55, 121, 135 }, // left = v
+ { 36, 23, 27, 165, 149, 166, 54, 64, 118 }, // left = h
+ { 53, 21, 36, 131, 63, 163, 60, 109, 81 }, // left = d45
+ { 40, 26, 35, 154, 40, 185, 51, 97, 123 }, // left = d135
+ { 35, 19, 34, 179, 19, 97, 48, 129, 124 }, // left = d117
+ { 36, 20, 26, 136, 62, 164, 33, 77, 154 }, // left = d153
+ { 45, 18, 32, 130, 90, 157, 40, 79, 91 }, // left = d207
+ { 45, 26, 28, 129, 45, 129, 49, 147, 123 }, // left = d63
+ { 38, 44, 51, 136, 74, 162, 57, 97, 121 } // left = tm
+ }, { // above = d153
+ { 75, 17, 22, 136, 138, 185, 32, 34, 166 }, // left = dc
+ { 56, 39, 58, 133, 117, 173, 48, 53, 187 }, // left = v
+ { 35, 21, 12, 161, 212, 207, 20, 23, 145 }, // left = h
+ { 56, 29, 19, 117, 109, 181, 55, 68, 112 }, // left = d45
+ { 47, 29, 17, 153, 64, 220, 59, 51, 114 }, // left = d135
+ { 46, 16, 24, 136, 76, 147, 41, 64, 172 }, // left = d117
+ { 34, 17, 11, 108, 152, 187, 13, 15, 209 }, // left = d153
+ { 51, 24, 14, 115, 133, 209, 32, 26, 104 }, // left = d207
+ { 55, 30, 18, 122, 79, 179, 44, 88, 116 }, // left = d63
+ { 37, 49, 25, 129, 168, 164, 41, 54, 148 } // left = tm
+ }, { // above = d207
+ { 82, 22, 32, 127, 143, 213, 39, 41, 70 }, // left = dc
+ { 62, 44, 61, 123, 105, 189, 48, 57, 64 }, // left = v
+ { 47, 25, 17, 175, 222, 220, 24, 30, 86 }, // left = h
+ { 68, 36, 17, 106, 102, 206, 59, 74, 74 }, // left = d45
+ { 57, 39, 23, 151, 68, 216, 55, 63, 58 }, // left = d135
+ { 49, 30, 35, 141, 70, 168, 82, 40, 115 }, // left = d117
+ { 51, 25, 15, 136, 129, 202, 38, 35, 139 }, // left = d153
+ { 68, 26, 16, 111, 141, 215, 29, 28, 28 }, // left = d207
+ { 59, 39, 19, 114, 75, 180, 77, 104, 42 }, // left = d63
+ { 40, 61, 26, 126, 152, 206, 61, 59, 93 } // left = tm
+ }, { // above = d63
+ { 78, 23, 39, 111, 117, 170, 74, 124, 94 }, // left = dc
+ { 48, 34, 86, 101, 92, 146, 78, 179, 134 }, // left = v
+ { 47, 22, 24, 138, 187, 178, 68, 69, 59 }, // left = h
+ { 56, 25, 33, 105, 112, 187, 95, 177, 129 }, // left = d45
+ { 48, 31, 27, 114, 63, 183, 82, 116, 56 }, // left = d135
+ { 43, 28, 37, 121, 63, 123, 61, 192, 169 }, // left = d117
+ { 42, 17, 24, 109, 97, 177, 56, 76, 122 }, // left = d153
+ { 58, 18, 28, 105, 139, 182, 70, 92, 63 }, // left = d207
+ { 46, 23, 32, 74, 86, 150, 67, 183, 88 }, // left = d63
+ { 36, 38, 48, 92, 122, 165, 88, 137, 91 } // left = tm
+ }, { // above = tm
+ { 65, 70, 60, 155, 159, 199, 61, 60, 81 }, // left = dc
+ { 44, 78, 115, 132, 119, 173, 71, 112, 93 }, // left = v
+ { 39, 38, 21, 184, 227, 206, 42, 32, 64 }, // left = h
+ { 58, 47, 36, 124, 137, 193, 80, 82, 78 }, // left = d45
+ { 49, 50, 35, 144, 95, 205, 63, 78, 59 }, // left = d135
+ { 41, 53, 52, 148, 71, 142, 65, 128, 51 }, // left = d117
+ { 40, 36, 28, 143, 143, 202, 40, 55, 137 }, // left = d153
+ { 52, 34, 29, 129, 183, 227, 42, 35, 43 }, // left = d207
+ { 42, 44, 44, 104, 105, 164, 64, 130, 80 }, // left = d63
+ { 43, 81, 53, 140, 169, 204, 68, 84, 72 } // left = tm
+ }
};
-static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS]
- [INTRA_MODES - 1] = {
- { 65, 32, 18, 144, 162, 194, 41, 51, 98 } /* block_size < 8x8 */,
- { 132, 68, 18, 165, 217, 196, 45, 40, 78 } /* block_size < 16x16 */,
- { 173, 80, 19, 176, 240, 193, 64, 35, 46 } /* block_size < 32x32 */,
- { 221, 135, 38, 194, 248, 121, 96, 85, 29 } /* block_size >= 32x32 */
+const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
+ { 144, 11, 54, 157, 195, 130, 46, 58, 108 }, // y = dc
+ { 118, 15, 123, 148, 131, 101, 44, 93, 131 }, // y = v
+ { 113, 12, 23, 188, 226, 142, 26, 32, 125 }, // y = h
+ { 120, 11, 50, 123, 163, 135, 64, 77, 103 }, // y = d45
+ { 113, 9, 36, 155, 111, 157, 32, 44, 161 }, // y = d135
+ { 116, 9, 55, 176, 76, 96, 37, 61, 149 }, // y = d117
+ { 115, 9, 28, 141, 161, 167, 21, 25, 193 }, // y = d153
+ { 120, 12, 32, 145, 195, 142, 32, 38, 86 }, // y = d207
+ { 116, 12, 64, 120, 140, 125, 49, 115, 121 }, // y = d63
+ { 102, 19, 66, 162, 182, 122, 35, 59, 128 } // y = tm
};
-static const vp9_prob default_if_uv_probs[INTRA_MODES]
- [INTRA_MODES - 1] = {
- { 120, 7, 76, 176, 208, 126, 28, 54, 103 } /* y = dc */,
- { 48, 12, 154, 155, 139, 90, 34, 117, 119 } /* y = v */,
- { 67, 6, 25, 204, 243, 158, 13, 21, 96 } /* y = h */,
- { 97, 5, 44, 131, 176, 139, 48, 68, 97 } /* y = d45 */,
- { 83, 5, 42, 156, 111, 152, 26, 49, 152 } /* y = d135 */,
- { 80, 5, 58, 178, 74, 83, 33, 62, 145 } /* y = d117 */,
- { 86, 5, 32, 154, 192, 168, 14, 22, 163 } /* y = d153 */,
- { 85, 5, 32, 156, 216, 148, 19, 29, 73 } /* y = d207 */,
- { 77, 7, 64, 116, 132, 122, 37, 126, 120 } /* y = d63 */,
- { 101, 21, 107, 181, 192, 103, 19, 67, 125 } /* y = tm */
+static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
+ { 65, 32, 18, 144, 162, 194, 41, 51, 98 }, // block_size < 8x8
+ { 132, 68, 18, 165, 217, 196, 45, 40, 78 }, // block_size < 16x16
+ { 173, 80, 19, 176, 240, 193, 64, 35, 46 }, // block_size < 32x32
+ { 221, 135, 38, 194, 248, 121, 96, 85, 29 } // block_size >= 32x32
};
-static const vp9_prob default_partition_probs[NUM_FRAME_TYPES]
- [NUM_PARTITION_CONTEXTS]
- [PARTITION_TYPES - 1] = {
- { /* frame_type = keyframe */
- /* 8x8 -> 4x4 */
- { 158, 97, 94 } /* a/l both not split */,
- { 93, 24, 99 } /* a split, l not split */,
- { 85, 119, 44 } /* l split, a not split */,
- { 62, 59, 67 } /* a/l both split */,
- /* 16x16 -> 8x8 */
- { 149, 53, 53 } /* a/l both not split */,
- { 94, 20, 48 } /* a split, l not split */,
- { 83, 53, 24 } /* l split, a not split */,
- { 52, 18, 18 } /* a/l both split */,
- /* 32x32 -> 16x16 */
- { 150, 40, 39 } /* a/l both not split */,
- { 78, 12, 26 } /* a split, l not split */,
- { 67, 33, 11 } /* l split, a not split */,
- { 24, 7, 5 } /* a/l both split */,
- /* 64x64 -> 32x32 */
- { 174, 35, 49 } /* a/l both not split */,
- { 68, 11, 27 } /* a split, l not split */,
- { 57, 15, 9 } /* l split, a not split */,
- { 12, 3, 3 } /* a/l both split */
- }, { /* frame_type = interframe */
- /* 8x8 -> 4x4 */
- { 199, 122, 141 } /* a/l both not split */,
- { 147, 63, 159 } /* a split, l not split */,
- { 148, 133, 118 } /* l split, a not split */,
- { 121, 104, 114 } /* a/l both split */,
- /* 16x16 -> 8x8 */
- { 174, 73, 87 } /* a/l both not split */,
- { 92, 41, 83 } /* a split, l not split */,
- { 82, 99, 50 } /* l split, a not split */,
- { 53, 39, 39 } /* a/l both split */,
- /* 32x32 -> 16x16 */
- { 177, 58, 59 } /* a/l both not split */,
- { 68, 26, 63 } /* a split, l not split */,
- { 52, 79, 25 } /* l split, a not split */,
- { 17, 14, 12 } /* a/l both split */,
- /* 64x64 -> 32x32 */
- { 222, 34, 30 } /* a/l both not split */,
- { 72, 16, 44 } /* a split, l not split */,
- { 58, 32, 12 } /* l split, a not split */,
- { 10, 7, 6 } /* a/l both split */
- }
+static const vp9_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
+ { 120, 7, 76, 176, 208, 126, 28, 54, 103 }, // y = dc
+ { 48, 12, 154, 155, 139, 90, 34, 117, 119 }, // y = v
+ { 67, 6, 25, 204, 243, 158, 13, 21, 96 }, // y = h
+ { 97, 5, 44, 131, 176, 139, 48, 68, 97 }, // y = d45
+ { 83, 5, 42, 156, 111, 152, 26, 49, 152 }, // y = d135
+ { 80, 5, 58, 178, 74, 83, 33, 62, 145 }, // y = d117
+ { 86, 5, 32, 154, 192, 168, 14, 22, 163 }, // y = d153
+ { 85, 5, 32, 156, 216, 148, 19, 29, 73 }, // y = d207
+ { 77, 7, 64, 116, 132, 122, 37, 126, 120 }, // y = d63
+ { 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm
};
-const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES]
- [INTRA_MODES]
- [INTRA_MODES - 1] = {
- { /* above = dc */
- { 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */,
- { 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */,
- { 73, 32, 19, 187, 222, 215, 46, 34, 100 } /* left = h */,
- { 91, 30, 32, 116, 121, 186, 93, 86, 94 } /* left = d45 */,
- { 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */,
- { 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */,
- { 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */,
- { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d207 */,
- { 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */,
- { 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */
- }, { /* above = v */
- { 63, 36, 126, 146, 123, 158, 60, 90, 96 } /* left = dc */,
- { 43, 46, 168, 134, 107, 128, 69, 142, 92 } /* left = v */,
- { 44, 29, 68, 159, 201, 177, 50, 57, 77 } /* left = h */,
- { 58, 38, 76, 114, 97, 172, 78, 133, 92 } /* left = d45 */,
- { 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */,
- { 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */,
- { 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */,
- { 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d207 */,
- { 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */,
- { 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */
- }, { /* above = h */
- { 82, 26, 26, 171, 208, 204, 44, 32, 105 } /* left = dc */,
- { 55, 44, 68, 166, 179, 192, 57, 57, 108 } /* left = v */,
- { 42, 26, 11, 199, 241, 228, 23, 15, 85 } /* left = h */,
- { 68, 42, 19, 131, 160, 199, 55, 52, 83 } /* left = d45 */,
- { 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */,
- { 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */,
- { 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */,
- { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d207 */,
- { 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */,
- { 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */
- }, { /* above = d45 */
- { 103, 26, 36, 129, 132, 201, 83, 80, 93 } /* left = dc */,
- { 59, 38, 83, 112, 103, 162, 98, 136, 90 } /* left = v */,
- { 62, 30, 23, 158, 200, 207, 59, 57, 50 } /* left = h */,
- { 67, 30, 29, 84, 86, 191, 102, 91, 59 } /* left = d45 */,
- { 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */,
- { 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */,
- { 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */,
- { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d207 */,
- { 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */,
- { 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */
- }, { /* above = d135 */
- { 69, 23, 29, 128, 83, 199, 46, 44, 101 } /* left = dc */,
- { 53, 40, 55, 139, 69, 183, 61, 80, 110 } /* left = v */,
- { 40, 29, 19, 161, 180, 207, 43, 24, 91 } /* left = h */,
- { 60, 34, 19, 105, 61, 198, 53, 64, 89 } /* left = d45 */,
- { 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */,
- { 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */,
- { 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */,
- { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d207 */,
- { 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */,
- { 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */
- }, { /* above = d117 */
- { 64, 19, 37, 156, 66, 138, 49, 95, 133 } /* left = dc */,
- { 46, 27, 80, 150, 55, 124, 55, 121, 135 } /* left = v */,
- { 36, 23, 27, 165, 149, 166, 54, 64, 118 } /* left = h */,
- { 53, 21, 36, 131, 63, 163, 60, 109, 81 } /* left = d45 */,
- { 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */,
- { 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */,
- { 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */,
- { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d207 */,
- { 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */,
- { 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */
- }, { /* above = d153 */
- { 75, 17, 22, 136, 138, 185, 32, 34, 166 } /* left = dc */,
- { 56, 39, 58, 133, 117, 173, 48, 53, 187 } /* left = v */,
- { 35, 21, 12, 161, 212, 207, 20, 23, 145 } /* left = h */,
- { 56, 29, 19, 117, 109, 181, 55, 68, 112 } /* left = d45 */,
- { 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */,
- { 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */,
- { 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */,
- { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d207 */,
- { 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */,
- { 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */
- }, { /* above = d207 */
- { 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */,
- { 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */,
- { 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */,
- { 68, 36, 17, 106, 102, 206, 59, 74, 74 } /* left = d45 */,
- { 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */,
- { 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */,
- { 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */,
- { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d207 */,
- { 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */,
- { 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */
- }, { /* above = d63 */
- { 78, 23, 39, 111, 117, 170, 74, 124, 94 } /* left = dc */,
- { 48, 34, 86, 101, 92, 146, 78, 179, 134 } /* left = v */,
- { 47, 22, 24, 138, 187, 178, 68, 69, 59 } /* left = h */,
- { 56, 25, 33, 105, 112, 187, 95, 177, 129 } /* left = d45 */,
- { 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */,
- { 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */,
- { 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */,
- { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d207 */,
- { 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */,
- { 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */
- }, { /* above = tm */
- { 65, 70, 60, 155, 159, 199, 61, 60, 81 } /* left = dc */,
- { 44, 78, 115, 132, 119, 173, 71, 112, 93 } /* left = v */,
- { 39, 38, 21, 184, 227, 206, 42, 32, 64 } /* left = h */,
- { 58, 47, 36, 124, 137, 193, 80, 82, 78 } /* left = d45 */,
- { 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */,
- { 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */,
- { 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */,
- { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d207 */,
- { 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */,
- { 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */
- }
+const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+ [PARTITION_TYPES - 1] = {
+ // 8x8 -> 4x4
+ { 158, 97, 94 }, // a/l both not split
+ { 93, 24, 99 }, // a split, l not split
+ { 85, 119, 44 }, // l split, a not split
+ { 62, 59, 67 }, // a/l both split
+ // 16x16 -> 8x8
+ { 149, 53, 53 }, // a/l both not split
+ { 94, 20, 48 }, // a split, l not split
+ { 83, 53, 24 }, // l split, a not split
+ { 52, 18, 18 }, // a/l both split
+ // 32x32 -> 16x16
+ { 150, 40, 39 }, // a/l both not split
+ { 78, 12, 26 }, // a split, l not split
+ { 67, 33, 11 }, // l split, a not split
+ { 24, 7, 5 }, // a/l both split
+ // 64x64 -> 32x32
+ { 174, 35, 49 }, // a/l both not split
+ { 68, 11, 27 }, // a split, l not split
+ { 57, 15, 9 }, // l split, a not split
+ { 12, 3, 3 }, // a/l both split
+};
+
+static const vp9_prob default_partition_probs[PARTITION_CONTEXTS]
+ [PARTITION_TYPES - 1] = {
+ // 8x8 -> 4x4
+ { 199, 122, 141 }, // a/l both not split
+ { 147, 63, 159 }, // a split, l not split
+ { 148, 133, 118 }, // l split, a not split
+ { 121, 104, 114 }, // a/l both split
+ // 16x16 -> 8x8
+ { 174, 73, 87 }, // a/l both not split
+ { 92, 41, 83 }, // a split, l not split
+ { 82, 99, 50 }, // l split, a not split
+ { 53, 39, 39 }, // a/l both split
+ // 32x32 -> 16x16
+ { 177, 58, 59 }, // a/l both not split
+ { 68, 26, 63 }, // a split, l not split
+ { 52, 79, 25 }, // l split, a not split
+ { 17, 14, 12 }, // a/l both split
+ // 64x64 -> 32x32
+ { 222, 34, 30 }, // a/l both not split
+ { 72, 16, 44 }, // a split, l not split
+ { 58, 32, 12 }, // l split, a not split
+ { 10, 7, 6 }, // a/l both split
};
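// Note: the 16 contexts in each table above are four block-size groups
// (8x8 through 64x64 splits) times the four above/left split combinations,
// matching PARTITION_CONTEXTS == 4 * PARTITION_PLOFFSET. A minimal sketch of
// how such an index could be formed (illustrative only; the names below are
// assumptions, and the real derivation lives in the partition-context
// helpers):
static int partition_context_sketch(int size_group,  // 0..3: 8x8 .. 64x64
                                    int above_split, int left_split) {
  return size_group * PARTITION_PLOFFSET + (left_split << 1) + above_split;
}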
static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
@@ -226,7 +221,7 @@ static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
};
/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
-const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = {
+const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
-DC_PRED, 2, /* 0 = DC_NODE */
-TM_PRED, 4, /* 1 = TM_NODE */
-V_PRED, 6, /* 2 = V_NODE */
@@ -237,22 +232,20 @@ const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = {
-D63_PRED, 16, /* 7 = D63_NODE */
-D153_PRED, -D207_PRED /* 8 = D153_NODE */
};
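// Note: throughout this change, explicit "2 * N - 2" array sizes become
// TREE_SIZE(N), presumably ((N) - 1) * 2: a binary tree with N leaves has
// N - 1 internal nodes, each stored as a pair of entries. Non-positive
// entries are leaves holding the negated symbol; positive entries index the
// next pair. A minimal decoding sketch (read_bit() is a hypothetical
// stand-in for the boolean decoder):
static int read_tree_symbol_sketch(const vp9_tree_index *tree,
                                   const vp9_prob *probs) {
  vp9_tree_index i = 0;
  while ((i = tree[i + read_bit(probs[i >> 1])]) > 0)
    continue;
  return -i;  // e.g. -DC_PRED stored above decodes to DC_PRED
}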
+struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
-const vp9_tree_index vp9_inter_mode_tree[6] = {
+const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
-ZEROMV, 2,
-NEARESTMV, 4,
-NEARMV, -NEWMV
};
+struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-const vp9_tree_index vp9_partition_tree[6] = {
+const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
-PARTITION_NONE, 2,
-PARTITION_HORZ, 4,
-PARTITION_VERT, -PARTITION_SPLIT
};
-
-struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
-struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-
struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
@@ -286,7 +279,7 @@ static const struct tx_probs default_tx_probs = {
{ 66 } }
};
-void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
+void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
unsigned int (*ct_32x32p)[2]) {
ct_32x32p[0][0] = tx_count_32x32p[TX_4X4];
ct_32x32p[0][1] = tx_count_32x32p[TX_8X8] +
@@ -299,7 +292,7 @@ void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
ct_32x32p[2][1] = tx_count_32x32p[TX_32X32];
}
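// Note: this helper (and its 16x16/8x8 siblings below) collapses per-size
// counts into binary branch counts, since the transform size is coded as a
// chain of binary decisions. A worked example for 32x32-capable blocks with
// counts {TX_4X4: 10, TX_8X8: 5, TX_16X16: 3, TX_32X32: 2}:
//   node 0 (4x4 vs larger):  {10, 5 + 3 + 2} = {10, 10}
//   node 1 (8x8 vs larger):  { 5, 3 + 2}     = { 5,  5}
//   node 2 (16x16 vs 32x32): { 3, 2}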
-void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
+void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
unsigned int (*ct_16x16p)[2]) {
ct_16x16p[0][0] = tx_count_16x16p[TX_4X4];
ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16];
@@ -307,7 +300,7 @@ void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
ct_16x16p[1][1] = tx_count_16x16p[TX_16X16];
}
-void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
+void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
unsigned int (*ct_8x8p)[2]) {
ct_8x8p[0][0] = tx_count_8x8p[TX_4X4];
ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
@@ -317,8 +310,8 @@ static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = {
192, 128, 64
};
-static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTERS+1]
- [SWITCHABLE_FILTERS-1] = {
+static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS - 1] = {
{ 235, 162, },
{ 36, 255, },
{ 34, 3, },
@@ -338,7 +331,8 @@ void vp9_init_mbmode_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs);
}
-const vp9_tree_index vp9_switchable_interp_tree[SWITCHABLE_FILTERS*2-2] = {
+const vp9_tree_index vp9_switchable_interp_tree
+ [TREE_SIZE(SWITCHABLE_FILTERS)] = {
-EIGHTTAP, 2,
-EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
};
@@ -356,76 +350,58 @@ void vp9_entropy_mode_init() {
#define COUNT_SAT 20
#define MAX_UPDATE_FACTOR 128
-static int update_ct(vp9_prob pre_prob, vp9_prob prob, unsigned int ct[2]) {
- return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
-}
-
-static int update_ct2(vp9_prob pre_prob, unsigned int ct[2]) {
- return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
+static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) {
+ return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
}
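// Note: merge_probs() is assumed to blend the previous frame's probability
// with one derived from the new counts, weighted by how much data was seen.
// A sketch of the assumed behavior (get_binary_prob() and weighted_prob()
// are the helpers expected in vp9_treecoder.h):
static vp9_prob merge_probs_sketch(vp9_prob pre_prob, const unsigned int ct[2],
                                   unsigned int count_sat,
                                   unsigned int max_update_factor) {
  const vp9_prob prob = get_binary_prob(ct[0], ct[1]);  // ~ct[0]/(ct[0]+ct[1])
  const unsigned int count = MIN(ct[0] + ct[1], count_sat);
  const unsigned int factor = max_update_factor * count / count_sat;
  // linear blend: roughly (pre_prob * (256 - factor) + prob * factor) >> 8
  return weighted_prob(pre_prob, prob, factor);
}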
-static void update_mode_probs(int n_modes,
- const vp9_tree_index *tree, unsigned int *cnt,
- vp9_prob *pre_probs, vp9_prob *dst_probs,
- unsigned int tok0_offset) {
-#define MAX_PROBS 32
- vp9_prob probs[MAX_PROBS];
- unsigned int branch_ct[MAX_PROBS][2];
- int t;
-
- assert(n_modes - 1 < MAX_PROBS);
- vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset);
- for (t = 0; t < n_modes - 1; ++t)
- dst_probs[t] = update_ct(pre_probs[t], probs[t], branch_ct[t]);
+static void adapt_probs(const vp9_tree_index *tree,
+ const vp9_prob *pre_probs, const unsigned int *counts,
+ unsigned int offset, vp9_prob *probs) {
+ tree_merge_probs(tree, pre_probs, counts, offset,
+ COUNT_SAT, MAX_UPDATE_FACTOR, probs);
}
void vp9_adapt_mode_probs(VP9_COMMON *cm) {
int i, j;
FRAME_CONTEXT *fc = &cm->fc;
- FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
- FRAME_COUNTS *counts = &cm->counts;
+ const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+ const FRAME_COUNTS *counts = &cm->counts;
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i],
+ fc->intra_inter_prob[i] = adapt_prob(pre_fc->intra_inter_prob[i],
counts->intra_inter[i]);
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- fc->comp_inter_prob[i] = update_ct2(pre_fc->comp_inter_prob[i],
+ fc->comp_inter_prob[i] = adapt_prob(pre_fc->comp_inter_prob[i],
counts->comp_inter[i]);
for (i = 0; i < REF_CONTEXTS; i++)
- fc->comp_ref_prob[i] = update_ct2(pre_fc->comp_ref_prob[i],
+ fc->comp_ref_prob[i] = adapt_prob(pre_fc->comp_ref_prob[i],
counts->comp_ref[i]);
for (i = 0; i < REF_CONTEXTS; i++)
for (j = 0; j < 2; j++)
- fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j],
+ fc->single_ref_prob[i][j] = adapt_prob(pre_fc->single_ref_prob[i][j],
counts->single_ref[i][j]);
for (i = 0; i < INTER_MODE_CONTEXTS; i++)
- update_mode_probs(INTER_MODES, vp9_inter_mode_tree,
- counts->inter_mode[i], pre_fc->inter_mode_probs[i],
- fc->inter_mode_probs[i], NEARESTMV);
+ adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
+ counts->inter_mode[i], NEARESTMV, fc->inter_mode_probs[i]);
for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
- update_mode_probs(INTRA_MODES, vp9_intra_mode_tree,
- counts->y_mode[i], pre_fc->y_mode_prob[i],
- fc->y_mode_prob[i], 0);
+ adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
+ counts->y_mode[i], 0, fc->y_mode_prob[i]);
for (i = 0; i < INTRA_MODES; ++i)
- update_mode_probs(INTRA_MODES, vp9_intra_mode_tree,
- counts->uv_mode[i], pre_fc->uv_mode_prob[i],
- fc->uv_mode_prob[i], 0);
+ adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
+ counts->uv_mode[i], 0, fc->uv_mode_prob[i]);
- for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
- update_mode_probs(PARTITION_TYPES, vp9_partition_tree,
- counts->partition[i],
- pre_fc->partition_prob[INTER_FRAME][i],
- fc->partition_prob[INTER_FRAME][i], 0);
+ for (i = 0; i < PARTITION_CONTEXTS; i++)
+ adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i],
+ counts->partition[i], 0, fc->partition_prob[i]);
if (cm->mcomp_filter_type == SWITCHABLE) {
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
- update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
- counts->switchable_interp[i],
- pre_fc->switchable_interp_prob[i],
- fc->switchable_interp_prob[i], 0);
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i],
+ counts->switchable_interp[i], 0,
+ fc->switchable_interp_prob[i]);
}
if (cm->tx_mode == TX_MODE_SELECT) {
@@ -437,23 +413,23 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
for (j = 0; j < TX_SIZES - 3; ++j)
- fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j],
+ fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j],
branch_ct_8x8p[j]);
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
for (j = 0; j < TX_SIZES - 2; ++j)
- fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j],
+ fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j],
branch_ct_16x16p[j]);
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
for (j = 0; j < TX_SIZES - 1; ++j)
- fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j],
+ fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j],
branch_ct_32x32p[j]);
}
}
for (i = 0; i < MBSKIP_CONTEXTS; ++i)
- fc->mbskip_probs[i] = update_ct2(pre_fc->mbskip_probs[i],
+ fc->mbskip_probs[i] = adapt_prob(pre_fc->mbskip_probs[i],
counts->mbskip[i]);
}
diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h
index 4cf4c03..38b4199 100644
--- a/libvpx/vp9/common/vp9_entropymode.h
+++ b/libvpx/vp9/common/vp9_entropymode.h
@@ -14,10 +14,9 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_treecoder.h"
-#define SUBMVREF_COUNT 5
#define TX_SIZE_CONTEXTS 2
-#define MODE_UPDATE_PROB 252
#define SWITCHABLE_FILTERS 3 // number of switchable filters
+#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
// #define MODE_STATS
@@ -39,19 +38,20 @@ extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
[INTRA_MODES - 1];
-extern const vp9_tree_index vp9_intra_mode_tree[];
-extern const vp9_tree_index vp9_inter_mode_tree[];
+extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+ [PARTITION_TYPES - 1];
+extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
+
+extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-// probability models for partition information
-extern const vp9_tree_index vp9_partition_tree[];
+extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
extern const vp9_tree_index vp9_switchable_interp_tree
- [2 * (SWITCHABLE_FILTERS - 1)];
-
+ [TREE_SIZE(SWITCHABLE_FILTERS)];
extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
void vp9_entropy_mode_init();
@@ -62,11 +62,11 @@ void vp9_init_mbmode_probs(struct VP9Common *cm);
void vp9_adapt_mode_probs(struct VP9Common *cm);
-void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
+void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
unsigned int (*ct_32x32p)[2]);
-void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
+void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
unsigned int (*ct_16x16p)[2]);
-void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
+void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
unsigned int (*ct_8x8p)[2]);
#endif // VP9_COMMON_VP9_ENTROPYMODE_H_
diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c
index 2e973e5..b061cdb 100644
--- a/libvpx/vp9/common/vp9_entropymv.c
+++ b/libvpx/vp9/common/vp9_entropymv.c
@@ -18,14 +18,14 @@
/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
#define COMPANDED_MVREF_THRESH 8
-const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
+const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
-MV_JOINT_ZERO, 2,
-MV_JOINT_HNZVZ, 4,
-MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
};
struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
-const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
+const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
-MV_CLASS_0, 2,
-MV_CLASS_1, 4,
6, 8,
@@ -39,12 +39,12 @@ const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
};
struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
-const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = {
+const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
-0, -1,
};
struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
-const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = {
+const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)] = {
-0, 2,
-1, 4,
-2, -3
@@ -53,8 +53,8 @@ struct vp9_token vp9_mv_fp_encodings[4];
static const nmv_context default_nmv_context = {
{32, 64, 96},
- {
- { /* vert component */
+ { // NOLINT
+ { /* vert component */ // NOLINT
128, /* sign */
{224, 144, 192, 168, 192, 176, 192, 198, 198, 245}, /* class */
{216}, /* class0 */
@@ -64,7 +64,7 @@ static const nmv_context default_nmv_context = {
160, /* class0_hp bit */
128, /* hp */
},
- { /* hor component */
+ { /* hor component */ // NOLINT
128, /* sign */
{216, 128, 176, 160, 176, 176, 192, 198, 198, 208}, /* class */
{208}, /* class0 */
@@ -149,7 +149,7 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
static void inc_mv_component(int v, nmv_component_counts *comp_counts,
int incr, int usehp) {
int s, z, c, o, d, e, f;
- assert (v != 0); /* should not be zero */
+ assert(v != 0); /* should not be zero */
s = v < 0;
comp_counts->sign[s] += incr;
z = (s ? -v : v) - 1; /* magnitude - 1 */
@@ -175,77 +175,63 @@ static void inc_mv_component(int v, nmv_component_counts *comp_counts,
}
}
+void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
+ if (counts != NULL) {
+ const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
+ ++counts->joints[j];
-void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
- const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
- ++counts->joints[j];
-
- if (mv_joint_vertical(j)) {
- inc_mv_component(mv->row, &counts->comps[0], 1, 1);
- }
+ if (mv_joint_vertical(j)) {
+ inc_mv_component(mv->row, &counts->comps[0], 1, 1);
+ }
- if (mv_joint_horizontal(j)) {
- inc_mv_component(mv->col, &counts->comps[1], 1, 1);
+ if (mv_joint_horizontal(j)) {
+ inc_mv_component(mv->col, &counts->comps[1], 1, 1);
+ }
}
}
static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
- return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
+ return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
}
-static unsigned int adapt_probs(unsigned int i,
- vp9_tree tree,
- vp9_prob this_probs[],
- const vp9_prob last_probs[],
- const unsigned int num_events[]) {
-
-
- const unsigned int left = tree[i] <= 0
- ? num_events[-tree[i]]
- : adapt_probs(tree[i], tree, this_probs, last_probs, num_events);
-
- const unsigned int right = tree[i + 1] <= 0
- ? num_events[-tree[i + 1]]
- : adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events);
- const unsigned int ct[2] = { left, right };
- this_probs[i >> 1] = adapt_prob(last_probs[i >> 1], ct);
- return left + right;
+static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
+ const unsigned int *counts, vp9_prob *probs) {
+ tree_merge_probs(tree, pre_probs, counts, 0,
+ MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, probs);
}
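// Note: tree_merge_probs() is assumed to centralize the recursion deleted
// above: it sums leaf counts into per-branch {left, right} counts bottom-up
// and merges each internal node's probability into probs[i >> 1], exactly as
// adapt_prob() does for a single binary node.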
-
void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
int i, j;
- FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+ nmv_context *fc = &cm->fc.nmvc;
+ const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
+ const nmv_context_counts *counts = &cm->counts.mv;
- nmv_context *ctx = &cm->fc.nmvc;
- nmv_context *pre_ctx = &pre_fc->nmvc;
- nmv_context_counts *cts = &cm->counts.mv;
-
- adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
+ adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints,
+ fc->joints);
for (i = 0; i < 2; ++i) {
- ctx->comps[i].sign = adapt_prob(pre_ctx->comps[i].sign, cts->comps[i].sign);
- adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes,
- pre_ctx->comps[i].classes, cts->comps[i].classes);
- adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0,
- pre_ctx->comps[i].class0, cts->comps[i].class0);
+ nmv_component *comp = &fc->comps[i];
+ const nmv_component *pre_comp = &pre_fc->comps[i];
+ const nmv_component_counts *c = &counts->comps[i];
+
+ comp->sign = adapt_prob(pre_comp->sign, c->sign);
+ adapt_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
+ comp->classes);
+ adapt_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, comp->class0);
for (j = 0; j < MV_OFFSET_BITS; ++j)
- ctx->comps[i].bits[j] = adapt_prob(pre_ctx->comps[i].bits[j],
- cts->comps[i].bits[j]);
+ comp->bits[j] = adapt_prob(pre_comp->bits[j], c->bits[j]);
for (j = 0; j < CLASS0_SIZE; ++j)
- adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j],
- pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]);
+ adapt_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], c->class0_fp[j],
+ comp->class0_fp[j]);
- adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp,
- cts->comps[i].fp);
+ adapt_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
if (allow_hp) {
- ctx->comps[i].class0_hp = adapt_prob(pre_ctx->comps[i].class0_hp,
- cts->comps[i].class0_hp);
- ctx->comps[i].hp = adapt_prob(pre_ctx->comps[i].hp, cts->comps[i].hp);
+ comp->class0_hp = adapt_prob(pre_comp->class0_hp, c->class0_hp);
+ comp->hp = adapt_prob(pre_comp->hp, c->hp);
}
}
}
diff --git a/libvpx/vp9/common/vp9_entropymv.h b/libvpx/vp9/common/vp9_entropymv.h
index a10c933..d843f5b 100644
--- a/libvpx/vp9/common/vp9_entropymv.h
+++ b/libvpx/vp9/common/vp9_entropymv.h
@@ -13,7 +13,7 @@
#define VP9_COMMON_VP9_ENTROPYMV_H_
#include "vp9/common/vp9_treecoder.h"
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/common/vp9_blockd.h"
struct VP9Common;
@@ -43,9 +43,6 @@ static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) {
return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ;
}
-extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
-extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
-
/* Symbols for coding magnitude class of nonzero components */
#define MV_CLASSES 11
typedef enum {
@@ -62,9 +59,6 @@ typedef enum {
MV_CLASS_10 = 10, /* (1024,2048] integer pel */
} MV_CLASS_TYPE;
-extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
-extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
-
#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
#define CLASS0_SIZE (1 << CLASS0_BITS)
#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
@@ -73,10 +67,20 @@ extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
#define MV_MAX ((1 << MV_MAX_BITS) - 1)
#define MV_VALS ((MV_MAX << 1) + 1)
-extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
+#define MV_IN_USE_BITS 14
+#define MV_UPP ((1 << MV_IN_USE_BITS) - 1)
+#define MV_LOW (-(1 << MV_IN_USE_BITS))
+
+extern const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)];
+extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
+
+extern const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)];
+extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
+
+extern const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)];
extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
-extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2];
+extern const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)];
extern struct vp9_token vp9_mv_fp_encodings[4];
typedef struct {
@@ -108,7 +112,6 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
typedef struct {
- unsigned int mvcount[MV_VALS];
unsigned int sign[2];
unsigned int classes[MV_CLASSES];
unsigned int class0[CLASS0_SIZE];
diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h
index 1bf0742..1651b90 100644
--- a/libvpx/vp9/common/vp9_enums.h
+++ b/libvpx/vp9/common/vp9_enums.h
@@ -50,7 +50,7 @@ typedef enum PARTITION_TYPE {
} PARTITION_TYPE;
#define PARTITION_PLOFFSET 4 // number of probability models per block size
-#define NUM_PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+#define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
typedef enum {
TX_4X4 = 0, // 4x4 dct transform
@@ -76,4 +76,15 @@ typedef enum {
ADST_ADST = 3 // ADST in both directions
} TX_TYPE;
+typedef enum {
+ UNKNOWN = 0,
+ BT_601 = 1, // YUV
+ BT_709 = 2, // YUV
+ SMPTE_170 = 3, // YUV
+ SMPTE_240 = 4, // YUV
+ RESERVED_1 = 5,
+ RESERVED_2 = 6,
+ SRGB = 7 // RGB
+} COLOR_SPACE;
+
#endif // VP9_COMMON_VP9_ENUMS_H_
diff --git a/libvpx/vp9/common/vp9_filter.c b/libvpx/vp9/common/vp9_filter.c
index 4ac2bc9..79ace14 100644
--- a/libvpx/vp9/common/vp9_filter.c
+++ b/libvpx/vp9/common/vp9_filter.c
@@ -8,12 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
+
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_filter.h"
-DECLARE_ALIGNED(256, const int16_t,
- vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
+DECLARE_ALIGNED(256, const subpel_kernel,
+ vp9_bilinear_filters[SUBPEL_SHIFTS]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0 },
{ 0, 0, 0, 120, 8, 0, 0, 0 },
{ 0, 0, 0, 112, 16, 0, 0, 0 },
@@ -33,8 +35,8 @@ DECLARE_ALIGNED(256, const int16_t,
};
// Lagrangian interpolation filter
-DECLARE_ALIGNED(256, const int16_t,
- vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
+DECLARE_ALIGNED(256, const subpel_kernel,
+ vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0},
{ 0, 1, -5, 126, 8, -3, 1, 0},
{ -1, 3, -10, 122, 18, -6, 2, 0},
@@ -54,8 +56,8 @@ DECLARE_ALIGNED(256, const int16_t,
};
// DCT based filter
-DECLARE_ALIGNED(256, const int16_t,
- vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
+DECLARE_ALIGNED(256, const subpel_kernel,
+ vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
{0, 0, 0, 128, 0, 0, 0, 0},
{-1, 3, -7, 127, 8, -3, 1, 0},
{-2, 5, -13, 125, 17, -6, 3, -1},
@@ -75,8 +77,8 @@ DECLARE_ALIGNED(256, const int16_t,
};
// freqmultiplier = 0.5
-DECLARE_ALIGNED(256, const int16_t,
- vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS]) = {
+DECLARE_ALIGNED(256, const subpel_kernel,
+ vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0},
{-3, -1, 32, 64, 38, 1, -3, 0},
{-2, -2, 29, 63, 41, 2, -3, 0},
@@ -94,3 +96,16 @@ DECLARE_ALIGNED(256, const int16_t,
{ 0, -3, 2, 41, 63, 29, -2, -2},
{ 0, -3, 1, 38, 64, 32, -1, -3}
};
+
+
+static const subpel_kernel* vp9_filter_kernels[4] = {
+ vp9_sub_pel_filters_8,
+ vp9_sub_pel_filters_8lp,
+ vp9_sub_pel_filters_8s,
+ vp9_bilinear_filters
+};
+
+const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type) {
+ return vp9_filter_kernels[type];
+}
+
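// Note: the table order above is assumed to match the INTERPOLATION_TYPE
// enum added in vp9_filter.h (EIGHTTAP -> vp9_sub_pel_filters_8,
// EIGHTTAP_SMOOTH -> the low-pass set, EIGHTTAP_SHARP -> the DCT-based set,
// BILINEAR last). A usage sketch, with subpel_x a hypothetical sub-pel
// offset:
//   const subpel_kernel *kernel = vp9_get_filter_kernel(EIGHTTAP_SHARP);
//   const int16_t *taps = kernel[subpel_x & SUBPEL_MASK];  // one 8-tap row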
diff --git a/libvpx/vp9/common/vp9_filter.h b/libvpx/vp9/common/vp9_filter.h
index 7b1ffae..b1e7e64 100644
--- a/libvpx/vp9/common/vp9_filter.h
+++ b/libvpx/vp9/common/vp9_filter.h
@@ -11,19 +11,37 @@
#ifndef VP9_COMMON_VP9_FILTER_H_
#define VP9_COMMON_VP9_FILTER_H_
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#define FILTER_BITS 7
+
#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
#define SUBPEL_TAPS 8
-extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS];
-extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][SUBPEL_TAPS];
-extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS];
-extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS];
-extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS];
+typedef enum {
+ EIGHTTAP = 0,
+ EIGHTTAP_SMOOTH = 1,
+ EIGHTTAP_SHARP = 2,
+ BILINEAR = 3,
+ SWITCHABLE = 4 /* should be the last one */
+} INTERPOLATION_TYPE;
+
+typedef int16_t subpel_kernel[SUBPEL_TAPS];
+
+struct subpix_fn_table {
+ const subpel_kernel *filter_x;
+ const subpel_kernel *filter_y;
+};
+
+const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type);
+
+extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS];
+extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS];
// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
// filter kernel as a 2 tap filter.
diff --git a/libvpx/vp9/common/vp9_findnearmv.c b/libvpx/vp9/common/vp9_findnearmv.c
index 49a731f..b91c501 100644
--- a/libvpx/vp9/common/vp9_findnearmv.c
+++ b/libvpx/vp9/common/vp9_findnearmv.c
@@ -22,14 +22,12 @@ static void lower_mv_precision(MV *mv, int allow_hp) {
}
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
- int_mv *mvlist,
- int_mv *nearest,
- int_mv *near) {
+void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
+ int_mv *mvlist, int_mv *nearest, int_mv *near) {
int i;
// Make sure all the candidates are properly clamped, etc.
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
- lower_mv_precision(&mvlist[i].as_mv, xd->allow_high_precision_mv);
+ lower_mv_precision(&mvlist[i].as_mv, allow_hp);
clamp_mv2(&mvlist[i].as_mv, xd);
}
*nearest = mvlist[0];
@@ -37,27 +35,28 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
}
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *const tile,
int_mv *dst_nearest,
int_mv *dst_near,
int block_idx, int ref_idx,
int mi_row, int mi_col) {
int_mv dst_list[MAX_MV_REF_CANDIDATES];
int_mv mv_list[MAX_MV_REF_CANDIDATES];
- MODE_INFO *const mi = xd->this_mi;
+ MODE_INFO *const mi = xd->mi_8x8[0];
assert(ref_idx == 0 || ref_idx == 1);
assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier
- vp9_find_mv_refs_idx(cm, xd, mi, xd->last_mi,
+ vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi,
mi->mbmi.ref_frame[ref_idx],
mv_list, block_idx, mi_row, mi_col);
dst_list[1].as_int = 0;
if (block_idx == 0) {
- memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv));
+ vpx_memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv));
} else if (block_idx == 1 || block_idx == 2) {
int dst = 0, n;
- union b_mode_info *bmi = mi->bmi;
+ b_mode_info *bmi = mi->bmi;
dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int;
for (n = 0; dst < MAX_MV_REF_CANDIDATES &&
@@ -66,7 +65,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
dst_list[dst++].as_int = mv_list[n].as_int;
} else {
int dst = 0, n;
- union b_mode_info *bmi = mi->bmi;
+ b_mode_info *bmi = mi->bmi;
assert(block_idx == 3);
dst_list[dst++].as_int = bmi[2].as_mv[ref_idx].as_int;
diff --git a/libvpx/vp9/common/vp9_findnearmv.h b/libvpx/vp9/common/vp9_findnearmv.h
index ad0d882..2362caa 100644
--- a/libvpx/vp9/common/vp9_findnearmv.h
+++ b/libvpx/vp9/common/vp9_findnearmv.h
@@ -23,10 +23,8 @@
// check a list of motion vectors by sad score, using a number of rows of
// pixels above and a number of cols of pixels to the left, to select the one
// with the best score to use as the ref motion vector
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
- int_mv *mvlist,
- int_mv *nearest,
- int_mv *near);
+void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
+ int_mv *mvlist, int_mv *nearest, int_mv *near);
// TODO(jingning): this mv clamping function should be block size dependent.
static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
@@ -36,57 +34,39 @@ static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}
-void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm,
- MACROBLOCKD *xd,
+void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *const tile,
int_mv *dst_nearest,
int_mv *dst_near,
int block_idx, int ref_idx,
int mi_row, int mi_col);
-static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb,
- const MODE_INFO *left_mb, int b) {
- // FIXME(rbultje, jingning): temporary hack because jenkins doesn't
- // understand this condition. This will go away soon.
- const MODE_INFO *mi = cur_mb;
-
+static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mi,
+ const MODE_INFO *left_mi, int b) {
if (b == 0 || b == 2) {
- /* On L edge, get from MB to left of us */
- mi = left_mb;
- if (!mi)
+ if (!left_mi || is_inter_block(&left_mi->mbmi))
return DC_PRED;
- if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
- return DC_PRED;
- } else if (mi->mbmi.sb_type < BLOCK_8X8) {
- return ((mi->bmi + 1 + b)->as_mode);
- } else {
- return mi->mbmi.mode;
- }
+ return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode
+ : left_mi->mbmi.mode;
+ } else {
+ assert(b == 1 || b == 3);
+ return cur_mi->bmi[b - 1].as_mode;
}
- assert(b == 1 || b == 3);
- return (mi->bmi + b - 1)->as_mode;
}
-static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
- const MODE_INFO *above_mb, int b) {
- const MODE_INFO *mi = cur_mb;
-
- if (!(b >> 1)) {
- /* On top edge, get from MB above us */
- mi = above_mb;
- if (!mi)
+static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mi,
+ const MODE_INFO *above_mi, int b) {
+ if (b == 0 || b == 1) {
+ if (!above_mi || is_inter_block(&above_mi->mbmi))
return DC_PRED;
- if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
- return DC_PRED;
- } else if (mi->mbmi.sb_type < BLOCK_8X8) {
- return ((mi->bmi + 2 + b)->as_mode);
- } else {
- return mi->mbmi.mode;
- }
+ return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode
+ : above_mi->mbmi.mode;
+ } else {
+ assert(b == 2 || b == 3);
+ return cur_mi->bmi[b - 2].as_mode;
}
-
- return (mi->bmi + b - 2)->as_mode;
}
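// Note: the rewritten helpers rely on the raster layout of the four 4x4
// sub-blocks inside an 8x8 block:
//   +----+----+
//   | b0 | b1 |
//   +----+----+
//   | b2 | b3 |
//   +----+----+
// b == 0 or 2 lie on the left edge, so left_block_mode() reads the right
// column of left_mi (bmi[b + 1]); b == 0 or 1 lie on the top edge, so
// above_block_mode() reads the bottom row of above_mi (bmi[b + 2]).
// Interior sub-blocks read their neighbour within cur_mi.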
#endif // VP9_COMMON_VP9_FINDNEARMV_H_
diff --git a/libvpx/vp9/common/vp9_idct.c b/libvpx/vp9/common/vp9_idct.c
index a224525..ea8683e 100644
--- a/libvpx/vp9/common/vp9_idct.c
+++ b/libvpx/vp9/common/vp9_idct.c
@@ -18,20 +18,20 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
-void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
0.5 shifts per pixel. */
int i;
int16_t output[16];
int a1, b1, c1, d1, e1;
- int16_t *ip = input;
+ const int16_t *ip = input;
int16_t *op = output;
for (i = 0; i < 4; i++) {
- a1 = ip[0] >> WHT_UPSCALE_FACTOR;
- c1 = ip[1] >> WHT_UPSCALE_FACTOR;
- d1 = ip[2] >> WHT_UPSCALE_FACTOR;
- b1 = ip[3] >> WHT_UPSCALE_FACTOR;
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ c1 = ip[1] >> UNIT_QUANT_SHIFT;
+ d1 = ip[2] >> UNIT_QUANT_SHIFT;
+ b1 = ip[3] >> UNIT_QUANT_SHIFT;
a1 += c1;
d1 -= b1;
e1 = (a1 - d1) >> 1;
@@ -60,24 +60,24 @@ void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
c1 = e1 - c1;
a1 -= b1;
d1 += c1;
- dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
- dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + b1);
- dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + c1);
- dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + d1);
+ dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
+ dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
+ dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
+ dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
ip++;
dest++;
}
}
-void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
int i;
int a1, e1;
int16_t tmp[4];
- int16_t *ip = in;
+ const int16_t *ip = in;
int16_t *op = tmp;
- a1 = ip[0] >> WHT_UPSCALE_FACTOR;
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
e1 = a1 >> 1;
a1 -= e1;
op[0] = a1;
@@ -96,7 +96,7 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
}
}
-void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
+static void idct4_1d(const int16_t *input, int16_t *output) {
int16_t step[4];
int temp1, temp2;
// stage 1
@@ -116,7 +116,7 @@ void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
output[3] = step[0] - step[3];
}
-void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[4 * 4];
int16_t *outptr = out;
int i, j;
@@ -124,7 +124,7 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
// Rows
for (i = 0; i < 4; ++i) {
- vp9_idct4_1d(input, outptr);
+ idct4_1d(input, outptr);
input += 4;
outptr += 4;
}
@@ -133,14 +133,14 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
- vp9_idct4_1d(temp_in, temp_out);
+ idct4_1d(temp_in, temp_out);
for (j = 0; j < 4; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * stride + i]);
}
}
-void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
int i;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -156,7 +156,7 @@ void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
}
}
-static void idct8_1d(int16_t *input, int16_t *output) {
+static void idct8_1d(const int16_t *input, int16_t *output) {
int16_t step1[8], step2[8];
int temp1, temp2;
// stage 1
@@ -174,7 +174,7 @@ static void idct8_1d(int16_t *input, int16_t *output) {
step1[6] = dct_const_round_shift(temp2);
// stage 2 & stage 3 - even half
- vp9_idct4_1d(step1, step1);
+ idct4_1d(step1, step1);
// stage 2 - odd half
step2[4] = step1[4] + step1[5];
@@ -201,7 +201,7 @@ static void idct8_1d(int16_t *input, int16_t *output) {
output[7] = step1[0] - step1[7];
}
-void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[8 * 8];
int16_t *outptr = out;
int i, j;
@@ -220,12 +220,12 @@ void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * stride + i]);
}
}
-void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
int i, j;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -234,11 +234,11 @@ void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i)
dest[i] = clip_pixel(dest[i] + a1);
- dest += dest_stride;
+ dest += stride;
}
}
-static void iadst4_1d(int16_t *input, int16_t *output) {
+static void iadst4_1d(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
int x0 = input[0];
@@ -280,13 +280,13 @@ static void iadst4_1d(int16_t *input, int16_t *output) {
output[3] = dct_const_round_shift(s3);
}
-void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
- int tx_type) {
+void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
const transform_2d IHT_4[] = {
- { vp9_idct4_1d, vp9_idct4_1d }, // DCT_DCT = 0
- { iadst4_1d, vp9_idct4_1d }, // ADST_DCT = 1
- { vp9_idct4_1d, iadst4_1d }, // DCT_ADST = 2
- { iadst4_1d, iadst4_1d } // ADST_ADST = 3
+ { idct4_1d, idct4_1d }, // DCT_DCT = 0
+ { iadst4_1d, idct4_1d }, // ADST_DCT = 1
+ { idct4_1d, iadst4_1d }, // DCT_ADST = 2
+ { iadst4_1d, iadst4_1d } // ADST_ADST = 3
};
int i, j;
@@ -307,11 +307,11 @@ void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
temp_in[j] = out[j * 4 + i];
IHT_4[tx_type].cols(temp_in, temp_out);
for (j = 0; j < 4; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * stride + i]);
}
}
-static void iadst8_1d(int16_t *input, int16_t *output) {
+static void iadst8_1d(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
int x0 = input[7];
@@ -395,8 +395,8 @@ static const transform_2d IHT_8[] = {
{ iadst8_1d, iadst8_1d } // ADST_ADST = 3
};
-void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
- int tx_type) {
+void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
int i, j;
int16_t out[8 * 8];
int16_t *outptr = out;
@@ -416,12 +416,12 @@ void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
temp_in[j] = out[j * 8 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
- + dest[j * dest_stride + i]); }
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * stride + i]);
+ }
}
-void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[8 * 8] = { 0 };
int16_t *outptr = out;
int i, j;
@@ -441,12 +441,12 @@ void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * stride + i]);
}
}
-static void idct16_1d(int16_t *input, int16_t *output) {
+static void idct16_1d(const int16_t *input, int16_t *output) {
int16_t step1[16], step2[16];
int temp1, temp2;
@@ -611,7 +611,7 @@ static void idct16_1d(int16_t *input, int16_t *output) {
output[15] = step2[0] - step2[15];
}
-void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[16 * 16];
int16_t *outptr = out;
int i, j;
@@ -630,12 +630,12 @@ void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
temp_in[j] = out[j * 16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
}
}
-void iadst16_1d(int16_t *input, int16_t *output) {
+static void iadst16_1d(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
int x0 = input[15];
@@ -813,8 +813,8 @@ static const transform_2d IHT_16[] = {
{ iadst16_1d, iadst16_1d } // ADST_ADST = 3
};
-void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
- int tx_type) {
+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
int i, j;
int16_t out[16 * 16];
int16_t *outptr = out;
@@ -834,12 +834,11 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
temp_in[j] = out[j * 16 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]); }
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]); }
}
-void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[16 * 16] = { 0 };
int16_t *outptr = out;
int i, j;
@@ -859,13 +858,12 @@ void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
temp_in[j] = out[j*16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
}
}
-void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
int i, j;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -874,11 +872,11 @@ void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest,
for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i)
dest[i] = clip_pixel(dest[i] + a1);
- dest += dest_stride;
+ dest += stride;
}
}
-static void idct32_1d(int16_t *input, int16_t *output) {
+static void idct32_1d(const int16_t *input, int16_t *output) {
int16_t step1[32], step2[32];
int temp1, temp2;
@@ -1245,7 +1243,7 @@ static void idct32_1d(int16_t *input, int16_t *output) {
output[31] = step1[0] - step1[31];
}
-void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[32 * 32];
int16_t *outptr = out;
int i, j;
@@ -1253,6 +1251,44 @@ void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
// Rows
for (i = 0; i < 32; ++i) {
+ int16_t zero_coeff[16];
+ for (j = 0; j < 16; ++j)
+ zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+ for (j = 0; j < 8; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 4; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 2; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+ if (zero_coeff[0] | zero_coeff[1])
+ idct32_1d(input, outptr);
+ else
+ vpx_memset(outptr, 0, sizeof(int16_t) * 32);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ idct32_1d(temp_in, temp_out);
+ for (j = 0; j < 32; ++j)
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
+ }
+}
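// Note: the zero_coeff passes above are a binary OR-reduction of each row's
// 32 coefficients (16, 8, 4, then 2 partial ORs), so zero_coeff[0] |
// zero_coeff[1] is non-zero iff the row has any non-zero coefficient;
// all-zero rows skip the 1-D transform and are memset instead. An
// equivalent, more direct sketch:
//   int any_nonzero = 0;
//   for (j = 0; j < 32; ++j)
//     any_nonzero |= input[j];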
+
+void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
+ int16_t out[32 * 32] = {0};
+ int16_t *outptr = out;
+ int i, j;
+ int16_t temp_in[32], temp_out[32];
+
+ // Rows
+ // only upper-left 8x8 has non-zero coeff
+ for (i = 0; i < 8; ++i) {
idct32_1d(input, outptr);
input += 32;
outptr += 32;
@@ -1264,13 +1300,116 @@ void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
temp_in[j] = out[j * 32 + i];
idct32_1d(temp_in, temp_out);
for (j = 0; j < 32; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
}
}
-void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+ int i, j;
+ int a1;
+
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
- output[0] = ROUND_POWER_OF_TWO(out, 6);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ for (j = 0; j < 32; ++j) {
+ for (i = 0; i < 32; ++i)
+ dest[i] = clip_pixel(dest[i] + a1);
+ dest += stride;
+ }
+}
+
+// idct
+void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+ if (eob > 1)
+ vp9_idct4x4_16_add(input, dest, stride);
+ else
+ vp9_idct4x4_1_add(input, dest, stride);
+}
+
+
+void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+ if (eob > 1)
+ vp9_iwht4x4_16_add(input, dest, stride);
+ else
+ vp9_iwht4x4_1_add(input, dest, stride);
+}
+
+void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+ // If dc is 1, then input[0] is the reconstructed value and does not need
+ // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >= 1.
+
+ // The calculation can be simplified if there are not many non-zero dct
+ // coefficients. Use eobs to decide what to do.
+ // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+ // Combine that with code here.
+ if (eob) {
+ if (eob == 1)
+ // DC only DCT coefficient
+ vp9_idct8x8_1_add(input, dest, stride);
+ else if (eob <= 10)
+ vp9_idct8x8_10_add(input, dest, stride);
+ else
+ vp9_idct8x8_64_add(input, dest, stride);
+ }
+}
+
+void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+ int eob) {
+ /* The calculation can be simplified if there are not many non-zero dct
+ * coefficients. Use eobs to separate different cases. */
+ if (eob) {
+ if (eob == 1)
+ /* DC only DCT coefficient. */
+ vp9_idct16x16_1_add(input, dest, stride);
+ else if (eob <= 10)
+ vp9_idct16x16_10_add(input, dest, stride);
+ else
+ vp9_idct16x16_256_add(input, dest, stride);
+ }
+}
+
+void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+ int eob) {
+ if (eob) {
+ if (eob == 1)
+ vp9_idct32x32_1_add(input, dest, stride);
+ else if (eob <= 34)
+ // non-zero coeff only in upper-left 8x8
+ vp9_idct32x32_34_add(input, dest, stride);
+ else
+ vp9_idct32x32_1024_add(input, dest, stride);
+ }
+}
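// Note: these wrappers dispatch on the end-of-block index from the
// coefficient decode: eob == 1 is a DC-only block, and the eob <= 10 /
// eob <= 34 cutoffs presumably bound how far into the scan order non-zero
// coefficients can reach (per the comment above, the first 34 entries of
// the 32x32 scan fall inside the upper-left 8x8). A hypothetical call site,
// with dqcoeff/dst/dst_stride/eob assumed to come from the decoder:
//   vp9_idct32x32_add(dqcoeff, dst, dst_stride, eob);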
+
+// iht
+void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob) {
+ if (tx_type == DCT_DCT)
+ vp9_idct4x4_add(input, dest, stride, eob);
+ else
+ vp9_iht4x4_16_add(input, dest, stride, tx_type);
+}
+
+void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob) {
+ if (tx_type == DCT_DCT) {
+ vp9_idct8x8_add(input, dest, stride, eob);
+ } else {
+ if (eob > 0) {
+ vp9_iht8x8_64_add(input, dest, stride, tx_type);
+ }
+ }
+}
+
+void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob) {
+ if (tx_type == DCT_DCT) {
+ vp9_idct16x16_add(input, dest, stride, eob);
+ } else {
+ if (eob > 0) {
+ vp9_iht16x16_256_add(input, dest, stride, tx_type);
+ }
+ }
}
diff --git a/libvpx/vp9/common/vp9_idct.h b/libvpx/vp9/common/vp9_idct.h
index 0c47da6..2b3f35f 100644
--- a/libvpx/vp9/common/vp9_idct.h
+++ b/libvpx/vp9/common/vp9_idct.h
@@ -16,16 +16,18 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
// Constants and Macros used by all idct/dct functions
#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
-#define WHT_UPSCALE_FACTOR 2
+#define UNIT_QUANT_SHIFT 2
+#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
#define pair_set_epi16(a, b) \
- _mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16))
+ _mm_set_epi16(b, a, b, a, b, a, b, a)
#define pair_set_epi32(a, b) \
_mm_set_epi32(b, a, b, a)
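// Note: both pair_set_epi16 forms pack {a, b} pairs across the register
// (e.g. pair_set_epi16(1, 2) yields 16-bit lanes {1, 2, 1, 2, ...}), but the
// old expression could hit signed-overflow undefined behavior when b has its
// top bit set, since (uint16_t)(b) promotes to int before the << 16.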
@@ -79,10 +81,27 @@ static INLINE int dct_const_round_shift(int input) {
return rv;
}
-typedef void (*transform_1d)(int16_t*, int16_t*);
+typedef void (*transform_1d)(const int16_t*, int16_t*);
typedef struct {
transform_1d cols, rows; // vertical and horizontal
} transform_2d;
+void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+
+void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+ int eob);
+void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+ int eob);
+
+void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob);
+void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob);
+void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob);
+
+
#endif // VP9_COMMON_VP9_IDCT_H_
diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c
index cfb5cd4..218e12e 100644
--- a/libvpx/vp9/common/vp9_loopfilter.c
+++ b/libvpx/vp9/common/vp9_loopfilter.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_reconinter.h"
@@ -16,12 +16,6 @@
#include "vp9/common/vp9_seg_common.h"
-struct loop_filter_info {
- const uint8_t *mblim;
- const uint8_t *lim;
- const uint8_t *hev_thr;
-};
-
// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
// Each 1 bit represents a position in which we want to apply the loop filter.
// Left_ entries refer to whether we apply a filter on the border to the
@@ -259,8 +253,8 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
if (block_inside_limit < 1)
block_inside_limit = 1;
- vpx_memset(lfi->lim[lvl], block_inside_limit, SIMD_WIDTH);
- vpx_memset(lfi->mblim[lvl], (2 * (lvl + 2) + block_inside_limit),
+ vpx_memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
+ vpx_memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
SIMD_WIDTH);
}
}
@@ -268,7 +262,7 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
void vp9_loop_filter_init(VP9_COMMON *cm) {
loop_filter_info_n *lfi = &cm->lf_info;
struct loopfilter *lf = &cm->lf;
- int i;
+ int lvl;
// init limits for given sharpness
update_sharpness(lfi, lf->sharpness_level);
@@ -278,8 +272,8 @@ void vp9_loop_filter_init(VP9_COMMON *cm) {
lf_init_lut(lfi);
// init hev threshold const vectors
- for (i = 0; i < 4; i++)
- vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+ for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
+ vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
}
void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
@@ -316,13 +310,13 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
continue;
}
- intra_lvl = lvl_seg + (lf->ref_deltas[INTRA_FRAME] << n_shift);
+ intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * (1 << n_shift);
lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref)
for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
- const int inter_lvl = lvl_seg + (lf->ref_deltas[ref] << n_shift)
- + (lf->mode_deltas[mode] << n_shift);
+ const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * (1 << n_shift)
+ + lf->mode_deltas[mode] * (1 << n_shift);
lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
}
}
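// Note: ref_deltas[] and mode_deltas[] are signed adjustments, and
// left-shifting a negative value is undefined behavior in C; multiplying by
// (1 << n_shift), as above, performs the same scaling safely, e.g.
//   -2 * (1 << 1) == -4, whereas -2 << 1 is formally undefined.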
@@ -330,16 +324,14 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
static int build_lfi(const loop_filter_info_n *lfi_n,
const MB_MODE_INFO *mbmi,
- struct loop_filter_info *lfi) {
+ const loop_filter_thresh **lfi) {
const int seg = mbmi->segment_id;
const int ref = mbmi->ref_frame[0];
const int mode = lfi_n->mode_lf_lut[mbmi->mode];
const int filter_level = lfi_n->lvl[seg][ref][mode];
if (filter_level > 0) {
- lfi->mblim = lfi_n->mblim[filter_level];
- lfi->lim = lfi_n->lim[filter_level];
- lfi->hev_thr = lfi_n->hev_thr[filter_level >> 4];
+ *lfi = &lfi_n->lfthr[filter_level];
return 1;
} else {
return 0;
@@ -351,11 +343,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
unsigned int mask_8x8,
unsigned int mask_4x4,
unsigned int mask_4x4_int,
- const struct loop_filter_info *lfi) {
+ const loop_filter_thresh **p_lfi) {
unsigned int mask;
for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
mask; mask >>= 1) {
+ const loop_filter_thresh *lfi = *p_lfi;
+
if (mask & 1) {
if (mask_16x16 & 1) {
vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
@@ -379,7 +373,7 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, 1);
s += 8;
- lfi++;
+ p_lfi++;
mask_16x16 >>= 1;
mask_8x8 >>= 1;
mask_4x4 >>= 1;
@@ -393,12 +387,14 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
unsigned int mask_4x4,
unsigned int mask_4x4_int,
int only_4x4_1,
- const struct loop_filter_info *lfi) {
+ const loop_filter_thresh **p_lfi) {
unsigned int mask;
int count;
for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
mask; mask >>= count) {
+ const loop_filter_thresh *lfi = *p_lfi;
+
count = 1;
if (mask & 1) {
if (!only_4x4_1) {
@@ -432,7 +428,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
lfi->lim, lfi->hev_thr, 1);
}
s += 8 * count;
- lfi += count;
+ p_lfi += count;
mask_16x16 >>= count;
mask_8x8 >>= count;
mask_4x4 >>= count;
@@ -805,7 +801,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
- struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+ const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
int r, c;
for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
@@ -834,7 +830,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
// Filter level can vary per MI
- if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
+ if (!build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]))
continue;
// Build masks based on the transform size of each block
@@ -925,7 +921,7 @@ static void filter_block_plane(VP9_COMMON *const cm,
struct buf_2d *const dst = &plane->dst;
uint8_t* const dst0 = dst->buf;
unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
- struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+ const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
int r, c;
int row_shift = 3 - ss_x;
int row_mask = 0xff >> (ss_x << 2);
@@ -938,8 +934,8 @@ static void filter_block_plane(VP9_COMMON *const cm,
// Determine the vertical edges that need filtering
for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
const MODE_INFO *mi = mi_8x8[c];
- if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
- continue;
+
+ build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]);
}
if (!plane->plane_type) {
mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y);
diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h
index 91d40ac..62389ea 100644
--- a/libvpx/vp9/common/vp9_loopfilter.h
+++ b/libvpx/vp9/common/vp9_loopfilter.h
@@ -12,7 +12,7 @@
#define VP9_COMMON_VP9_LOOPFILTER_H_
#include "vpx_ports/mem.h"
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_seg_common.h"
@@ -46,12 +46,13 @@ struct loopfilter {
// Need to align this structure so when it is declared and
// passed it can be loaded into vector registers.
typedef struct {
- DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
- mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
- DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
- lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
- DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
- hev_thr[4][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]);
+} loop_filter_thresh;
+
+typedef struct {
+ loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
uint8_t mode_lf_lut[MB_MODE_COUNT];
} loop_filter_info_n;
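// Note: the thresholds move from three parallel per-level arrays to one
// array of loop_filter_thresh structs, so a single filter level selects all
// three SIMD-aligned vectors at once (see build_lfi() above). Usage sketch,
// with lf_info an assumed, populated loop_filter_info_n:
//   const loop_filter_thresh *t = &lf_info.lfthr[filter_level];
//   // t->mblim, t->lim, t->hev_thr replace three separate lookups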
diff --git a/libvpx/vp9/common/vp9_loopfilter_filters.c b/libvpx/vp9/common/vp9_loopfilter_filters.c
index 88130d8..2c4bf6c 100644
--- a/libvpx/vp9/common/vp9_loopfilter_filters.c
+++ b/libvpx/vp9/common/vp9_loopfilter_filters.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c
index bfeeb57..8df8aec 100644
--- a/libvpx/vp9/common/vp9_mvref_common.c
+++ b/libvpx/vp9/common/vp9_mvref_common.c
@@ -108,7 +108,7 @@ static const int idx_n_column_to_subblock[4][2] = {
};
// clamp_mv_ref
-#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
+#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
@@ -119,10 +119,9 @@ static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
// This function returns either the appropriate sub block or block's mv
// depending on whether the block_size < 8x8 and check_sub_blocks is set.
-static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate,
- int check_sub_blocks, int which_mv,
+static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
int search_col, int block_idx) {
- return check_sub_blocks && candidate->mbmi.sb_type < BLOCK_8X8
+ return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
.as_mv[which_mv]
: candidate->mbmi.mv[which_mv];
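
The removed check_sub_blocks flag is folded into block_idx: a negative index is now the sentinel for a whole-block search, which is what the vp9_find_mv_refs wrapper further down passes (-1). A hedged toy model of the convention:

    typedef struct { int row, col; } toy_mv;

    typedef struct {
      toy_mv block_mv;   /* block-level motion vector */
      toy_mv sub_mv[4];  /* per-subblock vectors for sub-8x8 modes */
      int is_sub8x8;     /* stands in for sb_type < BLOCK_8X8 */
    } toy_candidate;

    /* block_idx >= 0: a sub-8x8 candidate may contribute a subblock mv;
     * block_idx < 0 (the old check_sub_blocks == 0): always the block mv. */
    static toy_mv toy_get_sub_block_mv(const toy_candidate *c, int block_idx) {
      return (block_idx >= 0 && c->is_sub8x8) ? c->sub_mv[block_idx & 3]
                                              : c->block_mv;
    }
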
@@ -171,17 +170,19 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
// Checks that the given mi_row, mi_col and search point
// are inside the borders of the tile.
-static INLINE int is_inside(const VP9_COMMON *cm, int mi_col, int mi_row,
+static INLINE int is_inside(const TileInfo *const tile,
+ int mi_col, int mi_row, int mi_rows,
const MV *mv) {
return !(mi_row + mv->row < 0 ||
- mi_col + mv->col < cm->cur_tile_mi_col_start ||
- mi_row + mv->row >= cm->mi_rows ||
- mi_col + mv->col >= cm->cur_tile_mi_col_end);
+ mi_col + mv->col < tile->mi_col_start ||
+ mi_row + mv->row >= mi_rows ||
+ mi_col + mv->col >= tile->mi_col_end);
}
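
Note that only the column range is tile-relative in this check; the row bound is still the frame-wide mi_rows, now passed explicitly so the predicate no longer needs the whole VP9_COMMON. A self-contained sketch:

    typedef struct { int mi_col_start, mi_col_end; } toy_tile;
    typedef struct { int row, col; } toy_mv;

    /* For a tile covering mi columns [8, 16), a candidate one column to the
     * left of mi_col == 8 is rejected even though it is inside the frame. */
    static int toy_is_inside(const toy_tile *tile, int mi_col, int mi_row,
                             int mi_rows, const toy_mv *mv) {
      return !(mi_row + mv->row < 0 ||
               mi_col + mv->col < tile->mi_col_start ||
               mi_row + mv->row >= mi_rows ||
               mi_col + mv->col >= tile->mi_col_end);
    }
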
// This function searches the neighbourhood of a given MB/SB
// to try and find candidate reference vectors.
void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+ const TileInfo *const tile,
MODE_INFO *mi, const MODE_INFO *prev_mi,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
@@ -202,8 +203,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
// and we also need to keep a mode count.
for (i = 0; i < 2; ++i) {
const MV *const mv_ref = &mv_ref_search[i];
- if (is_inside(cm, mi_col, mi_row, mv_ref)) {
- const int check_sub_blocks = block_idx >= 0;
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MODE_INFO *const candidate_mi = xd->mi_8x8[mv_ref->col + mv_ref->row
* xd->mode_info_stride];
const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
@@ -212,13 +212,13 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
// Check if the candidate comes from the same reference frame.
if (candidate->ref_frame[0] == ref_frame) {
- ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, check_sub_blocks, 0,
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0,
mv_ref->col, block_idx));
different_ref_found = candidate->ref_frame[1] != ref_frame;
} else {
if (candidate->ref_frame[1] == ref_frame)
// Add second motion vector if it has the same ref_frame.
- ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, check_sub_blocks, 1,
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1,
mv_ref->col, block_idx));
different_ref_found = 1;
}
@@ -230,7 +230,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
// mode counts.
for (; i < MVREF_NEIGHBOURS; ++i) {
const MV *const mv_ref = &mv_ref_search[i];
- if (is_inside(cm, mi_col, mi_row, mv_ref)) {
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col +
mv_ref->row
* xd->mode_info_stride]->mbmi;
@@ -260,7 +260,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
if (different_ref_found) {
for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
const MV *mv_ref = &mv_ref_search[i];
- if (is_inside(cm, mi_col, mi_row, mv_ref)) {
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col +
mv_ref->row
* xd->mode_info_stride]->mbmi;
diff --git a/libvpx/vp9/common/vp9_mvref_common.h b/libvpx/vp9/common/vp9_mvref_common.h
index 39ebdb0..ce4c559 100644
--- a/libvpx/vp9/common/vp9_mvref_common.h
+++ b/libvpx/vp9/common/vp9_mvref_common.h
@@ -15,6 +15,7 @@
#define VP9_COMMON_VP9_MVREF_COMMON_H_
void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+ const TileInfo *const tile,
MODE_INFO *mi, const MODE_INFO *prev_mi,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
@@ -22,11 +23,12 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
int mi_row, int mi_col);
static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+ const TileInfo *const tile,
MODE_INFO *mi, const MODE_INFO *prev_mi,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int mi_row, int mi_col) {
- vp9_find_mv_refs_idx(cm, xd, mi, prev_mi, ref_frame,
+ vp9_find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame,
mv_ref_list, -1, mi_row, mi_col);
}
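
Every caller now has to thread a TileInfo through. A hedged sketch of a call site; vp9_tile_init and its signature are assumptions based on the vp9_tile_common.h include added further down:

    /* Hypothetical call site: the tile worker owns its TileInfo instead of
     * reading cur_tile_mi_col_* from the shared VP9_COMMON. */
    TileInfo tile;
    vp9_tile_init(&tile, cm, tile_row, tile_col);  /* assumed helper */
    vp9_find_mv_refs(cm, xd, &tile, mi, prev_mi, ref_frame,
                     mv_ref_list, mi_row, mi_col);
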
diff --git a/libvpx/vp9/common/vp9_onyx.h b/libvpx/vp9/common/vp9_onyx.h
index f424e6a..452dd6b 100644
--- a/libvpx/vp9/common/vp9_onyx.h
+++ b/libvpx/vp9/common/vp9_onyx.h
@@ -13,7 +13,7 @@
#ifdef __cplusplus
extern "C"
-{
+{ // NOLINT
#endif
#include "./vpx_config.h"
@@ -33,7 +33,6 @@ extern "C"
FOURFIVE = 1,
THREEFIVE = 2,
ONETWO = 3
-
} VPX_SCALING;
typedef enum {
@@ -71,42 +70,48 @@ extern "C"
// 3 - lowest quality/fastest decode
int width; // width of data passed to the compressor
int height; // height of data passed to the compressor
- double framerate; // set to passed in framerate
- int64_t target_bandwidth; // bandwidth to be used in kilobits per second
+ double framerate; // set to passed in framerate
+ int64_t target_bandwidth; // bandwidth to be used in kilobits per second
- int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0
- int Sharpness; // parameter used for sharpening output: recommendation 0:
+ int noise_sensitivity; // pre-processing blur: recommendation 0
+ int Sharpness; // sharpening output: recommendation 0
int cpu_used;
unsigned int rc_max_intra_bitrate_pct;
// mode ->
- // (0)=Realtime/Live Encoding. This mode is optimized for realtim encoding (for example, capturing
- // a television signal or feed from a live camera). ( speed setting controls how fast )
- // (1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to
- // encode the output. ( speed setting controls how fast )
- // (2)=One Pass - Best Quality. The encoder places priority on the quality of the output over encoding
- // speed. The output is compressed at the highest possible quality. This option takes the longest
- // amount of time to encode. ( speed setting ignored )
- // (3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding
- // pass. ( speed setting controls how fast )
- // (4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding
- // pass to create the compressed output. ( speed setting controls how fast )
- // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that were generated in the first
- // encoding pass to create the compressed output using the highest possible quality, and taking a
+ // (0)=Realtime/Live Encoding. This mode is optimized for realtime
+ // encoding (for example, capturing a television signal or feed from
+ // a live camera). ( speed setting controls how fast )
+ // (1)=Good Quality Fast Encoding. The encoder balances quality with the
+ // amount of time it takes to encode the output. ( speed setting
+ // controls how fast )
+ // (2)=One Pass - Best Quality. The encoder places priority on the
+ // quality of the output over encoding speed. The output is compressed
+ // at the highest possible quality. This option takes the longest
+ // amount of time to encode. ( speed setting ignored )
+ // (3)=Two Pass - First Pass. The encoder generates a file of statistics
+ // for use in the second encoding pass. ( speed setting controls how
+ // fast )
+ // (4)=Two Pass - Second Pass. The encoder uses the statistics that were
+ // generated in the first encoding pass to create the compressed
+ // output. ( speed setting controls how fast )
+ // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that
+ // were generated in the first encoding pass to create the compressed
+ // output using the highest possible quality, and taking a
+ // longer amount of time to encode. ( speed setting ignored )
- int Mode; //
+ int Mode;
// Key Framing Operations
- int auto_key; // automatically detect cut scenes and set the keyframes
- int key_freq; // maximum distance to key frame.
+ int auto_key; // autodetect cut scenes and set the keyframes
+ int key_freq; // maximum distance to key frame.
- int allow_lag; // allow lagged compression (if 0 lagin frames is ignored)
- int lag_in_frames; // how many frames lag before we start encoding
+ int allow_lag; // allow lagged compression (if 0 lag_in_frames is ignored)
+ int lag_in_frames; // how many frames lag before we start encoding
// ----------------------------------------------------------------
// DATARATE CONTROL OPTIONS
- int end_usage; // vbr or cbr
+ int end_usage; // vbr or cbr
// buffer targeting aggressiveness
int under_shoot_pct;
@@ -138,7 +143,7 @@ extern "C"
int play_alternate;
int alt_freq;
- int encode_breakout; // early breakout encode threshold : for video conf recommend 800
+ int encode_breakout; // early breakout : for video conf recommend 800
/* Bitfield defining the error resiliency features to enable.
* Can provide decodable frames after losses in previous
@@ -173,8 +178,8 @@ extern "C"
void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf);
-// receive a frames worth of data caller can assume that a copy of this frame is made
-// and not just a copy of the pointer..
+ // receive a frame's worth of data. caller can assume that a copy of
+ // this frame is made and not just a copy of the pointer.
int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags,
YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
int64_t end_time_stamp);
@@ -216,8 +221,6 @@ extern "C"
int vp9_set_size_literal(VP9_PTR comp, unsigned int width,
unsigned int height);
- int vp9_switch_layer(VP9_PTR comp, int layer);
-
void vp9_set_svc(VP9_PTR comp, int use_svc);
int vp9_get_quantizer(VP9_PTR c);
diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h
index 0431e14..a2af57a 100644
--- a/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/libvpx/vp9/common/vp9_onyxc_int.h
@@ -11,14 +11,15 @@
#ifndef VP9_COMMON_VP9_ONYXC_INT_H_
#define VP9_COMMON_VP9_ONYXC_INT_H_
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_tile_common.h"
#if CONFIG_VP9_POSTPROC
#include "vp9/common/vp9_postproc.h"
@@ -40,10 +41,9 @@
typedef struct frame_contexts {
vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
- vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
- [PARTITION_TYPES - 1];
+ vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
- vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1]
+ vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS - 1];
vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
@@ -58,11 +58,11 @@ typedef struct frame_contexts {
typedef struct {
unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
- unsigned int partition[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
+ unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES];
unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES]
[COEF_BANDS][PREV_COEF_CONTEXTS];
- unsigned int switchable_interp[SWITCHABLE_FILTERS + 1]
+ unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS];
unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
@@ -91,6 +91,8 @@ typedef struct VP9Common {
DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]);
#endif
+ COLOR_SPACE color_space;
+
int width;
int height;
int display_width;
@@ -116,11 +118,12 @@ typedef struct VP9Common {
// Each frame can reference ALLOWED_REFS_PER_FRAME buffers
int active_ref_idx[ALLOWED_REFS_PER_FRAME];
struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME];
+ struct scale_factors_common active_ref_scale_comm[ALLOWED_REFS_PER_FRAME];
int new_fb_idx;
YV12_BUFFER_CONFIG post_proc_buffer;
- FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */
+ FRAME_TYPE last_frame_type; /* last frame's frame type for motion search. */
FRAME_TYPE frame_type;
int show_frame;
@@ -129,6 +132,8 @@ typedef struct VP9Common {
// Flag signaling that the frame is encoded using only INTRA modes.
int intra_only;
+ int allow_high_precision_mv;
+
// Flag signaling that the frame context should be reset to default values.
// 0 or 1 implies don't reset, 2 reset just the context specified in the
// frame header, 3 reset all contexts.
@@ -146,8 +151,6 @@ typedef struct VP9Common {
TX_MODE tx_mode;
int base_qindex;
- int last_kf_gf_q; /* Q used on the last GF or KF */
-
int y_dc_delta_q;
int uv_dc_delta_q;
int uv_ac_delta_q;
@@ -172,7 +175,7 @@ typedef struct VP9Common {
// Persistent mb segment id map used in prediction.
unsigned char *last_frame_seg_map;
- INTERPOLATIONFILTERTYPE mcomp_filter_type;
+ INTERPOLATION_TYPE mcomp_filter_type;
loop_filter_info_n lf_info;
@@ -183,14 +186,6 @@ typedef struct VP9Common {
struct loopfilter lf;
struct segmentation seg;
- /* Y,U,V */
- ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
- ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
-
- // partition contexts
- PARTITION_CONTEXT *above_seg_context;
- PARTITION_CONTEXT left_seg_context[8];
-
// Context probabilities for reference frame prediction
int allow_comp_inter_inter;
MV_REFERENCE_FRAME comp_fixed_ref;
@@ -213,10 +208,19 @@ typedef struct VP9Common {
int frame_parallel_decoding_mode;
int log2_tile_cols, log2_tile_rows;
- int cur_tile_mi_col_start, cur_tile_mi_col_end;
- int cur_tile_mi_row_start, cur_tile_mi_row_end;
} VP9_COMMON;
+// ref == 0 => LAST_FRAME
+// ref == 1 => GOLDEN_FRAME
+// ref == 2 => ALTREF_FRAME
+static YV12_BUFFER_CONFIG *get_frame_ref_buffer(VP9_COMMON *cm, int ref) {
+ return &cm->yv12_fb[cm->active_ref_idx[ref]];
+}
+
+static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
+ return &cm->yv12_fb[cm->new_fb_idx];
+}
+
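
These helpers simply name the index indirection that callers previously spelled out by hand. A self-contained model of the pattern:

    #define TOY_NUM_BUFS 12  /* stands in for NUM_YV12_BUFFERS */

    typedef struct { int id; } toy_buf;

    typedef struct {
      toy_buf yv12_fb[TOY_NUM_BUFS];
      int active_ref_idx[3];  /* 0 LAST, 1 GOLDEN, 2 ALTREF, as noted above */
      int new_fb_idx;
    } toy_common;

    static toy_buf *toy_ref_buffer(toy_common *cm, int ref) {
      return &cm->yv12_fb[cm->active_ref_idx[ref]];
    }

    static toy_buf *toy_new_buffer(toy_common *cm) {
      return &cm->yv12_fb[cm->new_fb_idx];
    }
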
static int get_free_fb(VP9_COMMON *cm) {
int i;
for (i = 0; i < NUM_YV12_BUFFERS; i++)
@@ -241,58 +245,38 @@ static int mi_cols_aligned_to_sb(int n_mis) {
return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);
}
-static INLINE void set_skip_context(VP9_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col) {
+static INLINE const vp9_prob* get_partition_probs(VP9_COMMON *cm, int ctx) {
+ return cm->frame_type == KEY_FRAME ? vp9_kf_partition_probs[ctx]
+ : cm->fc.partition_prob[ctx];
+}
+
+static INLINE void set_skip_context(
+ MACROBLOCKD *xd,
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE],
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16],
+ int mi_row, int mi_col) {
const int above_idx = mi_col * 2;
const int left_idx = (mi_row * 2) & 15;
int i;
for (i = 0; i < MAX_MB_PLANE; i++) {
struct macroblockd_plane *const pd = &xd->plane[i];
- pd->above_context = cm->above_context[i] + (above_idx >> pd->subsampling_x);
- pd->left_context = cm->left_context[i] + (left_idx >> pd->subsampling_y);
+ pd->above_context = above_context[i] + (above_idx >> pd->subsampling_x);
+ pd->left_context = left_context[i] + (left_idx >> pd->subsampling_y);
}
}
-static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col) {
- xd->above_seg_context = cm->above_seg_context + mi_col;
- xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
-}
-
-// return the node index in the prob tree for binary coding
-static int check_bsize_coverage(int bs, int mi_rows, int mi_cols,
- int mi_row, int mi_col) {
- const int r = (mi_row + bs < mi_rows);
- const int c = (mi_col + bs < mi_cols);
-
- if (r && c)
- return 0;
-
- if (c && !r)
- return 1; // only allow horizontal/split partition types
-
- if (r && !c)
- return 2; // only allow vertical/split partition types
-
- return -1;
-}
-
-static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int bh,
- int mi_col, int bw) {
- xd->mb_to_top_edge = -((mi_row * MI_SIZE) << 3);
- xd->mb_to_bottom_edge = ((cm->mi_rows - bh - mi_row) * MI_SIZE) << 3;
- xd->mb_to_left_edge = -((mi_col * MI_SIZE) << 3);
- xd->mb_to_right_edge = ((cm->mi_cols - bw - mi_col) * MI_SIZE) << 3;
+static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
+ int mi_row, int bh,
+ int mi_col, int bw,
+ int mi_rows, int mi_cols) {
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8;
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8;
// Are edges available for intra prediction?
xd->up_available = (mi_row != 0);
- xd->left_available = (mi_col > cm->cur_tile_mi_col_start);
- xd->right_available = (mi_col + bw < cm->cur_tile_mi_col_end);
-}
-
-static int get_token_alloc(int mb_rows, int mb_cols) {
- return mb_rows * mb_cols * (48 * 16 + 4);
+ xd->left_available = (mi_col > tile->mi_col_start);
}
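
Besides the TileInfo plumbing, the edge distances switch from << 3 to * 8. The motivation shows up again in clamp_mv_to_umv_border_sb below: left-shifting a negative int (such as mb_to_left_edge) is undefined behavior in C, while the equivalent multiplication is well defined, so the patch uses the multiply form consistently. A small standalone illustration (MI_SIZE == 8 is an assumption):

    #include <stdio.h>

    int main(void) {
      const int MI_SIZE = 8;  /* assumed: one mode-info unit is 8 pixels */
      int mi_row = 3, mi_rows = 8, bh = 2;
      /* (x * MI_SIZE) * 8 matches (x * MI_SIZE) << 3 in magnitude but stays
       * defined even when the shifted operand would have been negative. */
      int mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
      int mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8;
      printf("top=%d bottom=%d\n", mb_to_top_edge, mb_to_bottom_edge);
      return 0;
    }
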
static void set_prev_mi(VP9_COMMON *cm) {
@@ -306,4 +290,62 @@ static void set_prev_mi(VP9_COMMON *cm) {
cm->prev_mi = use_prev_in_find_mv_refs ?
cm->prev_mip + cm->mode_info_stride + 1 : NULL;
}
+
+static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
+ return cm->frame_type == KEY_FRAME || cm->intra_only;
+}
+
+static INLINE void update_partition_context(
+ PARTITION_CONTEXT *above_seg_context,
+ PARTITION_CONTEXT left_seg_context[8],
+ int mi_row, int mi_col,
+ BLOCK_SIZE sb_type,
+ BLOCK_SIZE sb_size) {
+ PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col;
+ PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK);
+
+ const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
+ const int bwl = b_width_log2(sb_type);
+ const int bhl = b_height_log2(sb_type);
+ const int boffset = b_width_log2(BLOCK_64X64) - bsl;
+ const char pcval0 = ~(0xe << boffset);
+ const char pcval1 = ~(0xf << boffset);
+ const char pcvalue[2] = {pcval0, pcval1};
+
+ assert(MAX(bwl, bhl) <= bsl);
+
+ // Update the partition context at the end nodes: set the partition
+ // bits of block sizes larger than the current one to one, and the
+ // partition bits of smaller block sizes to zero.
+ vpx_memset(above_ctx, pcvalue[bwl == bsl], bs);
+ vpx_memset(left_ctx, pcvalue[bhl == bsl], bs);
+}
+
+static INLINE int partition_plane_context(
+ const PARTITION_CONTEXT *above_seg_context,
+ const PARTITION_CONTEXT left_seg_context[8],
+ int mi_row, int mi_col,
+ BLOCK_SIZE sb_type) {
+ const PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col;
+ const PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK);
+
+ int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
+ int above = 0, left = 0, i;
+ int boffset = mi_width_log2(BLOCK_64X64) - bsl;
+
+ assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
+ assert(bsl >= 0);
+ assert(boffset >= 0);
+
+ for (i = 0; i < bs; i++)
+ above |= (above_ctx[i] & (1 << boffset));
+ for (i = 0; i < bs; i++)
+ left |= (left_ctx[i] & (1 << boffset));
+
+ above = (above > 0);
+ left = (left > 0);
+
+ return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+}
+
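
A worked example of the bit test in partition_plane_context: each context entry stores one bit per block-size level, and a block ORs together the bit at its own level across the entries it spans. A hedged toy with the constants inlined (assumes mi_width_log2(BLOCK_64X64) == 3 and PARTITION_PLOFFSET == 4):

    static int toy_partition_ctx(const unsigned char *above_ctx,
                                 const unsigned char *left_ctx, int bsl) {
      const int bs = 1 << bsl;      /* block width in mode-info units */
      const int boffset = 3 - bsl;  /* bit index for this size level */
      int above = 0, left = 0, i;
      for (i = 0; i < bs; i++) above |= above_ctx[i] & (1 << boffset);
      for (i = 0; i < bs; i++) left |= left_ctx[i] & (1 << boffset);
      /* four contexts per size level: (left, above) in {0,1} x {0,1} */
      return ((left > 0) * 2 + (above > 0)) + bsl * 4;
    }

For a 32x32 block, bsl == 2, so four entries are tested on each side at bit offset 1.
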
#endif // VP9_COMMON_VP9_ONYXC_INT_H_
diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c
index 955e676..212a28a 100644
--- a/libvpx/vp9/common/vp9_postproc.c
+++ b/libvpx/vp9/common/vp9_postproc.c
@@ -8,6 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
#include "./vpx_config.h"
#include "vpx_scale/yv12config.h"
@@ -18,11 +21,6 @@
#include "./vp9_rtcd.h"
#include "./vpx_scale_rtcd.h"
-
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-
#define RGB_TO_YUV(t) \
( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \
(0.098*(float)(t & 0xff)) + 16), \
@@ -155,7 +153,6 @@ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
p_dst = dst_ptr;
for (col = 0; col < cols; col++) {
-
int kernel = 4;
int v = p_src[col];
@@ -257,7 +254,7 @@ void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
int rows, int cols, int flimit) {
int r, c, i;
- const short *rv3 = &vp9_rv[63 & rand()];
+ const short *rv3 = &vp9_rv[63 & rand()]; // NOLINT
for (c = 0; c < cols; c++) {
uint8_t *s = &dst[c];
@@ -408,7 +405,6 @@ static void fillrd(struct postproc_state *state, int q, int a) {
next = next + j;
}
-
}
for (; next < 256; next++)
@@ -416,7 +412,7 @@ static void fillrd(struct postproc_state *state, int q, int a) {
}
for (i = 0; i < 3072; i++) {
- state->noise[i] = char_dist[rand() & 0xff];
+ state->noise[i] = char_dist[rand() & 0xff]; // NOLINT
}
for (i = 0; i < 16; i++) {
@@ -680,13 +676,14 @@ int vp9_post_proc_frame(struct VP9Common *cm,
#if 0 && CONFIG_POSTPROC_VISUALIZER
if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
char message[512];
- sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
- (cm->frame_type == KEY_FRAME),
- cm->refresh_golden_frame,
- cm->base_qindex,
- cm->filter_level,
- flags,
- cm->mb_cols, cm->mb_rows);
+ snprintf(message, sizeof(message) - 1,
+ "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
+ (cm->frame_type == KEY_FRAME),
+ cm->refresh_golden_frame,
+ cm->base_qindex,
+ cm->filter_level,
+ flags,
+ cm->mb_cols, cm->mb_rows);
vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
cm->post_proc_buffer.y_stride);
}
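
The sprintf calls in this file all become bounded snprintf calls. Since snprintf already NUL-terminates within its size argument, the sizeof(message) - 1 bound is one byte more conservative than strictly required; a minimal standalone check:

    #include <stdio.h>

    int main(void) {
      char message[512];
      /* snprintf writes at most size - 1 characters plus the terminating
       * NUL, so passing sizeof(message) - 1 leaves one spare byte. */
      int n = snprintf(message, sizeof(message) - 1,
                       "F%1dG%1dQ%3d", 1, 0, 128);
      printf("n=%d message=%s\n", n, message);
      return 0;
    }
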
@@ -707,7 +704,7 @@ int vp9_post_proc_frame(struct VP9Common *cm,
for (j = 0; j < mb_cols; j++) {
char zz[4];
- sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
+ snprintf(zz, sizeof(zz) - 1, "%c", mi[mb_index].mbmi.mode + 'a');
vp9_blit_text(zz, y_ptr, post->y_stride);
mb_index++;
@@ -716,7 +713,6 @@ int vp9_post_proc_frame(struct VP9Common *cm,
mb_index++; /* border */
y_ptr += post->y_stride * 16 - post->y_width;
-
}
}
@@ -740,9 +736,9 @@ int vp9_post_proc_frame(struct VP9Common *cm,
mi[mb_index].mbmi.skip_coeff);
if (cm->frame_type == KEY_FRAME)
- sprintf(zz, "a");
+ snprintf(zz, sizeof(zz) - 1, "a");
else
- sprintf(zz, "%c", dc_diff + '0');
+ snprintf(zz, sizeof(zz) - 1, "%c", dc_diff + '0');
vp9_blit_text(zz, y_ptr, post->y_stride);
mb_index++;
@@ -751,7 +747,6 @@ int vp9_post_proc_frame(struct VP9Common *cm,
mb_index++; /* border */
y_ptr += post->y_stride * 16 - post->y_width;
-
}
}
@@ -894,8 +889,9 @@ int vp9_post_proc_frame(struct VP9Common *cm,
constrain_line(lx0, &x1, ly0 + 1, &y1, width, height);
vp9_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride);
- } else
+ } else {
vp9_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride);
+ }
}
mi++;
diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c
index 81fbf1f..6018e17 100644
--- a/libvpx/vp9/common/vp9_pred_common.c
+++ b/libvpx/vp9/common/vp9_pred_common.c
@@ -16,25 +16,33 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_treecoder.h"
+static INLINE const MB_MODE_INFO *get_above_mbmi(const MODE_INFO *const above) {
+ return (above != NULL) ? &above->mbmi : NULL;
+}
+
+static INLINE const MB_MODE_INFO *get_left_mbmi(const MODE_INFO *const left) {
+ return (left != NULL) ? &left->mbmi : NULL;
+}
+
// Returns a context number for the given MB prediction signal
unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const int left_in_image = xd->left_available && left_mi;
- const int above_in_image = xd->up_available && above_mi;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const int above_in_image = above_mi != NULL;
+ const int left_in_image = left_mi != NULL;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialized to 0.
// left
- const int left_mv_pred = left_in_image ? is_inter_mode(left_mi->mbmi.mode)
+ const int left_mv_pred = left_in_image ? is_inter_block(&left_mi->mbmi)
: 0;
const int left_interp = left_in_image && left_mv_pred
? left_mi->mbmi.interp_filter
: SWITCHABLE_FILTERS;
// above
- const int above_mv_pred = above_in_image ? is_inter_mode(above_mi->mbmi.mode)
+ const int above_mv_pred = above_in_image ? is_inter_block(&above_mi->mbmi)
: 0;
const int above_interp = above_in_image && above_mv_pred
? above_mi->mbmi.interp_filter
@@ -53,14 +61,14 @@ unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
}
// Returns a context number for the given MB prediction signal
unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
- const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
- const int left_in_image = xd->left_available && left_mi;
- const int above_in_image = xd->up_available && above_mi;
- const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
+ const int above_in_image = above_mi != NULL;
+ const int left_in_image = left_mi != NULL;
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
+ const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -81,12 +89,12 @@ unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int pred_context;
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
- const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
- const int left_in_image = xd->left_available && left_mi;
- const int above_in_image = xd->up_available && above_mi;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
+ const int above_in_image = above_mi != NULL;
+ const int left_in_image = left_mi != NULL;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -126,14 +134,14 @@ unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int pred_context;
- const MODE_INFO * const above_mi = xd->mi_8x8[-cm->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
- const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
- const int left_in_image = xd->left_available && left_mi;
- const int above_in_image = xd->up_available && above_mi;
- const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
+ const int above_in_image = above_mi != NULL;
+ const int left_in_image = left_mi != NULL;
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
+ const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -206,14 +214,14 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
}
unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
int pred_context;
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
- const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
- const int left_in_image = xd->left_available && left_mi;
- const int above_in_image = xd->up_available && above_mi;
- const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
+ const int above_in_image = above_mi != NULL;
+ const int left_in_image = left_mi != NULL;
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
+ const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -272,14 +280,14 @@ unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
int pred_context;
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
- const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
- const int left_in_image = xd->left_available && left_mi;
- const int above_in_image = xd->up_available && above_mi;
- const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
+ const int above_in_image = above_mi != NULL;
+ const int left_in_image = left_mi != NULL;
const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
+ const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
// Note:
// The mode info data structure has a one element border above and to the
@@ -361,12 +369,12 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
// left of the entries corresponding to real blocks.
// The prediction flags in these dummy entries are initialized to 0.
unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
- const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
- const int left_in_image = xd->left_available && left_mi;
- const int above_in_image = xd->up_available && above_mi;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
+ const int above_in_image = above_mi != NULL;
+ const int left_in_image = left_mi != NULL;
const int max_tx_size = max_txsize_lookup[xd->mi_8x8[0]->mbmi.sb_type];
int above_context = max_tx_size;
int left_context = max_tx_size;
@@ -389,19 +397,14 @@ unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
}
void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) {
- xd->this_mi->mbmi.seg_id_predicted = pred_flag;
-}
-
-void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize,
- uint8_t pred_flag) {
- xd->this_mi->mbmi.skip_coeff = pred_flag;
+ xd->mi_8x8[0]->mbmi.seg_id_predicted = pred_flag;
}
int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = 1 << mi_width_log2(bsize);
- const int bh = 1 << mi_height_log2(bsize);
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
int x, y, segment_id = INT_MAX;
diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h
index 47ca8ab..19032bf 100644
--- a/libvpx/vp9/common/vp9_pred_common.h
+++ b/libvpx/vp9/common/vp9_pred_common.h
@@ -14,17 +14,25 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_onyxc_int.h"
+static INLINE const MODE_INFO *get_above_mi(const MACROBLOCKD *const xd) {
+ return xd->up_available ? xd->mi_8x8[-xd->mode_info_stride] : NULL;
+}
+
+static INLINE const MODE_INFO *get_left_mi(const MACROBLOCKD *const xd) {
+ return xd->left_available ? xd->mi_8x8[-1] : NULL;
+}
+
int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
BLOCK_SIZE bsize, int mi_row, int mi_col);
-
static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const int above_sip = above_mi ? above_mi->mbmi.seg_id_predicted : 0;
- const int left_sip = left_mi ? left_mi->mbmi.seg_id_predicted : 0;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const int above_sip = (above_mi != NULL) ?
+ above_mi->mbmi.seg_id_predicted : 0;
+ const int left_sip = (left_mi != NULL) ? left_mi->mbmi.seg_id_predicted : 0;
- return above_sip + (xd->left_available ? left_sip : 0);
+ return above_sip + left_sip;
}
static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg,
@@ -35,12 +43,13 @@ static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg,
void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag);
static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) {
- const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO * const left_mi = xd->mi_8x8[-1];
- const int above_skip_coeff = above_mi ? above_mi->mbmi.skip_coeff : 0;
- const int left_skip_coeff = left_mi ? left_mi->mbmi.skip_coeff : 0;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const int above_skip_coeff = (above_mi != NULL) ?
+ above_mi->mbmi.skip_coeff : 0;
+ const int left_skip_coeff = (left_mi != NULL) ? left_mi->mbmi.skip_coeff : 0;
- return above_skip_coeff + (xd->left_available ? left_skip_coeff : 0);
+ return above_skip_coeff + left_skip_coeff;
}
static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm,
@@ -49,12 +58,9 @@ static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm,
}
static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) {
- return xd->this_mi->mbmi.skip_coeff;
+ return xd->mi_8x8[0]->mbmi.skip_coeff;
}
-void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize,
- uint8_t pred_flag);
-
unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd);
@@ -69,8 +75,9 @@ unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
const MACROBLOCKD *xd);
-static INLINE vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm,
- const MACROBLOCKD *xd) {
+static INLINE
+vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
const int pred_context = vp9_get_pred_context_comp_inter_inter(cm, xd);
return cm->fc.comp_inter_prob[pred_context];
}
@@ -120,14 +127,14 @@ static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
return get_tx_probs(bsize, context, tx_probs);
}
-static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context,
- TX_SIZE tx_size, struct tx_counts *tx_counts) {
- if (bsize >= BLOCK_32X32)
- tx_counts->p32x32[context][tx_size]++;
- else if (bsize >= BLOCK_16X16)
- tx_counts->p16x16[context][tx_size]++;
+static unsigned int *get_tx_counts(BLOCK_SIZE bsize, uint8_t context,
+ struct tx_counts *tx_counts) {
+ if (bsize < BLOCK_16X16)
+ return tx_counts->p8x8[context];
+ else if (bsize < BLOCK_32X32)
+ return tx_counts->p16x16[context];
else
- tx_counts->p8x8[context][tx_size]++;
+ return tx_counts->p32x32[context];
}
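
Replacing the updater with a getter that returns the count row lets call sites both bump and read the counters through one lookup; the old update_tx_counts(bsize, context, tx_size, tx_counts) becomes an increment at the caller. A hedged sketch with stand-in dimensions and size thresholds:

    struct toy_tx_counts {
      unsigned int p32x32[2][4];  /* dimensions are stand-ins */
      unsigned int p16x16[2][3];
      unsigned int p8x8[2][2];
    };

    static unsigned int *toy_get_tx_counts(int bsize, int ctx,
                                           struct toy_tx_counts *c) {
      if (bsize < 2)       /* stands in for bsize < BLOCK_16X16 */
        return c->p8x8[ctx];
      else if (bsize < 3)  /* stands in for bsize < BLOCK_32X32 */
        return c->p16x16[ctx];
      else
        return c->p32x32[ctx];
    }

    /* New call pattern:  ++toy_get_tx_counts(bsize, ctx, &counts)[tx_size]; */
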
#endif // VP9_COMMON_VP9_PRED_COMMON_H_
diff --git a/libvpx/vp9/common/vp9_quant_common.c b/libvpx/vp9/common/vp9_quant_common.c
index bc40854..6dbdb42 100644
--- a/libvpx/vp9/common/vp9_quant_common.c
+++ b/libvpx/vp9/common/vp9_quant_common.c
@@ -14,69 +14,69 @@
#if 1
static const int16_t dc_qlookup[QINDEX_RANGE] = {
- 4, 8, 8, 9, 10, 11, 12, 12,
- 13, 14, 15, 16, 17, 18, 19, 19,
- 20, 21, 22, 23, 24, 25, 26, 26,
- 27, 28, 29, 30, 31, 32, 32, 33,
- 34, 35, 36, 37, 38, 38, 39, 40,
- 41, 42, 43, 43, 44, 45, 46, 47,
- 48, 48, 49, 50, 51, 52, 53, 53,
- 54, 55, 56, 57, 57, 58, 59, 60,
- 61, 62, 62, 63, 64, 65, 66, 66,
- 67, 68, 69, 70, 70, 71, 72, 73,
- 74, 74, 75, 76, 77, 78, 78, 79,
- 80, 81, 81, 82, 83, 84, 85, 85,
- 87, 88, 90, 92, 93, 95, 96, 98,
- 99, 101, 102, 104, 105, 107, 108, 110,
- 111, 113, 114, 116, 117, 118, 120, 121,
- 123, 125, 127, 129, 131, 134, 136, 138,
- 140, 142, 144, 146, 148, 150, 152, 154,
- 156, 158, 161, 164, 166, 169, 172, 174,
- 177, 180, 182, 185, 187, 190, 192, 195,
- 199, 202, 205, 208, 211, 214, 217, 220,
- 223, 226, 230, 233, 237, 240, 243, 247,
- 250, 253, 257, 261, 265, 269, 272, 276,
- 280, 284, 288, 292, 296, 300, 304, 309,
- 313, 317, 322, 326, 330, 335, 340, 344,
- 349, 354, 359, 364, 369, 374, 379, 384,
- 389, 395, 400, 406, 411, 417, 423, 429,
- 435, 441, 447, 454, 461, 467, 475, 482,
- 489, 497, 505, 513, 522, 530, 539, 549,
- 559, 569, 579, 590, 602, 614, 626, 640,
- 654, 668, 684, 700, 717, 736, 755, 775,
- 796, 819, 843, 869, 896, 925, 955, 988,
+ 4, 8, 8, 9, 10, 11, 12, 12,
+ 13, 14, 15, 16, 17, 18, 19, 19,
+ 20, 21, 22, 23, 24, 25, 26, 26,
+ 27, 28, 29, 30, 31, 32, 32, 33,
+ 34, 35, 36, 37, 38, 38, 39, 40,
+ 41, 42, 43, 43, 44, 45, 46, 47,
+ 48, 48, 49, 50, 51, 52, 53, 53,
+ 54, 55, 56, 57, 57, 58, 59, 60,
+ 61, 62, 62, 63, 64, 65, 66, 66,
+ 67, 68, 69, 70, 70, 71, 72, 73,
+ 74, 74, 75, 76, 77, 78, 78, 79,
+ 80, 81, 81, 82, 83, 84, 85, 85,
+ 87, 88, 90, 92, 93, 95, 96, 98,
+ 99, 101, 102, 104, 105, 107, 108, 110,
+ 111, 113, 114, 116, 117, 118, 120, 121,
+ 123, 125, 127, 129, 131, 134, 136, 138,
+ 140, 142, 144, 146, 148, 150, 152, 154,
+ 156, 158, 161, 164, 166, 169, 172, 174,
+ 177, 180, 182, 185, 187, 190, 192, 195,
+ 199, 202, 205, 208, 211, 214, 217, 220,
+ 223, 226, 230, 233, 237, 240, 243, 247,
+ 250, 253, 257, 261, 265, 269, 272, 276,
+ 280, 284, 288, 292, 296, 300, 304, 309,
+ 313, 317, 322, 326, 330, 335, 340, 344,
+ 349, 354, 359, 364, 369, 374, 379, 384,
+ 389, 395, 400, 406, 411, 417, 423, 429,
+ 435, 441, 447, 454, 461, 467, 475, 482,
+ 489, 497, 505, 513, 522, 530, 539, 549,
+ 559, 569, 579, 590, 602, 614, 626, 640,
+ 654, 668, 684, 700, 717, 736, 755, 775,
+ 796, 819, 843, 869, 896, 925, 955, 988,
1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336,
};
static const int16_t ac_qlookup[QINDEX_RANGE] = {
- 4, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22,
- 23, 24, 25, 26, 27, 28, 29, 30,
- 31, 32, 33, 34, 35, 36, 37, 38,
- 39, 40, 41, 42, 43, 44, 45, 46,
- 47, 48, 49, 50, 51, 52, 53, 54,
- 55, 56, 57, 58, 59, 60, 61, 62,
- 63, 64, 65, 66, 67, 68, 69, 70,
- 71, 72, 73, 74, 75, 76, 77, 78,
- 79, 80, 81, 82, 83, 84, 85, 86,
- 87, 88, 89, 90, 91, 92, 93, 94,
- 95, 96, 97, 98, 99, 100, 101, 102,
- 104, 106, 108, 110, 112, 114, 116, 118,
- 120, 122, 124, 126, 128, 130, 132, 134,
- 136, 138, 140, 142, 144, 146, 148, 150,
- 152, 155, 158, 161, 164, 167, 170, 173,
- 176, 179, 182, 185, 188, 191, 194, 197,
- 200, 203, 207, 211, 215, 219, 223, 227,
- 231, 235, 239, 243, 247, 251, 255, 260,
- 265, 270, 275, 280, 285, 290, 295, 300,
- 305, 311, 317, 323, 329, 335, 341, 347,
- 353, 359, 366, 373, 380, 387, 394, 401,
- 408, 416, 424, 432, 440, 448, 456, 465,
- 474, 483, 492, 501, 510, 520, 530, 540,
- 550, 560, 571, 582, 593, 604, 615, 627,
- 639, 651, 663, 676, 689, 702, 715, 729,
- 743, 757, 771, 786, 801, 816, 832, 848,
- 864, 881, 898, 915, 933, 951, 969, 988,
+ 4, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 64, 65, 66, 67, 68, 69, 70,
+ 71, 72, 73, 74, 75, 76, 77, 78,
+ 79, 80, 81, 82, 83, 84, 85, 86,
+ 87, 88, 89, 90, 91, 92, 93, 94,
+ 95, 96, 97, 98, 99, 100, 101, 102,
+ 104, 106, 108, 110, 112, 114, 116, 118,
+ 120, 122, 124, 126, 128, 130, 132, 134,
+ 136, 138, 140, 142, 144, 146, 148, 150,
+ 152, 155, 158, 161, 164, 167, 170, 173,
+ 176, 179, 182, 185, 188, 191, 194, 197,
+ 200, 203, 207, 211, 215, 219, 223, 227,
+ 231, 235, 239, 243, 247, 251, 255, 260,
+ 265, 270, 275, 280, 285, 290, 295, 300,
+ 305, 311, 317, 323, 329, 335, 341, 347,
+ 353, 359, 366, 373, 380, 387, 394, 401,
+ 408, 416, 424, 432, 440, 448, 456, 465,
+ 474, 483, 492, 501, 510, 520, 530, 540,
+ 550, 560, 571, 582, 593, 604, 615, 627,
+ 639, 651, 663, 676, 689, 702, 715, 729,
+ 743, 757, 771, 786, 801, 816, 832, 848,
+ 864, 881, 898, 915, 933, 951, 969, 988,
1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151,
1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567,
diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c
index dc1d46c..1c96788 100644
--- a/libvpx/vp9/common/vp9_reconinter.c
+++ b/libvpx/vp9/common/vp9_reconinter.c
@@ -20,37 +20,44 @@
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
-
void vp9_setup_interp_filters(MACROBLOCKD *xd,
- INTERPOLATIONFILTERTYPE mcomp_filter_type,
+ INTERPOLATION_TYPE mcomp_filter_type,
VP9_COMMON *cm) {
- if (xd->mi_8x8 && xd->this_mi) {
- MB_MODE_INFO * mbmi = &xd->this_mi->mbmi;
+ if (xd->mi_8x8 && xd->mi_8x8[0]) {
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
- set_scale_factors(xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1,
- cm->active_ref_scale);
+ set_scale_factors(xd, mbmi->ref_frame[0] - LAST_FRAME,
+ mbmi->ref_frame[1] - LAST_FRAME,
+ cm->active_ref_scale);
} else {
set_scale_factors(xd, -1, -1, cm->active_ref_scale);
}
- switch (mcomp_filter_type) {
- case EIGHTTAP:
- case SWITCHABLE:
- xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8;
- break;
- case EIGHTTAP_SMOOTH:
- xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp;
- break;
- case EIGHTTAP_SHARP:
- xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s;
- break;
- case BILINEAR:
- xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters;
- break;
- }
+ xd->subpix.filter_x = xd->subpix.filter_y =
+ vp9_get_filter_kernel(mcomp_filter_type == SWITCHABLE ?
+ EIGHTTAP : mcomp_filter_type);
+
assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
}
+static void inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const MV32 *mv,
+ const struct scale_factors *scale,
+ int w, int h, int ref,
+ const struct subpix_fn_table *subpix,
+ int xs, int ys) {
+ const int subpel_x = mv->col & SUBPEL_MASK;
+ const int subpel_y = mv->row & SUBPEL_MASK;
+
+ src += (mv->row >> SUBPEL_BITS) * src_stride + (mv->col >> SUBPEL_BITS);
+ scale->sfc->predict[subpel_x != 0][subpel_y != 0][ref](
+ src, src_stride, dst, dst_stride,
+ subpix->filter_x[subpel_x], xs,
+ subpix->filter_y[subpel_y], ys,
+ w, h);
+}
+
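
inter_predictor factors out the step every prediction path shares: split the (possibly scaled) motion vector into an integer pixel offset and a subpel phase, then let the phase select the interpolation kernel. A standalone illustration of the split (SUBPEL_BITS == 4, i.e. 1/16-pel units, is an assumption):

    #include <stdio.h>

    #define SUBPEL_BITS 4  /* assumed: q4 motion vectors, 1/16-pel units */
    #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

    int main(void) {
      int mv_col = 37;                      /* 2 full pels + 5/16 pel */
      int int_pel = mv_col >> SUBPEL_BITS;  /* 2: advances the src pointer */
      int subpel = mv_col & SUBPEL_MASK;    /* 5: picks filter_x[subpel] */
      printf("int=%d subpel=%d\n", int_pel, subpel);
      return 0;
    }
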
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const MV *src_mv,
@@ -59,18 +66,13 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
const struct subpix_fn_table *subpix,
enum mv_precision precision) {
const int is_q4 = precision == MV_PRECISION_Q4;
- const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row << 1,
- is_q4 ? src_mv->col : src_mv->col << 1 };
- const MV32 mv = scale->scale_mv(&mv_q4, scale);
- const int subpel_x = mv.col & SUBPEL_MASK;
- const int subpel_y = mv.row & SUBPEL_MASK;
-
- src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
- scale->predict[subpel_x != 0][subpel_y != 0][ref](
- src, src_stride, dst, dst_stride,
- subpix->filter_x[subpel_x], scale->x_step_q4,
- subpix->filter_y[subpel_y], scale->y_step_q4,
- w, h);
+ const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
+ is_q4 ? src_mv->col : src_mv->col * 2 };
+ const struct scale_factors_common *sfc = scale->sfc;
+ const MV32 mv = sfc->scale_mv(&mv_q4, scale);
+
+ inter_predictor(src, src_stride, dst, dst_stride, &mv, scale,
+ w, h, ref, subpix, sfc->x_step_q4, sfc->y_step_q4);
}
static INLINE int round_mv_comp_q4(int value) {
@@ -100,16 +102,17 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS;
const int spel_bottom = spel_top - SUBPEL_SHIFTS;
MV clamped_mv = {
- src_mv->row << (1 - ss_y),
- src_mv->col << (1 - ss_x)
+ src_mv->row * (1 << (1 - ss_y)),
+ src_mv->col * (1 << (1 - ss_x))
};
assert(ss_x <= 1);
assert(ss_y <= 1);
- clamp_mv(&clamped_mv, (xd->mb_to_left_edge << (1 - ss_x)) - spel_left,
- (xd->mb_to_right_edge << (1 - ss_x)) + spel_right,
- (xd->mb_to_top_edge << (1 - ss_y)) - spel_top,
- (xd->mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
+ clamp_mv(&clamped_mv,
+ xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
+ xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right,
+ xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
+ xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom);
return clamped_mv;
}
@@ -130,8 +133,8 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
const int bh = plane_block_height(bsize, pd);
const int x = 4 * (block & ((1 << bwl) - 1));
const int y = 4 * (block >> bwl);
- const MODE_INFO *mi = xd->this_mi;
- const int use_second_ref = mi->mbmi.ref_frame[1] > 0;
+ const MODE_INFO *mi = xd->mi_8x8[0];
+ const int is_compound = has_second_ref(&mi->mbmi);
int ref;
assert(x < bw);
@@ -139,14 +142,10 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw);
assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh);
- for (ref = 0; ref < 1 + use_second_ref; ++ref) {
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
struct scale_factors *const scale = &xd->scale_factor[ref];
struct buf_2d *const pre_buf = &pd->pre[ref];
struct buf_2d *const dst_buf = &pd->dst;
-
- const uint8_t *const pre = pre_buf->buf + scaled_buffer_offset(x, y,
- pre_buf->stride, scale);
-
uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
// TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
@@ -162,15 +161,32 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
// scaling case. It needs to be done on the scaled MV, not the pre-scaling
// MV. Note however that it performs the subsampling aware scaling so
// that the result is always q4.
- const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
- pd->subsampling_x,
- pd->subsampling_y);
-
- scale->set_scaled_offsets(scale, arg->y + y, arg->x + x);
- vp9_build_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
- &res_mv, scale,
- 4 << pred_w, 4 << pred_h, ref,
- &xd->subpix, MV_PRECISION_Q4);
+ // The mv precision here is MV_PRECISION_Q4.
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+ pd->subsampling_x,
+ pd->subsampling_y);
+
+ uint8_t *pre;
+ MV32 scaled_mv;
+ int xs, ys;
+
+ if (vp9_is_scaled(scale->sfc)) {
+ pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale);
+ scale->sfc->set_scaled_offsets(scale, arg->y + y, arg->x + x);
+ scaled_mv = scale->sfc->scale_mv(&mv_q4, scale);
+ xs = scale->sfc->x_step_q4;
+ ys = scale->sfc->y_step_q4;
+ } else {
+ pre = pre_buf->buf + (y * pre_buf->stride + x);
+ scaled_mv.row = mv_q4.row;
+ scaled_mv.col = mv_q4.col;
+ xs = ys = 16;
+ }
+
+ inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+ &scaled_mv, scale,
+ 4 << pred_w, 4 << pred_h, ref,
+ &xd->subpix, xs, ys);
}
}
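
The new branch is the point of this hunk: for an unscaled reference the offset and MV scaling calls are skipped entirely and the step sizes are hard-wired to 16, which in q4 units means one source pixel per output pixel. A quick standalone check of that convention:

    #include <stdio.h>

    int main(void) {
      /* Step sizes are expressed in q4 (1/16-pel) units. */
      int x_step_q4 = 16;                     /* the unscaled fast path */
      int w = 8;                              /* output width in pixels */
      int src_pixels = (w * x_step_q4) >> 4;  /* 8: a 1:1 mapping */
      printf("source pixels consumed: %d\n", src_pixels);
      return 0;
    }

A step of 24 would instead consume 12 source pixels for the same 8 outputs, i.e. a 3:2 downscale.
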
@@ -184,36 +200,17 @@ typedef void (*foreach_predicted_block_visitor)(int plane, int block,
static INLINE void foreach_predicted_block_in_plane(
const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane,
foreach_predicted_block_visitor visit, void *arg) {
- int i, x, y;
-
- // block sizes in number of 4x4 blocks log 2 ("*_b")
- // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
- // subsampled size of the block
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
- // size of the predictor to use.
- int pred_w, pred_h;
-
- if (xd->this_mi->mbmi.sb_type < BLOCK_8X8) {
+ if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) {
+ int i = 0, x, y;
assert(bsize == BLOCK_8X8);
- pred_w = 0;
- pred_h = 0;
+ for (y = 0; y < 1 << bhl; ++y)
+ for (x = 0; x < 1 << bwl; ++x)
+ visit(plane, i++, bsize, 0, 0, arg);
} else {
- pred_w = bwl;
- pred_h = bhl;
- }
- assert(pred_w <= bwl);
- assert(pred_h <= bhl);
-
- // visit each subblock in raster order
- i = 0;
- for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
- for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
- visit(plane, i, bsize, pred_w, pred_h, arg);
- i += 1 << pred_w;
- }
- i += (1 << (bwl + pred_h)) - (1 << bwl);
+ visit(plane, 0, bsize, bwl, bhl, arg);
}
}
@@ -249,15 +246,17 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
const int ref = cm->active_ref_idx[i];
struct scale_factors *const sf = &cm->active_ref_scale[i];
+ struct scale_factors_common *const sfc = &cm->active_ref_scale_comm[i];
if (ref >= NUM_YV12_BUFFERS) {
vp9_zero(*sf);
+ vp9_zero(*sfc);
} else {
YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref];
- vp9_setup_scale_factors_for_frame(sf,
+ vp9_setup_scale_factors_for_frame(sf, sfc,
fb->y_crop_width, fb->y_crop_height,
cm->width, cm->height);
- if (vp9_is_scaled(sf))
+ if (vp9_is_scaled(sfc))
vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y);
}
}
diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h
index 504b793..2c8a6e4 100644
--- a/libvpx/vp9/common/vp9_reconinter.h
+++ b/libvpx/vp9/common/vp9_reconinter.h
@@ -25,7 +25,7 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
void vp9_setup_interp_filters(MACROBLOCKD *xd,
- INTERPOLATIONFILTERTYPE filter,
+ INTERPOLATION_TYPE filter,
VP9_COMMON *cm);
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
@@ -38,8 +38,10 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
static int scaled_buffer_offset(int x_offset, int y_offset, int stride,
const struct scale_factors *scale) {
- const int x = scale ? scale->scale_value_x(x_offset, scale) : x_offset;
- const int y = scale ? scale->scale_value_y(y_offset, scale) : y_offset;
+ const int x = scale ? scale->sfc->scale_value_x(x_offset, scale->sfc) :
+ x_offset;
+ const int y = scale ? scale->sfc->scale_value_y(y_offset, scale->sfc) :
+ y_offset;
return y * stride + x;
}
diff --git a/libvpx/vp9/common/vp9_reconintra.c b/libvpx/vp9/common/vp9_reconintra.c
index 4a451b9..eb643b0 100644
--- a/libvpx/vp9/common/vp9_reconintra.c
+++ b/libvpx/vp9/common/vp9_reconintra.c
@@ -13,7 +13,7 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/vpx_once.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_onyxc_int.h"
@@ -369,7 +369,7 @@ static void build_intra_predictors(const uint8_t *ref, int ref_stride,
}
}
-void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in,
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
TX_SIZE tx_size, int mode,
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride) {
diff --git a/libvpx/vp9/common/vp9_reconintra.h b/libvpx/vp9/common/vp9_reconintra.h
index e9d0dbf..6e3f55c 100644
--- a/libvpx/vp9/common/vp9_reconintra.h
+++ b/libvpx/vp9/common/vp9_reconintra.h
@@ -14,8 +14,8 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_blockd.h"
-void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in,
- TX_SIZE tx_size, int mode,
- const uint8_t *ref, int ref_stride,
- uint8_t *dst, int dst_stride);
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
+ TX_SIZE tx_size, int mode,
+ const uint8_t *ref, int ref_stride,
+ uint8_t *dst, int dst_stride);
#endif // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/libvpx/vp9/common/vp9_rtcd.c b/libvpx/vp9/common/vp9_rtcd.c
index 72613ae..dc15a84 100644
--- a/libvpx/vp9/common/vp9_rtcd.c
+++ b/libvpx/vp9/common/vp9_rtcd.c
@@ -7,9 +7,9 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#define RTCD_C
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
#include "vpx_ports/vpx_once.h"
void vpx_scale_rtcd(void);
diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh
index 042afbb..debec61 100644
--- a/libvpx/vp9/common/vp9_rtcd_defs.sh
+++ b/libvpx/vp9/common/vp9_rtcd_defs.sh
@@ -22,38 +22,23 @@ forward_decls vp9_common_forward_decls
# x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly.
[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse_x86inc=sse &&
- sse2_x86inc=sse2 && ssse3_x86inc=ssse3
+ sse2_x86inc=sse2 && ssse3_x86inc=ssse3 && avx_x86inc=avx && avx2_x86inc=avx2
# this variable is for functions that are 64 bit only.
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3
-
-#
-# Dequant
-#
-
-prototype void vp9_idct_add_16x16 "int16_t *input, uint8_t *dest, int stride, int eob"
-specialize vp9_idct_add_16x16
-
-prototype void vp9_idct_add_8x8 "int16_t *input, uint8_t *dest, int stride, int eob"
-specialize vp9_idct_add_8x8
-
-prototype void vp9_idct_add "int16_t *input, uint8_t *dest, int stride, int eob"
-specialize vp9_idct_add
-
-prototype void vp9_idct_add_32x32 "int16_t *q, uint8_t *dst, int stride, int eob"
-specialize vp9_idct_add_32x32
+[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 &&
+ ssse3_x86_64=ssse3 && avx_x86_64=avx && avx2_x86_64=avx2
#
# RECON
#
prototype void vp9_d207_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_4x4
+specialize vp9_d207_predictor_4x4 $ssse3_x86inc
prototype void vp9_d45_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d45_predictor_4x4 $ssse3_x86inc
prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d63_predictor_4x4
+specialize vp9_d63_predictor_4x4 $ssse3_x86inc
prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_h_predictor_4x4 $ssse3_x86inc
@@ -65,7 +50,7 @@ prototype void vp9_d135_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const u
specialize vp9_d135_predictor_4x4
prototype void vp9_d153_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_4x4
+specialize vp9_d153_predictor_4x4 $ssse3_x86inc
prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_v_predictor_4x4 $sse_x86inc
@@ -86,13 +71,13 @@ prototype void vp9_dc_128_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const
specialize vp9_dc_128_predictor_4x4
prototype void vp9_d207_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_8x8
+specialize vp9_d207_predictor_8x8 $ssse3_x86inc
prototype void vp9_d45_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d45_predictor_8x8 $ssse3_x86inc
prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d63_predictor_8x8
+specialize vp9_d63_predictor_8x8 $ssse3_x86inc
prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_h_predictor_8x8 $ssse3_x86inc
@@ -104,7 +89,7 @@ prototype void vp9_d135_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const u
specialize vp9_d135_predictor_8x8
prototype void vp9_d153_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_8x8
+specialize vp9_d153_predictor_8x8 $ssse3_x86inc
prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_v_predictor_8x8 $sse_x86inc
@@ -125,13 +110,13 @@ prototype void vp9_dc_128_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const
specialize vp9_dc_128_predictor_8x8
prototype void vp9_d207_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_16x16
+specialize vp9_d207_predictor_16x16 $ssse3_x86inc
prototype void vp9_d45_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d45_predictor_16x16 $ssse3_x86inc
prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d63_predictor_16x16
+specialize vp9_d63_predictor_16x16 $ssse3_x86inc
prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_h_predictor_16x16 $ssse3_x86inc
@@ -143,7 +128,7 @@ prototype void vp9_d135_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const
specialize vp9_d135_predictor_16x16
prototype void vp9_d153_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_16x16
+specialize vp9_d153_predictor_16x16 $ssse3_x86inc
prototype void vp9_v_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_v_predictor_16x16 $sse2_x86inc
@@ -164,16 +149,16 @@ prototype void vp9_dc_128_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, con
specialize vp9_dc_128_predictor_16x16
prototype void vp9_d207_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_32x32
+specialize vp9_d207_predictor_32x32 $ssse3_x86inc
prototype void vp9_d45_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d45_predictor_32x32 $ssse3_x86inc
prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d63_predictor_32x32
+specialize vp9_d63_predictor_32x32 $ssse3_x86inc
prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_32x32 $ssse3 x86inc
+specialize vp9_h_predictor_32x32 $ssse3_x86inc
prototype void vp9_d117_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d117_predictor_32x32
@@ -202,17 +187,6 @@ specialize vp9_dc_left_predictor_32x32
prototype void vp9_dc_128_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_dc_128_predictor_32x32
-if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
-prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_8x8 sse2 neon
-
-prototype void vp9_add_constant_residual_16x16 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_16x16 sse2 neon
-
-prototype void vp9_add_constant_residual_32x32 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_32x32 sse2 neon
-fi
-
#
# Loopfilter
#
@@ -226,7 +200,7 @@ prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8
specialize vp9_loop_filter_vertical_edge mmx neon
prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mb_lpf_horizontal_edge_w sse2 neon
+specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon
prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_mbloop_filter_horizontal_edge sse2 neon
@@ -268,80 +242,81 @@ specialize vp9_blend_b
# Sub Pixel Filters
#
prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve_copy $sse2_x86inc neon
+specialize vp9_convolve_copy $sse2_x86inc neon dspr2
prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve_avg $sse2_x86inc neon
+specialize vp9_convolve_avg $sse2_x86inc neon dspr2
prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8 ssse3 neon
+specialize vp9_convolve8 sse2 ssse3 neon dspr2
prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_horiz ssse3 neon
+specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2
prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_vert ssse3 neon
+specialize vp9_convolve8_vert sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg ssse3 neon
+specialize vp9_convolve8_avg sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_horiz ssse3 neon
+specialize vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_vert ssse3 neon
+specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2
#
# dct
#
-prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct4x4_1_add sse2 neon
+prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct4x4_1_add sse2 neon dspr2
-prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct4x4_add sse2 neon
+prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct4x4_16_add sse2 neon dspr2
-prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct8x8_1_add sse2 neon
+prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct8x8_1_add sse2 neon dspr2
-prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct8x8_add sse2 neon
+prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct8x8_64_add sse2 neon dspr2
-prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct10_8x8_add sse2 neon
+prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct8x8_10_add sse2 neon dspr2
-prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct16x16_1_add sse2 neon
+prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct16x16_1_add sse2 neon dspr2
-prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct16x16_add sse2 neon
+prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct16x16_256_add sse2 neon dspr2
-prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct10_16x16_add sse2 neon
+prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct16x16_10_add sse2 neon dspr2
-prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct32x32_add sse2 neon
+prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct32x32_1024_add sse2 neon dspr2
-prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
-specialize vp9_short_idct1_32x32
+prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct32x32_34_add sse2
-prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht4x4_add sse2 neon
+prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct32x32_1_add sse2 dspr2
-prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht8x8_add sse2 neon
+prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_iht4x4_16_add sse2 neon dspr2
-prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
-specialize vp9_short_iht16x16_add sse2
+prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_iht8x8_64_add sse2 neon dspr2
+
+prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type"
+specialize vp9_iht16x16_256_add sse2 dspr2
-prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
-specialize vp9_idct4_1d sse2
# dct and add
-prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_iwalsh4x4_1_add
+prototype void vp9_iwht4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_iwht4x4_1_add
-prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_iwalsh4x4_add
+prototype void vp9_iwht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_iwht4x4_16_add
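
The renames in this hunk encode, in each function's name, the maximum number of nonzero coefficients the variant handles: _1 is DC-only, _10 covers the first ten positions in scan order, and _16/_64/_256/_1024 are the full 4x4/8x8/16x16/32x32 blocks (_34 is the partial 32x32 case). Callers pick a variant from the end-of-block count. A hedged sketch of that dispatch, modeled on the decoder's use (the wrapper name and thresholds here are illustrative):

    #include <stdint.h>

    void vp9_idct8x8_1_add(const int16_t *input, uint8_t *dest, int stride);
    void vp9_idct8x8_10_add(const int16_t *input, uint8_t *dest, int stride);
    void vp9_idct8x8_64_add(const int16_t *input, uint8_t *dest, int stride);

    /* Pick the cheapest inverse transform that still covers all nonzero
     * coefficients (eob = end-of-block index). */
    static void idct8x8_add(const int16_t *input, uint8_t *dest, int stride,
                            int eob) {
      if (eob == 1)        /* DC-only block */
        vp9_idct8x8_1_add(input, dest, stride);
      else if (eob <= 10)  /* only low-frequency coefficients present */
        vp9_idct8x8_10_add(input, dest, stride);
      else                 /* general case: all 64 coefficients */
        vp9_idct8x8_64_add(input, dest, stride);
    }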
#
# Encoder functions below this point.
@@ -697,10 +672,10 @@ specialize vp9_block_error $sse2_x86inc
prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
specialize vp9_subtract_block $sse2_x86inc
-prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+prototype void vp9_quantize_b "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b $ssse3_x86_64
-prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+prototype void vp9_quantize_b_32x32 "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b_32x32 $ssse3_x86_64
#
@@ -715,38 +690,32 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
fi
# fdct functions
-prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
+prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_short_fht4x4 sse2
-prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
+prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_short_fht8x8 sse2
-prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
+prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_short_fht16x16 sse2
-prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct8x8 sse2
-
-prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct4x4 sse2
-
-prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct8x4 sse2
+prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fwht4x4
-prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct32x32 sse2
+prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct4x4 sse2
-prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct32x32_rd sse2
+prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct8x8 sse2
-prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct16x16 sse2
+prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct16x16 sse2
-prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_walsh4x4
+prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct32x32 sse2
-prototype void vp9_short_walsh8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_walsh8x4
+prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct32x32_rd sse2
#
# Motion search
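
For readers unfamiliar with the rtcd_defs.sh DSL: each prototype line declares a C signature, and specialize lists the ISA suffixes for which an optimized implementation exists. Variables such as $ssse3_x86inc expand to the ISA token only when the guarding condition near the top of the file holds, and to nothing otherwise, so a specialization silently drops out on configurations that cannot assemble it. One prototype/specialize pair roughly expands in the generated vp9_rtcd.h to the following (a sketch, not the verbatim generator output):

    /* From: prototype void vp9_fdct8x8 "const int16_t *input, ..."
     *       specialize vp9_fdct8x8 sse2                              */
    void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
    void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
    RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output,
                                    int stride);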
diff --git a/libvpx/vp9/common/vp9_scale.c b/libvpx/vp9/common/vp9_scale.c
index 989206c..3f0994f 100644
--- a/libvpx/vp9/common/vp9_scale.c
+++ b/libvpx/vp9/common/vp9_scale.c
@@ -12,23 +12,23 @@
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_scale.h"
-static INLINE int scaled_x(int val, const struct scale_factors *scale) {
- return val * scale->x_scale_fp >> REF_SCALE_SHIFT;
+static INLINE int scaled_x(int val, const struct scale_factors_common *sfc) {
+ return val * sfc->x_scale_fp >> REF_SCALE_SHIFT;
}
-static INLINE int scaled_y(int val, const struct scale_factors *scale) {
- return val * scale->y_scale_fp >> REF_SCALE_SHIFT;
+static INLINE int scaled_y(int val, const struct scale_factors_common *sfc) {
+ return val * sfc->y_scale_fp >> REF_SCALE_SHIFT;
}
-static int unscaled_value(int val, const struct scale_factors *scale) {
- (void) scale;
+static int unscaled_value(int val, const struct scale_factors_common *sfc) {
+ (void) sfc;
return val;
}
static MV32 scaled_mv(const MV *mv, const struct scale_factors *scale) {
const MV32 res = {
- scaled_y(mv->row, scale) + scale->y_offset_q4,
- scaled_x(mv->col, scale) + scale->x_offset_q4
+ scaled_y(mv->row, scale->sfc) + scale->y_offset_q4,
+ scaled_x(mv->col, scale->sfc) + scale->x_offset_q4
};
return res;
}
@@ -43,8 +43,8 @@ static MV32 unscaled_mv(const MV *mv, const struct scale_factors *scale) {
static void set_offsets_with_scaling(struct scale_factors *scale,
int row, int col) {
- scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale) & SUBPEL_MASK;
- scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale) & SUBPEL_MASK;
+ scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK;
+ scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK;
}
static void set_offsets_without_scaling(struct scale_factors *scale,
@@ -70,31 +70,30 @@ static int check_scale_factors(int other_w, int other_h,
}
void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
+ struct scale_factors_common *scale_comm,
int other_w, int other_h,
int this_w, int this_h) {
if (!check_scale_factors(other_w, other_h, this_w, this_h)) {
- scale->x_scale_fp = REF_INVALID_SCALE;
- scale->y_scale_fp = REF_INVALID_SCALE;
+ scale_comm->x_scale_fp = REF_INVALID_SCALE;
+ scale_comm->y_scale_fp = REF_INVALID_SCALE;
return;
}
- scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
- scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
- scale->x_step_q4 = scaled_x(16, scale);
- scale->y_step_q4 = scaled_y(16, scale);
- scale->x_offset_q4 = 0; // calculated per block
- scale->y_offset_q4 = 0; // calculated per block
+ scale_comm->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
+ scale_comm->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
+ scale_comm->x_step_q4 = scaled_x(16, scale_comm);
+ scale_comm->y_step_q4 = scaled_y(16, scale_comm);
- if (vp9_is_scaled(scale)) {
- scale->scale_value_x = scaled_x;
- scale->scale_value_y = scaled_y;
- scale->set_scaled_offsets = set_offsets_with_scaling;
- scale->scale_mv = scaled_mv;
+ if (vp9_is_scaled(scale_comm)) {
+ scale_comm->scale_value_x = scaled_x;
+ scale_comm->scale_value_y = scaled_y;
+ scale_comm->set_scaled_offsets = set_offsets_with_scaling;
+ scale_comm->scale_mv = scaled_mv;
} else {
- scale->scale_value_x = unscaled_value;
- scale->scale_value_y = unscaled_value;
- scale->set_scaled_offsets = set_offsets_without_scaling;
- scale->scale_mv = unscaled_mv;
+ scale_comm->scale_value_x = unscaled_value;
+ scale_comm->scale_value_y = unscaled_value;
+ scale_comm->set_scaled_offsets = set_offsets_without_scaling;
+ scale_comm->scale_mv = unscaled_mv;
}
// TODO(agrange): Investigate the best choice of functions to use here
@@ -103,44 +102,48 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
// applied in one direction only, and not at all for 0,0, seems to give the
// best quality, but it may be worth trying an additional mode that does
// do the filtering on full-pel.
- if (scale->x_step_q4 == 16) {
- if (scale->y_step_q4 == 16) {
+ if (scale_comm->x_step_q4 == 16) {
+ if (scale_comm->y_step_q4 == 16) {
// No scaling in either direction.
- scale->predict[0][0][0] = vp9_convolve_copy;
- scale->predict[0][0][1] = vp9_convolve_avg;
- scale->predict[0][1][0] = vp9_convolve8_vert;
- scale->predict[0][1][1] = vp9_convolve8_avg_vert;
- scale->predict[1][0][0] = vp9_convolve8_horiz;
- scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
+ scale_comm->predict[0][0][0] = vp9_convolve_copy;
+ scale_comm->predict[0][0][1] = vp9_convolve_avg;
+ scale_comm->predict[0][1][0] = vp9_convolve8_vert;
+ scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert;
+ scale_comm->predict[1][0][0] = vp9_convolve8_horiz;
+ scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz;
} else {
// No scaling in x direction. Must always scale in the y direction.
- scale->predict[0][0][0] = vp9_convolve8_vert;
- scale->predict[0][0][1] = vp9_convolve8_avg_vert;
- scale->predict[0][1][0] = vp9_convolve8_vert;
- scale->predict[0][1][1] = vp9_convolve8_avg_vert;
- scale->predict[1][0][0] = vp9_convolve8;
- scale->predict[1][0][1] = vp9_convolve8_avg;
+ scale_comm->predict[0][0][0] = vp9_convolve8_vert;
+ scale_comm->predict[0][0][1] = vp9_convolve8_avg_vert;
+ scale_comm->predict[0][1][0] = vp9_convolve8_vert;
+ scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert;
+ scale_comm->predict[1][0][0] = vp9_convolve8;
+ scale_comm->predict[1][0][1] = vp9_convolve8_avg;
}
} else {
- if (scale->y_step_q4 == 16) {
+ if (scale_comm->y_step_q4 == 16) {
// No scaling in the y direction. Must always scale in the x direction.
- scale->predict[0][0][0] = vp9_convolve8_horiz;
- scale->predict[0][0][1] = vp9_convolve8_avg_horiz;
- scale->predict[0][1][0] = vp9_convolve8;
- scale->predict[0][1][1] = vp9_convolve8_avg;
- scale->predict[1][0][0] = vp9_convolve8_horiz;
- scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
+ scale_comm->predict[0][0][0] = vp9_convolve8_horiz;
+ scale_comm->predict[0][0][1] = vp9_convolve8_avg_horiz;
+ scale_comm->predict[0][1][0] = vp9_convolve8;
+ scale_comm->predict[0][1][1] = vp9_convolve8_avg;
+ scale_comm->predict[1][0][0] = vp9_convolve8_horiz;
+ scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz;
} else {
// Must always scale in both directions.
- scale->predict[0][0][0] = vp9_convolve8;
- scale->predict[0][0][1] = vp9_convolve8_avg;
- scale->predict[0][1][0] = vp9_convolve8;
- scale->predict[0][1][1] = vp9_convolve8_avg;
- scale->predict[1][0][0] = vp9_convolve8;
- scale->predict[1][0][1] = vp9_convolve8_avg;
+ scale_comm->predict[0][0][0] = vp9_convolve8;
+ scale_comm->predict[0][0][1] = vp9_convolve8_avg;
+ scale_comm->predict[0][1][0] = vp9_convolve8;
+ scale_comm->predict[0][1][1] = vp9_convolve8_avg;
+ scale_comm->predict[1][0][0] = vp9_convolve8;
+ scale_comm->predict[1][0][1] = vp9_convolve8_avg;
}
}
// 2D subpel motion always gets filtered in both directions
- scale->predict[1][1][0] = vp9_convolve8;
- scale->predict[1][1][1] = vp9_convolve8_avg;
+ scale_comm->predict[1][1][0] = vp9_convolve8;
+ scale_comm->predict[1][1][1] = vp9_convolve8_avg;
+
+ scale->sfc = scale_comm;
+ scale->x_offset_q4 = 0; // calculated per block
+ scale->y_offset_q4 = 0; // calculated per block
}
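
The vp9_scale.c rewrite splits what used to be one struct: the fixed per-reference state (fixed-point factors, step sizes, and the predict function table) now lives once in scale_factors_common, while scale_factors keeps only the cheap per-block offsets plus a pointer to the shared part. A minimal usage sketch, assuming a 960x540 reference feeding a 1920x1080 frame (the local names are illustrative):

    #include "vp9/common/vp9_scale.h"

    /* 2:1 upscaled frame, so MV components roughly halve when mapped
     * back into the smaller reference. */
    static MV32 scale_example(void) {
      struct scale_factors_common sfc;
      struct scale_factors sf;
      const MV mv = { 16, -8 };  /* 1/8-pel units */

      vp9_setup_scale_factors_for_frame(&sf, &sfc, 960, 540, 1920, 1080);
      sf.sfc->set_scaled_offsets(&sf, 0, 0);  /* per-block, cheap */
      return sf.sfc->scale_mv(&mv, &sf);      /* ~{8, -4} plus offsets */
    }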
diff --git a/libvpx/vp9/common/vp9_scale.h b/libvpx/vp9/common/vp9_scale.h
index 7a720d0..1437fcd 100644
--- a/libvpx/vp9/common/vp9_scale.h
+++ b/libvpx/vp9/common/vp9_scale.h
@@ -18,34 +18,40 @@
#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
#define REF_INVALID_SCALE -1
-struct scale_factors {
+struct scale_factors;
+struct scale_factors_common {
int x_scale_fp; // horizontal fixed point scale factor
int y_scale_fp; // vertical fixed point scale factor
- int x_offset_q4;
int x_step_q4;
- int y_offset_q4;
int y_step_q4;
- int (*scale_value_x)(int val, const struct scale_factors *scale);
- int (*scale_value_y)(int val, const struct scale_factors *scale);
+ int (*scale_value_x)(int val, const struct scale_factors_common *sfc);
+ int (*scale_value_y)(int val, const struct scale_factors_common *sfc);
void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col);
MV32 (*scale_mv)(const MV *mv, const struct scale_factors *scale);
convolve_fn_t predict[2][2][2]; // horiz, vert, avg
};
+struct scale_factors {
+ int x_offset_q4;
+ int y_offset_q4;
+ const struct scale_factors_common *sfc;
+};
+
void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
+ struct scale_factors_common *scale_comm,
int other_w, int other_h,
int this_w, int this_h);
-static int vp9_is_valid_scale(const struct scale_factors *sf) {
- return sf->x_scale_fp != REF_INVALID_SCALE &&
- sf->y_scale_fp != REF_INVALID_SCALE;
+static int vp9_is_valid_scale(const struct scale_factors_common *sfc) {
+ return sfc->x_scale_fp != REF_INVALID_SCALE &&
+ sfc->y_scale_fp != REF_INVALID_SCALE;
}
-static int vp9_is_scaled(const struct scale_factors *sf) {
- return sf->x_scale_fp != REF_NO_SCALE ||
- sf->y_scale_fp != REF_NO_SCALE;
+static int vp9_is_scaled(const struct scale_factors_common *sfc) {
+ return sfc->x_scale_fp != REF_NO_SCALE ||
+ sfc->y_scale_fp != REF_NO_SCALE;
}
-#endif // VP9_COMMON_VP9_SCALE_H_
+#endif // VP9_COMMON_VP9_SCALE_H_
diff --git a/libvpx/vp9/common/vp9_scan.c b/libvpx/vp9/common/vp9_scan.c
new file mode 100644
index 0000000..f17da91
--- /dev/null
+++ b/libvpx/vp9/common/vp9_scan.c
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_scan.h"
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = {
+ 0, 4, 1, 5,
+ 8, 2, 12, 9,
+ 3, 6, 13, 10,
+ 7, 14, 11, 15,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = {
+ 0, 4, 8, 1,
+ 12, 5, 9, 2,
+ 13, 6, 10, 3,
+ 7, 14, 11, 15,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
+ 0, 1, 4, 2,
+ 5, 3, 6, 8,
+ 9, 7, 12, 10,
+ 13, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
+ 0, 8, 1, 16, 9, 2, 17, 24,
+ 10, 3, 18, 25, 32, 11, 4, 26,
+ 33, 19, 40, 12, 34, 27, 5, 41,
+ 20, 48, 13, 35, 42, 28, 21, 6,
+ 49, 56, 36, 43, 29, 7, 14, 50,
+ 57, 44, 22, 37, 15, 51, 58, 30,
+ 45, 23, 52, 59, 38, 31, 60, 53,
+ 46, 39, 61, 54, 47, 62, 55, 63,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
+ 0, 8, 16, 1, 24, 9, 32, 17,
+ 2, 40, 25, 10, 33, 18, 48, 3,
+ 26, 41, 11, 56, 19, 34, 4, 49,
+ 27, 42, 12, 35, 20, 57, 50, 28,
+ 5, 43, 13, 36, 58, 51, 21, 44,
+ 6, 29, 59, 37, 14, 52, 22, 7,
+ 45, 60, 30, 15, 38, 53, 23, 46,
+ 31, 61, 39, 54, 47, 62, 55, 63,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
+ 0, 1, 2, 8, 9, 3, 16, 10,
+ 4, 17, 11, 24, 5, 18, 25, 12,
+ 19, 26, 32, 6, 13, 20, 33, 27,
+ 7, 34, 40, 21, 28, 41, 14, 35,
+ 48, 42, 29, 36, 49, 22, 43, 15,
+ 56, 37, 50, 44, 30, 57, 23, 51,
+ 58, 45, 38, 52, 31, 59, 53, 46,
+ 60, 39, 61, 47, 54, 55, 62, 63,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
+ 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
+ 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
+ 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146,
+ 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25,
+ 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119,
+ 26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194,
+ 180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59,
+ 12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13,
+ 226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169,
+ 242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108,
+ 77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140,
+ 230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141,
+ 63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142,
+ 219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159,
+ 251,
+ 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239,
+ 255,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
+ 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
+ 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
+ 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
+ 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85,
+ 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179,
+ 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24,
+ 87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227,
+ 88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167,
+ 213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229,
+ 74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59,
+ 200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170,
+ 60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202,
+ 233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125,
+ 62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79,
+ 126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205,
+ 236,
+ 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239,
+ 255,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
+ 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
+ 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
+ 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
+ 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100,
+ 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102,
+ 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160,
+ 89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176,
+ 75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136,
+ 165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166,
+ 167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108,
+ 197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170,
+ 124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186,
+ 156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110,
+ 157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111,
+ 158,
+ 188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220,
+ 175,
+ 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160,
+ 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193,
+ 68, 131, 37, 100,
+ 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38,
+ 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321,
+ 102, 352, 8, 197,
+ 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292,
+ 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293,
+ 41, 417, 199, 136,
+ 262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105,
+ 419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169,
+ 295, 420, 106, 451,
+ 481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421,
+ 75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391,
+ 453, 139, 44, 234,
+ 484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108,
+ 546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577,
+ 486, 77, 204, 362,
+ 608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173,
+ 610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17,
+ 111, 238, 48, 143,
+ 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51,
+ 83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424,
+ 393, 300, 269, 176, 145,
+ 52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301,
+ 270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581,
+ 550, 519, 488, 457, 426, 395,
+ 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737,
+ 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241,
+ 210, 179, 117, 86, 55, 738, 707,
+ 614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491,
+ 367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676,
+ 645, 552, 521, 428, 397, 304,
+ 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553,
+ 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26,
+ 864, 833, 802, 771, 740, 709,
+ 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306,
+ 275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741,
+ 710, 679, 617, 586, 555, 493,
+ 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835,
+ 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867,
+ 743, 619, 495, 371, 247, 123,
+ 896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680,
+ 649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929,
+ 898, 836, 805, 774, 712, 681,
+ 650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154,
+ 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682,
+ 651, 620, 589, 558, 527,
+ 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124,
+ 93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590,
+ 559, 497, 466, 435, 373,
+ 342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715,
+ 622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623,
+ 499, 375, 251, 127,
+ 900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560,
+ 529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716,
+ 685, 654, 592, 561,
+ 530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903,
+ 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469,
+ 438, 407, 376, 345,
+ 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718,
+ 687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998,
+ 967, 874, 843, 750,
+ 719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503,
+ 379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657,
+ 564, 533, 440, 409,
+ 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534,
+ 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783,
+ 752, 721, 690, 659,
+ 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970,
+ 939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381,
+ 350, 319, 1002, 971,
+ 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631,
+ 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568,
+ 537, 444, 413, 972,
+ 941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414,
+ 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601,
+ 570, 539, 508, 477,
+ 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571,
+ 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479,
+ 1007, 883, 759, 635, 511,
+ 912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945,
+ 914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915,
+ 884, 853, 822, 791,
+ 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823,
+ 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607,
+ 1011, 887, 763, 639,
+ 916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825,
+ 794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733,
+ 702, 671, 1013, 982,
+ 951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015,
+ 891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798,
+ 1016, 985, 954, 923,
+ 892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863,
+ 1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021,
+ 990, 959, 1022, 991, 1023,
+};
+
+// Neighborhood 2-tuples for various scans and blocksizes,
+// in {top, left} order for each position in scan order
+// (MAX_NEIGHBORS == 2; edge positions reuse the one available
+// neighbor for both entries, and DC gets {0, 0}).
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+
+
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
+
+static int find_in_scan(const int16_t *scan, int l, int idx) {
+ int n, l2 = l * l;
+ for (n = 0; n < l2; n++) {
+ int rc = scan[n];
+ if (rc == idx)
+ return n;
+ }
+ assert(0);
+ return -1;
+}
+static void init_scan_neighbors(const int16_t *scan,
+ int16_t *iscan,
+ int l, int16_t *neighbors) {
+ int l2 = l * l;
+ int n, i, j;
+
+ // dc doesn't use this type of prediction
+ neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
+ neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
+ iscan[0] = find_in_scan(scan, l, 0);
+ for (n = 1; n < l2; n++) {
+ int rc = scan[n];
+ iscan[n] = find_in_scan(scan, l, n);
+ i = rc / l;
+ j = rc % l;
+ if (i > 0 && j > 0) {
+ // col/row scan is used for adst/dct, and generally means that
+ // energy decreases to zero much faster in the dimension in
+ // which ADST is used compared to the direction in which DCT
+ // is used. Likewise, we find much higher correlation between
+ // coefficients within the direction in which DCT is used.
+ // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
+ // as a context. If ADST or DCT is used in both directions, we
+ // use the combination of the two as a context.
+ int a = (i - 1) * l + j;
+ int b = i * l + j - 1;
+ if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||
+ scan == vp9_col_scan_16x16) {
+ // in the col/row scan cases (as well as left/top edge cases), we set
+ // both contexts to the same value, so we can branchlessly do a+b+1>>1
+ // which automatically becomes a if a == b
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = a;
+ } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||
+ scan == vp9_row_scan_16x16) {
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = b;
+ } else {
+ neighbors[MAX_NEIGHBORS * n + 0] = a;
+ neighbors[MAX_NEIGHBORS * n + 1] = b;
+ }
+ } else if (i > 0) {
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j;
+ } else {
+ assert(j > 0);
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1;
+ }
+ assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n);
+ }
+ // one padding item so we don't have to add branches in code to handle
+ // calls to get_coef_context() for the token after the final dc token
+ neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
+ neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
+}
+
+void vp9_init_neighbors() {
+ init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
+ vp9_default_scan_4x4_neighbors);
+ init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
+ vp9_row_scan_4x4_neighbors);
+ init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
+ vp9_col_scan_4x4_neighbors);
+ init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
+ vp9_default_scan_8x8_neighbors);
+ init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
+ vp9_row_scan_8x8_neighbors);
+ init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
+ vp9_col_scan_8x8_neighbors);
+ init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
+ vp9_default_scan_16x16_neighbors);
+ init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
+ vp9_row_scan_16x16_neighbors);
+ init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
+ vp9_col_scan_16x16_neighbors);
+ init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
+ vp9_default_scan_32x32_neighbors);
+}
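
Two invariants are worth calling out: the iscan tables built here are the inverse permutations of the scan tables (scan maps scan position to raster index, iscan maps back), and every context neighbor of scan position n is itself coded strictly before n, which init_scan_neighbors() asserts. A small self-check sketch, assuming vp9_init_neighbors() has run:

    #include <assert.h>
    #include "vp9/common/vp9_scan.h"

    static void check_scan_tables(void) {
      int n;
      vp9_init_neighbors();
      for (n = 0; n < 16; n++)  /* iscan inverts scan */
        assert(vp9_default_iscan_4x4[vp9_default_scan_4x4[n]] == n);
      for (n = 1; n < 16; n++)  /* neighbors are coded before position n */
        assert(vp9_default_iscan_4x4[
                   vp9_default_scan_4x4_neighbors[MAX_NEIGHBORS * n + 0]] < n);
    }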
diff --git a/libvpx/vp9/common/vp9_scan.h b/libvpx/vp9/common/vp9_scan.h
new file mode 100644
index 0000000..14a1a7e
--- /dev/null
+++ b/libvpx/vp9/common/vp9_scan.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_SCAN_H_
+#define VP9_COMMON_VP9_SCAN_H_
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_enums.h"
+
+#define MAX_NEIGHBORS 2
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
+
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+
+
+void vp9_init_neighbors();
+
+static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_scan_4x4;
+ case DCT_ADST:
+ return vp9_col_scan_4x4;
+ default:
+ return vp9_default_scan_4x4;
+ }
+}
+
+static INLINE void get_scan_nb_4x4(TX_TYPE tx_type,
+ const int16_t **scan, const int16_t **nb) {
+ switch (tx_type) {
+ case ADST_DCT:
+ *scan = vp9_row_scan_4x4;
+ *nb = vp9_row_scan_4x4_neighbors;
+ break;
+ case DCT_ADST:
+ *scan = vp9_col_scan_4x4;
+ *nb = vp9_col_scan_4x4_neighbors;
+ break;
+ default:
+ *scan = vp9_default_scan_4x4;
+ *nb = vp9_default_scan_4x4_neighbors;
+ break;
+ }
+}
+
+static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_iscan_4x4;
+ case DCT_ADST:
+ return vp9_col_iscan_4x4;
+ default:
+ return vp9_default_iscan_4x4;
+ }
+}
+
+static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_scan_8x8;
+ case DCT_ADST:
+ return vp9_col_scan_8x8;
+ default:
+ return vp9_default_scan_8x8;
+ }
+}
+
+static INLINE void get_scan_nb_8x8(TX_TYPE tx_type,
+ const int16_t **scan, const int16_t **nb) {
+ switch (tx_type) {
+ case ADST_DCT:
+ *scan = vp9_row_scan_8x8;
+ *nb = vp9_row_scan_8x8_neighbors;
+ break;
+ case DCT_ADST:
+ *scan = vp9_col_scan_8x8;
+ *nb = vp9_col_scan_8x8_neighbors;
+ break;
+ default:
+ *scan = vp9_default_scan_8x8;
+ *nb = vp9_default_scan_8x8_neighbors;
+ break;
+ }
+}
+
+static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_iscan_8x8;
+ case DCT_ADST:
+ return vp9_col_iscan_8x8;
+ default:
+ return vp9_default_iscan_8x8;
+ }
+}
+
+static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_scan_16x16;
+ case DCT_ADST:
+ return vp9_col_scan_16x16;
+ default:
+ return vp9_default_scan_16x16;
+ }
+}
+
+static INLINE void get_scan_nb_16x16(TX_TYPE tx_type,
+ const int16_t **scan, const int16_t **nb) {
+ switch (tx_type) {
+ case ADST_DCT:
+ *scan = vp9_row_scan_16x16;
+ *nb = vp9_row_scan_16x16_neighbors;
+ break;
+ case DCT_ADST:
+ *scan = vp9_col_scan_16x16;
+ *nb = vp9_col_scan_16x16_neighbors;
+ break;
+ default:
+ *scan = vp9_default_scan_16x16;
+ *nb = vp9_default_scan_16x16_neighbors;
+ break;
+ }
+}
+
+static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_iscan_16x16;
+ case DCT_ADST:
+ return vp9_col_iscan_16x16;
+ default:
+ return vp9_default_iscan_16x16;
+ }
+}
+
+static INLINE int get_coef_context(const int16_t *neighbors,
+ const uint8_t *token_cache, int c) {
+ return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
+ token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
+}
+
+#endif // VP9_COMMON_VP9_SCAN_H_
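
get_coef_context averages the cached token classes of the two neighbors, (1 + t[a] + t[b]) >> 1, which collapses to t[a] whenever both tuple entries alias the same position, exactly the branchless trick described in the col/row-scan comment in vp9_scan.c. A hedged decode-side sketch of how the header's pieces fit together (token_cache handling is simplified to a placeholder):

    #include <stdint.h>
    #include "vp9/common/vp9_scan.h"

    static void decode_coeffs_sketch(TX_TYPE tx_type, int eob,
                                     uint8_t *token_cache /* [16] */) {
      const int16_t *scan, *nb;
      int c;
      get_scan_nb_4x4(tx_type, &scan, &nb);
      for (c = 0; c < eob; c++) {
        const int ctx = get_coef_context(nb, token_cache, c);
        /* ... decode the token for raster position scan[c] using ctx ... */
        token_cache[scan[c]] = 1;  /* placeholder: store decoded token class */
        (void)ctx;
      }
    }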
diff --git a/libvpx/vp9/common/vp9_seg_common.c b/libvpx/vp9/common/vp9_seg_common.c
index 6bfd8f8..ef30404 100644
--- a/libvpx/vp9/common/vp9_seg_common.c
+++ b/libvpx/vp9/common/vp9_seg_common.c
@@ -76,7 +76,7 @@ int vp9_get_segdata(const struct segmentation *seg, int segment_id,
}
-const vp9_tree_index vp9_segment_tree[14] = {
+const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
2, 4, 6, 8, 10, 12,
0, -1, -2, -3, -4, -5, -6, -7
};
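
The bound is unchanged, just no longer hard-coded: a binary tree with n leaves has n - 1 internal nodes, each taking two vp9_tree_index slots, and VP9's MAX_SEGMENTS is 8, so TREE_SIZE(MAX_SEGMENTS) = 2 * 8 - 2 = 14, the literal it replaces. A compile-time check of that arithmetic:

    #define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
    /* Fails to compile if the new bound ever drifts from the old literal. */
    typedef char check_tree_size[TREE_SIZE(8) == 14 ? 1 : -1];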
diff --git a/libvpx/vp9/common/vp9_seg_common.h b/libvpx/vp9/common/vp9_seg_common.h
index f22239b..eb38c06 100644
--- a/libvpx/vp9/common/vp9_seg_common.h
+++ b/libvpx/vp9/common/vp9_seg_common.h
@@ -76,7 +76,7 @@ int vp9_get_segdata(const struct segmentation *seg,
int segment_id,
SEG_LVL_FEATURES feature_id);
-extern const vp9_tree_index vp9_segment_tree[14];
+extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
#endif // VP9_COMMON_VP9_SEG_COMMON_H_
diff --git a/libvpx/vp9/common/vp9_subpelvar.h b/libvpx/vp9/common/vp9_subpelvar.h
deleted file mode 100644
index fe75481..0000000
--- a/libvpx/vp9/common/vp9_subpelvar.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_SUBPELVAR_H_
-#define VP9_COMMON_VP9_SUBPELVAR_H_
-
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_convolve.h"
-
-static void variance(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int recon_stride,
- int w,
- int h,
- unsigned int *sse,
- int *sum) {
- int i, j;
- int diff;
-
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- diff = src_ptr[j] - ref_ptr[j];
- *sum += diff;
- *sse += diff * diff;
- }
-
- src_ptr += source_stride;
- ref_ptr += recon_stride;
- }
-}
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil_first_pass
- *
- * INPUTS : uint8_t *src_ptr : Pointer to source block.
- * uint32_t src_pixels_per_line : Stride of input block.
- * uint32_t pixel_step : Offset between filter input samples (see notes).
- * uint32_t output_height : Input block height.
- * uint32_t output_width : Input block width.
- * int32_t *vp9_filter : Array of 2 bi-linear filter taps.
- *
- * OUTPUTS : int32_t *output_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
- * either horizontal or vertical direction to produce the
- * filtered output block. Used to implement first-pass
- * of 2-D separable filter.
- *
- * SPECIAL NOTES : Produces int32_t output to retain precision for next pass.
- * Two filter taps should sum to VP9_FILTER_WEIGHT.
- * pixel_step defines whether the filter is applied
- * horizontally (pixel_step=1) or vertically (pixel_step=stride).
- * It defines the offset required to move from one input
- * to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
- uint16_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const int16_t *vp9_filter) {
- unsigned int i, j;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
- (int)src_ptr[pixel_step] * vp9_filter[1],
- FILTER_BITS);
-
- src_ptr++;
- }
-
- // Next row...
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil_second_pass
- *
- * INPUTS : int32_t *src_ptr : Pointer to source block.
- * uint32_t src_pixels_per_line : Stride of input block.
- * uint32_t pixel_step : Offset between filter input samples (see notes).
- * uint32_t output_height : Input block height.
- * uint32_t output_width : Input block width.
- * int32_t *vp9_filter : Array of 2 bi-linear filter taps.
- *
- * OUTPUTS : uint16_t *output_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
- * either horizontal or vertical direction to produce the
- * filtered output block. Used to implement second-pass
- * of 2-D separable filter.
- *
- * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
- * Two filter taps should sum to VP9_FILTER_WEIGHT.
- * pixel_step defines whether the filter is applied
- * horizontally (pixel_step=1) or vertically (pixel_step=stride).
- * It defines the offset required to move from one input
- * to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const int16_t *vp9_filter) {
- unsigned int i, j;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
- (int)src_ptr[pixel_step] * vp9_filter[1],
- FILTER_BITS);
- src_ptr++;
- }
-
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-#endif // VP9_COMMON_VP9_SUBPELVAR_H_
diff --git a/libvpx/vp9/common/vp9_systemdependent.h b/libvpx/vp9/common/vp9_systemdependent.h
index cc909e2..254a431 100644
--- a/libvpx/vp9/common/vp9_systemdependent.h
+++ b/libvpx/vp9/common/vp9_systemdependent.h
@@ -13,6 +13,7 @@
#ifdef _MSC_VER
#include <math.h>
+#define snprintf _snprintf
#endif
#include "./vpx_config.h"
@@ -23,8 +24,8 @@ void vpx_reset_mmx_state(void);
#define vp9_clear_system_state()
#endif
-#ifdef _MSC_VER
-// round is not defined in MSVC
+#if defined(_MSC_VER) && _MSC_VER < 1800
+// round is not defined in MSVC before VS2013.
static int round(double x) {
if (x < 0)
return (int)ceil(x - 0.5);
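
The hunk cuts off before the shim's positive branch; upstream libvpx returns (int)floor(x + 0.5) there, so under that assumption the shim rounds halfway cases away from zero, matching C99 round(). Worked values:

    /*   round( 2.5) ==  3     round(-2.5) == -3
     *   round( 2.4) ==  2     round(-0.4) ==  0  */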
diff --git a/libvpx/vp9/common/vp9_tile_common.c b/libvpx/vp9/common/vp9_tile_common.c
index 1791c1a..e3035d0 100644
--- a/libvpx/vp9/common/vp9_tile_common.c
+++ b/libvpx/vp9/common/vp9_tile_common.c
@@ -10,6 +10,8 @@
#include "vp9/common/vp9_tile_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
#define MIN_TILE_WIDTH_B64 4
#define MAX_TILE_WIDTH_B64 64
@@ -17,8 +19,8 @@ static int to_sbs(n_mis) {
return mi_cols_aligned_to_sb(n_mis) >> MI_BLOCK_SIZE_LOG2;
}
-static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off,
- int tile_idx, int log2_n_tiles, int n_mis) {
+static void get_tile_offsets(int *min_tile_off, int *max_tile_off,
+ int tile_idx, int log2_n_tiles, int n_mis) {
const int n_sbs = to_sbs(n_mis);
const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles;
const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;
@@ -27,17 +29,14 @@ static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off,
*max_tile_off = MIN(sb_off2 << 3, n_mis);
}
-void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) {
- vp9_get_tile_offsets(&cm->cur_tile_mi_col_start, &cm->cur_tile_mi_col_end,
- tile_col_idx, cm->log2_tile_cols, cm->mi_cols);
-}
-
-void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) {
- vp9_get_tile_offsets(&cm->cur_tile_mi_row_start, &cm->cur_tile_mi_row_end,
- tile_row_idx, cm->log2_tile_rows, cm->mi_rows);
+void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm,
+ int row_idx, int col_idx) {
+ get_tile_offsets(&tile->mi_row_start, &tile->mi_row_end,
+ row_idx, cm->log2_tile_rows, cm->mi_rows);
+ get_tile_offsets(&tile->mi_col_start, &tile->mi_col_end,
+ col_idx, cm->log2_tile_cols, cm->mi_cols);
}
-
void vp9_get_tile_n_bits(int mi_cols,
int *min_log2_tile_cols, int *max_log2_tile_cols) {
const int sb_cols = to_sbs(mi_cols);
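
A worked example of get_tile_offsets, assuming 64x64 superblocks (MI_BLOCK_SIZE_LOG2 == 3, mi counts aligned up to 8): for n_mis = 150 mi columns, n_sbs = to_sbs(150) = ALIGN(150, 8) >> 3 = 19; with log2_n_tiles = 2 (four tiles) the ranges come out as

    tile 0: sb [ 0,  4) -> mi [  0,  32)
    tile 1: sb [ 4,  9) -> mi [ 32,  72)
    tile 2: sb [ 9, 14) -> mi [ 72, 112)
    tile 3: sb [14, 19) -> mi [112, 150)   (clamped by MIN(19 << 3, 150))

so tiles differ in width by at most one superblock.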
diff --git a/libvpx/vp9/common/vp9_tile_common.h b/libvpx/vp9/common/vp9_tile_common.h
index 6d14560..a110abb 100644
--- a/libvpx/vp9/common/vp9_tile_common.h
+++ b/libvpx/vp9/common/vp9_tile_common.h
@@ -11,11 +11,17 @@
#ifndef VP9_COMMON_VP9_TILE_COMMON_H_
#define VP9_COMMON_VP9_TILE_COMMON_H_
-#include "vp9/common/vp9_onyxc_int.h"
+struct VP9Common;
-void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx);
+typedef struct TileInfo {
+ int mi_row_start, mi_row_end;
+ int mi_col_start, mi_col_end;
+} TileInfo;
-void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx);
+// initializes 'tile->mi_(row|col)_(start|end)' for (row_idx, col_idx) based on
+// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)'
+void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm,
+ int row_idx, int col_idx);
void vp9_get_tile_n_bits(int mi_cols,
int *min_log2_tile_cols, int *max_log2_tile_cols);
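
A sketch of how a caller is expected to use the new TileInfo API, assuming cm is fully initialized and the tile counts were derived from cm->log2_tile_(rows|cols):

    #include "vp9/common/vp9_tile_common.h"

    static void walk_tiles(const struct VP9Common *cm, int tile_rows,
                           int tile_cols) {
      int r, c;
      for (r = 0; r < tile_rows; r++) {
        for (c = 0; c < tile_cols; c++) {
          TileInfo tile;
          vp9_tile_init(&tile, cm, r, c);
          /* process mi rows [tile.mi_row_start, tile.mi_row_end) x
           * cols [tile.mi_col_start, tile.mi_col_end) */
        }
      }
    }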
diff --git a/libvpx/vp9/common/vp9_treecoder.c b/libvpx/vp9/common/vp9_treecoder.c
index 2e21a5b..1805fb4 100644
--- a/libvpx/vp9/common/vp9_treecoder.c
+++ b/libvpx/vp9/common/vp9_treecoder.c
@@ -25,8 +25,9 @@ static void tree2tok(struct vp9_token *const p, vp9_tree t,
if (j <= 0) {
p[-j].value = v;
p[-j].len = l;
- } else
+ } else {
tree2tok(p, t, j, v, l);
+ }
} while (++v & 1);
}
@@ -39,9 +40,7 @@ void vp9_tokens_from_tree_offset(struct vp9_token *p, vp9_tree t,
tree2tok(p - offset, t, 0, 0, 0);
}
-static unsigned int convert_distribution(unsigned int i,
- vp9_tree tree,
- vp9_prob probs[],
+static unsigned int convert_distribution(unsigned int i, vp9_tree tree,
unsigned int branch_ct[][2],
const unsigned int num_events[],
unsigned int tok0_offset) {
@@ -50,26 +49,25 @@ static unsigned int convert_distribution(unsigned int i,
if (tree[i] <= 0) {
left = num_events[-tree[i] - tok0_offset];
} else {
- left = convert_distribution(tree[i], tree, probs, branch_ct,
- num_events, tok0_offset);
+ left = convert_distribution(tree[i], tree, branch_ct, num_events,
+ tok0_offset);
}
if (tree[i + 1] <= 0)
right = num_events[-tree[i + 1] - tok0_offset];
else
- right = convert_distribution(tree[i + 1], tree, probs, branch_ct,
- num_events, tok0_offset);
+ right = convert_distribution(tree[i + 1], tree, branch_ct, num_events,
+ tok0_offset);
- probs[i>>1] = get_binary_prob(left, right);
- branch_ct[i>>1][0] = left;
- branch_ct[i>>1][1] = right;
+ branch_ct[i >> 1][0] = left;
+ branch_ct[i >> 1][1] = right;
return left + right;
}
-void vp9_tree_probs_from_distribution(
- vp9_tree tree,
- vp9_prob probs [ /* n-1 */ ],
- unsigned int branch_ct [ /* n-1 */ ] [2],
- const unsigned int num_events[ /* n */ ],
- unsigned int tok0_offset) {
- convert_distribution(0, tree, probs, branch_ct, num_events, tok0_offset);
+void vp9_tree_probs_from_distribution(vp9_tree tree,
+ unsigned int branch_ct[/* n-1 */][2],
+ const unsigned int num_events[/* n */],
+ unsigned int tok0_offset) {
+ convert_distribution(0, tree, branch_ct, num_events, tok0_offset);
}
+
+
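Note: after the change above, convert_distribution only fills branch_ct;
probabilities are derived separately (see merge_probs in vp9_treecoder.h
below). For each internal node it records the summed event counts of the left
and right subtrees. A hand-worked sketch on a hypothetical 3-leaf tree, using
TREE_SIZE from the vp9_treecoder.h diff:

  static const vp9_tree_index my_tree[TREE_SIZE(3)] = {
    0, 2,     /* node 0: left = token 0, right -> node at index 2 */
    -1, -2    /* node 2: left = token 1, right = token 2 */
  };

  unsigned int branch_ct[2][2];
  const unsigned int num_events[3] = { 10, 30, 60 };

  vp9_tree_probs_from_distribution(my_tree, branch_ct, num_events, 0);
  /* branch_ct[1] = { 30, 60 }   (node at index 2)
     branch_ct[0] = { 10, 90 }   (root; 90 = 30 + 60)   */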
diff --git a/libvpx/vp9/common/vp9_treecoder.h b/libvpx/vp9/common/vp9_treecoder.h
index 31182c3..9c776d6 100644
--- a/libvpx/vp9/common/vp9_treecoder.h
+++ b/libvpx/vp9/common/vp9_treecoder.h
@@ -21,6 +21,8 @@ typedef uint8_t vp9_prob;
typedef int8_t vp9_tree_index;
+#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
+
#define vp9_complement(x) (255 - x)
/* We build coding trees compactly in arrays.
@@ -30,7 +32,7 @@ typedef int8_t vp9_tree_index;
Index > 0 means need another bit, specification at index.
Nonnegative indices are always even; processing begins at node 0. */
-typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
+typedef const vp9_tree_index vp9_tree[];
struct vp9_token {
int value;
@@ -48,11 +50,11 @@ void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset);
probability updates. */
void vp9_tree_probs_from_distribution(vp9_tree tree,
- vp9_prob probs[ /* n - 1 */ ],
unsigned int branch_ct[ /* n - 1 */ ][2],
const unsigned int num_events[ /* n */ ],
unsigned int tok0_offset);
+
static INLINE vp9_prob clip_prob(int p) {
return (p > 255) ? 255u : (p < 1) ? 1u : p;
}
@@ -79,21 +81,46 @@ static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
}
-static INLINE vp9_prob merge_probs(vp9_prob pre_prob, vp9_prob prob,
+static INLINE vp9_prob merge_probs(vp9_prob pre_prob,
const unsigned int ct[2],
unsigned int count_sat,
unsigned int max_update_factor) {
+ const vp9_prob prob = get_binary_prob(ct[0], ct[1]);
const unsigned int count = MIN(ct[0] + ct[1], count_sat);
const unsigned int factor = max_update_factor * count / count_sat;
return weighted_prob(pre_prob, prob, factor);
}
-static INLINE vp9_prob merge_probs2(vp9_prob pre_prob,
- const unsigned int ct[2],
- unsigned int count_sat,
- unsigned int max_update_factor) {
- return merge_probs(pre_prob, get_binary_prob(ct[0], ct[1]), ct, count_sat,
- max_update_factor);
+static unsigned int tree_merge_probs_impl(unsigned int i,
+ const vp9_tree_index *tree,
+ const vp9_prob *pre_probs,
+ const unsigned int *counts,
+ unsigned int count_sat,
+ unsigned int max_update_factor,
+ vp9_prob *probs) {
+ const int l = tree[i];
+ const unsigned int left_count = (l <= 0)
+ ? counts[-l]
+ : tree_merge_probs_impl(l, tree, pre_probs, counts,
+ count_sat, max_update_factor, probs);
+ const int r = tree[i + 1];
+ const unsigned int right_count = (r <= 0)
+ ? counts[-r]
+ : tree_merge_probs_impl(r, tree, pre_probs, counts,
+ count_sat, max_update_factor, probs);
+ const unsigned int ct[2] = { left_count, right_count };
+ probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct,
+ count_sat, max_update_factor);
+ return left_count + right_count;
+}
+
+static void tree_merge_probs(const vp9_tree_index *tree,
+ const vp9_prob *pre_probs,
+ const unsigned int *counts, int offset,
+ unsigned int count_sat,
+ unsigned int max_update_factor, vp9_prob *probs) {
+ tree_merge_probs_impl(0, tree, pre_probs, &counts[-offset],
+ count_sat, max_update_factor, probs);
}
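Note: merge_probs now derives the fresh probability from the counts itself,
and tree_merge_probs walks a tree applying it at every internal node. Worked
numbers for merge_probs (illustrative): with ct = {30, 10}, count_sat = 20 and
max_update_factor = 128, the count saturates at 20, so factor = 128 and the
result is the midpoint blend weighted_prob(pre_prob, get_binary_prob(30, 10),
128). A usage sketch for the tree walker; names and adaptation constants are
hypothetical:

  #define MY_COUNT_SAT          20
  #define MY_MAX_UPDATE_FACTOR 128

  static const vp9_tree_index my_tree[TREE_SIZE(3)] = { 0, 2, -1, -2 };

  /* counts[] holds one event count per token; pre_probs[]/probs[] hold
     one entry per internal node (n - 1 = 2 here). */
  static void adapt_my_probs(const vp9_prob pre_probs[2],
                             const unsigned int counts[3],
                             vp9_prob probs[2]) {
    tree_merge_probs(my_tree, pre_probs, counts, 0,
                     MY_COUNT_SAT, MY_MAX_UPDATE_FACTOR, probs);
  }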
diff --git a/libvpx/vp9/common/x86/vp9_asm_stubs.c b/libvpx/vp9/common/x86/vp9_asm_stubs.c
index 3f1c198..106e6d4 100644
--- a/libvpx/vp9/common/x86/vp9_asm_stubs.c
+++ b/libvpx/vp9/common/x86/vp9_asm_stubs.c
@@ -36,90 +36,28 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
{ 8, 8, 8, 8, 120, 120, 120, 120 }
};
-#if HAVE_SSSE3
-void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
+typedef void filter8_1dfunction (
+ const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter
+);
-void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
+#if HAVE_SSSE3
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
@@ -279,7 +217,7 @@ void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
assert(w <= 64);
assert(h <= 64);
@@ -300,7 +238,7 @@ void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
assert(w <= 64);
assert(h <= 64);
@@ -317,3 +255,214 @@ void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
}
}
#endif
+
+#if HAVE_SSE2
+filter8_1dfunction vp9_filter_block1d16_v8_sse2;
+filter8_1dfunction vp9_filter_block1d16_h8_sse2;
+filter8_1dfunction vp9_filter_block1d8_v8_sse2;
+filter8_1dfunction vp9_filter_block1d8_h8_sse2;
+filter8_1dfunction vp9_filter_block1d4_v8_sse2;
+filter8_1dfunction vp9_filter_block1d4_h8_sse2;
+filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
+
+void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ /* Ensure the filter can be compressed to int16_t. */
+ if (x_step_q4 == 16 && filter_x[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_h8_sse2(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+ while (w >= 8) {
+ vp9_filter_block1d8_h8_sse2(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ while (w >= 4) {
+ vp9_filter_block1d4_h8_sse2(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 4;
+ dst += 4;
+ w -= 4;
+ }
+ }
+ if (w) {
+ vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ }
+}
+
+void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (y_step_q4 == 16 && filter_y[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+ while (w >= 8) {
+ vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ while (w >= 4) {
+ vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 4;
+ dst += 4;
+ w -= 4;
+ }
+ }
+ if (w) {
+ vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ }
+}
+
+void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (x_step_q4 == 16 && filter_x[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_h8_avg_sse2(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+ while (w >= 8) {
+ vp9_filter_block1d8_h8_avg_sse2(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ while (w >= 4) {
+ vp9_filter_block1d4_h8_avg_sse2(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 4;
+ dst += 4;
+ w -= 4;
+ }
+ }
+ if (w) {
+ vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ }
+}
+
+void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (y_step_q4 == 16 && filter_y[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+ while (w >= 8) {
+ vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ while (w >= 4) {
+ vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 4;
+ dst += 4;
+ w -= 4;
+ }
+ }
+ if (w) {
+ vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ }
+}
+
+void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
+
+ assert(w <= 64);
+ assert(h <= 64);
+ if (x_step_q4 == 16 && y_step_q4 == 16) {
+ vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h + 7);
+ vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+ } else {
+ vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+ }
+}
+
+void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
+
+ assert(w <= 64);
+ assert(h <= 64);
+ if (x_step_q4 == 16 && y_step_q4 == 16) {
+ vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h + 7);
+ vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ } else {
+ vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+ }
+}
+#endif
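Note: the SSE2 wrappers added above all share one dispatch shape: consume the
width with the widest available column kernel (16, then 8, then 4 pixels) and
leave any remainder, or the cases where the step is not unit (x_step_q4 != 16)
or the filter's center tap is 128, to the C fallback. A condensed, purely
illustrative sketch of that shape; the real wrappers unroll it per width:

  typedef struct {
    int width;
    filter8_1dfunction *fn;
  } width_dispatch;

  static int convolve_tiled(const unsigned char *src, int src_stride,
                            unsigned char *dst, int dst_stride,
                            int w, int h, const short *filter,
                            const width_dispatch *table, int n) {
    int t;
    for (t = 0; t < n; ++t) {
      while (w >= table[t].width) {
        table[t].fn(src, src_stride, dst, dst_stride, h, filter);
        src += table[t].width;
        dst += table[t].width;
        w -= table[t].width;
      }
    }
    return w;  /* leftover width (0..3) goes to the C fallback */
  }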
diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index 8f740f4..ccf5aac 100644
--- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -15,7 +15,7 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
-void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
@@ -26,10 +26,10 @@ void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
__m128i input0, input1, input2, input3;
// Rows
- input0 = _mm_loadl_epi64((__m128i *)input);
- input1 = _mm_loadl_epi64((__m128i *)(input + 4));
- input2 = _mm_loadl_epi64((__m128i *)(input + 8));
- input3 = _mm_loadl_epi64((__m128i *)(input + 12));
+ input0 = _mm_loadl_epi64((const __m128i *)input);
+ input1 = _mm_loadl_epi64((const __m128i *)(input + 4));
+ input2 = _mm_loadl_epi64((const __m128i *)(input + 8));
+ input3 = _mm_loadl_epi64((const __m128i *)(input + 12));
// Construct i3, i1, i3, i1, i2, i0, i2, i0
input0 = _mm_shufflelo_epi16(input0, 0xd8);
@@ -148,7 +148,7 @@ void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE4X4(dest, input3);
}
-void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a;
@@ -165,41 +165,6 @@ void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE4X4(dest, dc_value);
}
-void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
- (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
- (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
- (int16_t)cospi_8_64, (int16_t)cospi_24_64);
- const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);
-
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i in, temp;
-
- // Load input data.
- in = _mm_loadl_epi64((__m128i *)input);
-
- // Construct i3, i1, i3, i1, i2, i0, i2, i0
- in = _mm_shufflelo_epi16(in, 0xd8);
- in = _mm_unpacklo_epi32(in, in);
-
- // Stage 1
- in = _mm_madd_epi16(in, c1);
- in = _mm_add_epi32(in, rounding);
- in = _mm_srai_epi32(in, DCT_CONST_BITS);
- in = _mm_packs_epi32(in, zero);
-
- // Stage 2
- temp = _mm_shufflelo_epi16(in, 0x9c);
- in = _mm_shufflelo_epi16(in, 0xc9);
- in = _mm_unpacklo_epi64(temp, in);
- in = _mm_madd_epi16(in, c2);
- in = _mm_packs_epi32(in, zero);
-
- // Store results
- _mm_storel_epi64((__m128i *)output, in);
-}
-
static INLINE void transpose_4x4(__m128i *res) {
const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
@@ -210,7 +175,7 @@ static INLINE void transpose_4x4(__m128i *res) {
res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}
-void idct4_1d_sse2(__m128i *in) {
+static void idct4_1d_sse2(__m128i *in) {
const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -249,7 +214,7 @@ void idct4_1d_sse2(__m128i *in) {
in[3] = _mm_sub_epi16(u[0], u[3]);
}
-void iadst4_1d_sse2(__m128i *in) {
+static void iadst4_1d_sse2(__m128i *in) {
const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
@@ -299,16 +264,16 @@ void iadst4_1d_sse2(__m128i *in) {
in[3] = _mm_unpackhi_epi64(in[1], in[1]);
}
-void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
- int tx_type) {
+void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
__m128i in[4];
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
- in[0] = _mm_loadl_epi64((__m128i *)input);
- in[1] = _mm_loadl_epi64((__m128i *)(input + 4));
- in[2] = _mm_loadl_epi64((__m128i *)(input + 8));
- in[3] = _mm_loadl_epi64((__m128i *)(input + 12));
+ in[0] = _mm_loadl_epi64((const __m128i *)input);
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));
switch (tx_type) {
case 0: // DCT_DCT
@@ -450,7 +415,7 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
res3 = _mm_packs_epi32(tmp6, tmp7); \
}
-#define IDCT8x8_1D \
+#define IDCT8_1D \
/* Stage1 */ \
{ \
const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
@@ -529,7 +494,7 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
dest += stride; \
}
-void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -549,23 +514,23 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
int i;
// Load input data.
- in0 = _mm_load_si128((__m128i *)input);
- in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
- in4 = _mm_load_si128((__m128i *)(input + 8 * 4));
- in5 = _mm_load_si128((__m128i *)(input + 8 * 5));
- in6 = _mm_load_si128((__m128i *)(input + 8 * 6));
- in7 = _mm_load_si128((__m128i *)(input + 8 * 7));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
// 2-D
for (i = 0; i < 2; i++) {
- // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()
+ // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in4, in5, in6, in7);
// 4-stage 1D idct8x8
- IDCT8x8_1D
+ IDCT8_1D
}
// Final rounding and shift
@@ -597,7 +562,7 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE(dest, in7);
}
-void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a;
@@ -648,7 +613,7 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}
-void idct8_1d_sse2(__m128i *in) {
+static void idct8_1d_sse2(__m128i *in) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
@@ -673,12 +638,12 @@ void idct8_1d_sse2(__m128i *in) {
in6 = in[6];
in7 = in[7];
- // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()
+ // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in4, in5, in6, in7);
// 4-stage 1D idct8x8
- IDCT8x8_1D
+ IDCT8_1D
in[0] = in0;
in[1] = in1;
in[2] = in2;
@@ -689,7 +654,7 @@ void idct8_1d_sse2(__m128i *in) {
in[7] = in7;
}
-void iadst8_1d_sse2(__m128i *in) {
+static void iadst8_1d_sse2(__m128i *in) {
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
@@ -918,21 +883,21 @@ void iadst8_1d_sse2(__m128i *in) {
}
-void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride,
- int tx_type) {
+void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
__m128i in[8];
const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1<<4);
// load input data
- in[0] = _mm_load_si128((__m128i *)input);
- in[1] = _mm_load_si128((__m128i *)(input + 8 * 1));
- in[2] = _mm_load_si128((__m128i *)(input + 8 * 2));
- in[3] = _mm_load_si128((__m128i *)(input + 8 * 3));
- in[4] = _mm_load_si128((__m128i *)(input + 8 * 4));
- in[5] = _mm_load_si128((__m128i *)(input + 8 * 5));
- in[6] = _mm_load_si128((__m128i *)(input + 8 * 6));
- in[7] = _mm_load_si128((__m128i *)(input + 8 * 7));
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
switch (tx_type) {
case 0: // DCT_DCT
@@ -985,7 +950,7 @@ void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride,
RECON_AND_STORE(dest, in[7]);
}
-void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -1005,16 +970,16 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
// Rows. Load 4-row input data.
- in0 = _mm_load_si128((__m128i *)input);
- in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
// 8x4 Transpose
TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)
// Stage1
- {
+ { //NOLINT
const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);
@@ -1039,7 +1004,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
}
// Stage2
- {
+ { //NOLINT
const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);
@@ -1069,7 +1034,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
}
// Stage3
- {
+ { //NOLINT
const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
@@ -1103,7 +1068,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
in4, in5, in6, in7)
// 1D idct8x8
- IDCT8x8_1D
+ IDCT8_1D
// Final rounding and shift
in0 = _mm_adds_epi16(in0, final_rounding);
@@ -1134,7 +1099,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE(dest, in7);
}
-#define IDCT16x16_1D \
+#define IDCT16_1D \
/* Stage2 */ \
{ \
const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
@@ -1263,7 +1228,8 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
stp2_10, stp2_13, stp2_11, stp2_12) \
}
-void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
@@ -1318,22 +1284,22 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
if (i == 1) input += 128;
// Load input data.
- in0 = _mm_load_si128((__m128i *)input);
- in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
- in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
- in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
- in4 = _mm_load_si128((__m128i *)(input + 8 * 8));
- in12 = _mm_load_si128((__m128i *)(input + 8 * 9));
- in5 = _mm_load_si128((__m128i *)(input + 8 * 10));
- in13 = _mm_load_si128((__m128i *)(input + 8 * 11));
- in6 = _mm_load_si128((__m128i *)(input + 8 * 12));
- in14 = _mm_load_si128((__m128i *)(input + 8 * 13));
- in7 = _mm_load_si128((__m128i *)(input + 8 * 14));
- in15 = _mm_load_si128((__m128i *)(input + 8 * 15));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+ in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
+ in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
+ in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
+ in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
+ in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
+ in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
+ in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
+ in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));
TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in4, in5, in6, in7);
@@ -1355,7 +1321,7 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
in12, in13, in14, in15);
}
- IDCT16x16_1D
+ IDCT16_1D
// Stage7
if (i == 0) {
@@ -1470,7 +1436,7 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
}
}
-void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a, i;
@@ -1519,7 +1485,7 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
res0[15] = tbuf[7];
}
-void iadst16_1d_8col(__m128i *in) {
+static void iadst16_1d_8col(__m128i *in) {
// perform 16x16 1-D ADST for 8 columns
__m128i s[16], x[16], u[32], v[32];
const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -1989,7 +1955,7 @@ void iadst16_1d_8col(__m128i *in) {
in[15] = _mm_sub_epi16(kZero, s[1]);
}
-void idct16_1d_8col(__m128i *in) {
+static void idct16_1d_8col(__m128i *in) {
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
@@ -2333,36 +2299,36 @@ void idct16_1d_8col(__m128i *in) {
in[15] = _mm_sub_epi16(s[0], s[15]);
}
-void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
idct16_1d_8col(in0);
idct16_1d_8col(in1);
}
-void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
iadst16_1d_8col(in0);
iadst16_1d_8col(in1);
}
-static INLINE void load_buffer_8x16(int16_t *input, __m128i *in) {
- in[0] = _mm_load_si128((__m128i *)(input + 0 * 16));
- in[1] = _mm_load_si128((__m128i *)(input + 1 * 16));
- in[2] = _mm_load_si128((__m128i *)(input + 2 * 16));
- in[3] = _mm_load_si128((__m128i *)(input + 3 * 16));
- in[4] = _mm_load_si128((__m128i *)(input + 4 * 16));
- in[5] = _mm_load_si128((__m128i *)(input + 5 * 16));
- in[6] = _mm_load_si128((__m128i *)(input + 6 * 16));
- in[7] = _mm_load_si128((__m128i *)(input + 7 * 16));
-
- in[8] = _mm_load_si128((__m128i *)(input + 8 * 16));
- in[9] = _mm_load_si128((__m128i *)(input + 9 * 16));
- in[10] = _mm_load_si128((__m128i *)(input + 10 * 16));
- in[11] = _mm_load_si128((__m128i *)(input + 11 * 16));
- in[12] = _mm_load_si128((__m128i *)(input + 12 * 16));
- in[13] = _mm_load_si128((__m128i *)(input + 13 * 16));
- in[14] = _mm_load_si128((__m128i *)(input + 14 * 16));
- in[15] = _mm_load_si128((__m128i *)(input + 15 * 16));
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
+
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
+ in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
+ in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
+ in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
+ in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
+ in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
+ in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
+ in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
}
static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
@@ -2421,8 +2387,8 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
RECON_AND_STORE(dest, in[15]);
}
-void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride,
- int tx_type) {
+void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
__m128i in0[16], in1[16];
load_buffer_8x16(input, in0);
@@ -2456,8 +2422,8 @@ void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride,
write_buffer_8x16(dest, in1, stride);
}
-void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
- int stride) {
+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
@@ -2503,14 +2469,14 @@ void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
// 1-D idct. Load input data.
- in0 = _mm_load_si128((__m128i *)input);
- in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
- in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
- in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
@@ -2737,7 +2703,7 @@ void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;
- IDCT16x16_1D
+ IDCT16_1D
// Stage7
in0 = _mm_add_epi16(stp2_0, stp1_15);
@@ -2815,11 +2781,704 @@ void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
#define LOAD_DQCOEFF(reg, input) \
{ \
- reg = _mm_load_si128((__m128i *) input); \
+ reg = _mm_load_si128((const __m128i *) input); \
input += 8; \
} \
-void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+#define IDCT32_1D \
+/* Stage1 */ \
+{ \
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \
+ const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \
+ const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \
+ \
+ const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \
+ const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \
+  const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \
+ const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \
+ const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \
+ \
+ const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \
+ const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
+ stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
+ stp1_17, stp1_30) \
+ MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
+ stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
+ stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
+ stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+} \
+\
+/* Stage2 */ \
+{ \
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \
+ const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \
+ const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \
+ \
+ const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \
+ const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
+ stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
+ stp2_14) \
+ MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
+ stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
+ stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
+ stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
+ stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
+ \
+ stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
+ stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
+ stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
+ stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
+ \
+ stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
+ stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
+ stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
+} \
+\
+/* Stage3 */ \
+{ \
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \
+ const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \
+ const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \
+ \
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ \
+ MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
+ stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
+ stp1_6) \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+ stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+ \
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+ stp1_18, stp1_29) \
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+ stp1_22, stp1_25) \
+ \
+ stp1_16 = stp2_16; \
+ stp1_31 = stp2_31; \
+ stp1_19 = stp2_19; \
+ stp1_20 = stp2_20; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_27 = stp2_27; \
+ stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \
+ const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \
+ const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
+ stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
+ stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+ stp2_10, stp2_13) \
+ \
+ stp2_8 = stp1_8; \
+ stp2_15 = stp1_15; \
+ stp2_11 = stp1_11; \
+ stp2_12 = stp1_12; \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_4 = stp2_4; \
+ stp1_7 = stp2_7; \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ \
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ \
+ stp1_22 = stp2_22; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_25 = stp2_25; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+ \
+ stp2_8 = stp1_8; \
+ stp2_9 = stp1_9; \
+ stp2_14 = stp1_14; \
+ stp2_15 = stp1_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+ stp2_13, stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+ \
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ stp1_18 = stp2_18; \
+ stp1_19 = stp2_19; \
+ \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+ \
+ stp1_28 = stp2_28; \
+ stp1_29 = stp2_29; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+}
+
+// Only upper-left 8x8 has non-zero coeff
+void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1<<5);
+
+ // idct constants for each stage
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
+ in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
+ in24, in25, in26, in27, in28, in29, in30, in31;
+ __m128i col[128];
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+ stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+ stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+ stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+ stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i, j, i32;
+
+  // We work on an 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
+ for (i = 0; i < 8; i++) {
+ i32 = (i << 5);
+ if (i == 0) {
+ // First 1-D idct: first 8 rows
+ // Load input data.
+ LOAD_DQCOEFF(in0, input);
+ LOAD_DQCOEFF(in8, input);
+ LOAD_DQCOEFF(in16, input);
+ LOAD_DQCOEFF(in24, input);
+ LOAD_DQCOEFF(in1, input);
+ LOAD_DQCOEFF(in9, input);
+ LOAD_DQCOEFF(in17, input);
+ LOAD_DQCOEFF(in25, input);
+ LOAD_DQCOEFF(in2, input);
+ LOAD_DQCOEFF(in10, input);
+ LOAD_DQCOEFF(in18, input);
+ LOAD_DQCOEFF(in26, input);
+ LOAD_DQCOEFF(in3, input);
+ LOAD_DQCOEFF(in11, input);
+ LOAD_DQCOEFF(in19, input);
+ LOAD_DQCOEFF(in27, input);
+
+ LOAD_DQCOEFF(in4, input);
+ LOAD_DQCOEFF(in12, input);
+ LOAD_DQCOEFF(in20, input);
+ LOAD_DQCOEFF(in28, input);
+ LOAD_DQCOEFF(in5, input);
+ LOAD_DQCOEFF(in13, input);
+ LOAD_DQCOEFF(in21, input);
+ LOAD_DQCOEFF(in29, input);
+ LOAD_DQCOEFF(in6, input);
+ LOAD_DQCOEFF(in14, input);
+ LOAD_DQCOEFF(in22, input);
+ LOAD_DQCOEFF(in30, input);
+ LOAD_DQCOEFF(in7, input);
+ LOAD_DQCOEFF(in15, input);
+ LOAD_DQCOEFF(in23, input);
+ LOAD_DQCOEFF(in31, input);
+
+ // Transpose 32x8 block to 8x32 block
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
+ in18, in19, in20, in21, in22, in23);
+ TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
+ in26, in27, in28, in29, in30, in31);
+ } else if (i < 4) {
+ // First 1-D idct: next 24 zero-coeff rows
+ col[i32 + 0] = _mm_setzero_si128();
+ col[i32 + 1] = _mm_setzero_si128();
+ col[i32 + 2] = _mm_setzero_si128();
+ col[i32 + 3] = _mm_setzero_si128();
+ col[i32 + 4] = _mm_setzero_si128();
+ col[i32 + 5] = _mm_setzero_si128();
+ col[i32 + 6] = _mm_setzero_si128();
+ col[i32 + 7] = _mm_setzero_si128();
+ col[i32 + 8] = _mm_setzero_si128();
+ col[i32 + 9] = _mm_setzero_si128();
+ col[i32 + 10] = _mm_setzero_si128();
+ col[i32 + 11] = _mm_setzero_si128();
+ col[i32 + 12] = _mm_setzero_si128();
+ col[i32 + 13] = _mm_setzero_si128();
+ col[i32 + 14] = _mm_setzero_si128();
+ col[i32 + 15] = _mm_setzero_si128();
+ col[i32 + 16] = _mm_setzero_si128();
+ col[i32 + 17] = _mm_setzero_si128();
+ col[i32 + 18] = _mm_setzero_si128();
+ col[i32 + 19] = _mm_setzero_si128();
+ col[i32 + 20] = _mm_setzero_si128();
+ col[i32 + 21] = _mm_setzero_si128();
+ col[i32 + 22] = _mm_setzero_si128();
+ col[i32 + 23] = _mm_setzero_si128();
+ col[i32 + 24] = _mm_setzero_si128();
+ col[i32 + 25] = _mm_setzero_si128();
+ col[i32 + 26] = _mm_setzero_si128();
+ col[i32 + 27] = _mm_setzero_si128();
+ col[i32 + 28] = _mm_setzero_si128();
+ col[i32 + 29] = _mm_setzero_si128();
+ col[i32 + 30] = _mm_setzero_si128();
+ col[i32 + 31] = _mm_setzero_si128();
+ continue;
+ } else {
+ // Second 1-D idct
+ j = i - 4;
+
+ // Transpose 32x8 block to 8x32 block
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
+ in11, in12, in13, in14, in15);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
+ in19, in20, in21, in22, in23);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
+ in28, in29, in30, in31);
+ }
+
+ IDCT32_1D
+
+ // final stage
+ if (i < 4) {
+ // 1_D: Store 32 intermediate results for each 8x32 block.
+ col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ } else {
+ const __m128i zero = _mm_setzero_si128();
+
+ // 2-D: Calculate the results and store them to the destination.
+ in0 = _mm_add_epi16(stp1_0, stp1_31);
+ in1 = _mm_add_epi16(stp1_1, stp1_30);
+ in2 = _mm_add_epi16(stp1_2, stp1_29);
+ in3 = _mm_add_epi16(stp1_3, stp1_28);
+ in4 = _mm_add_epi16(stp1_4, stp1_27);
+ in5 = _mm_add_epi16(stp1_5, stp1_26);
+ in6 = _mm_add_epi16(stp1_6, stp1_25);
+ in7 = _mm_add_epi16(stp1_7, stp1_24);
+ in8 = _mm_add_epi16(stp1_8, stp1_23);
+ in9 = _mm_add_epi16(stp1_9, stp1_22);
+ in10 = _mm_add_epi16(stp1_10, stp1_21);
+ in11 = _mm_add_epi16(stp1_11, stp1_20);
+ in12 = _mm_add_epi16(stp1_12, stp1_19);
+ in13 = _mm_add_epi16(stp1_13, stp1_18);
+ in14 = _mm_add_epi16(stp1_14, stp1_17);
+ in15 = _mm_add_epi16(stp1_15, stp1_16);
+ in16 = _mm_sub_epi16(stp1_15, stp1_16);
+ in17 = _mm_sub_epi16(stp1_14, stp1_17);
+ in18 = _mm_sub_epi16(stp1_13, stp1_18);
+ in19 = _mm_sub_epi16(stp1_12, stp1_19);
+ in20 = _mm_sub_epi16(stp1_11, stp1_20);
+ in21 = _mm_sub_epi16(stp1_10, stp1_21);
+ in22 = _mm_sub_epi16(stp1_9, stp1_22);
+ in23 = _mm_sub_epi16(stp1_8, stp1_23);
+ in24 = _mm_sub_epi16(stp1_7, stp1_24);
+ in25 = _mm_sub_epi16(stp1_6, stp1_25);
+ in26 = _mm_sub_epi16(stp1_5, stp1_26);
+ in27 = _mm_sub_epi16(stp1_4, stp1_27);
+ in28 = _mm_sub_epi16(stp1_3, stp1_28);
+ in29 = _mm_sub_epi16(stp1_2, stp1_29);
+ in30 = _mm_sub_epi16(stp1_1, stp1_30);
+ in31 = _mm_sub_epi16(stp1_0, stp1_31);
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+ in8 = _mm_adds_epi16(in8, final_rounding);
+ in9 = _mm_adds_epi16(in9, final_rounding);
+ in10 = _mm_adds_epi16(in10, final_rounding);
+ in11 = _mm_adds_epi16(in11, final_rounding);
+ in12 = _mm_adds_epi16(in12, final_rounding);
+ in13 = _mm_adds_epi16(in13, final_rounding);
+ in14 = _mm_adds_epi16(in14, final_rounding);
+ in15 = _mm_adds_epi16(in15, final_rounding);
+ in16 = _mm_adds_epi16(in16, final_rounding);
+ in17 = _mm_adds_epi16(in17, final_rounding);
+ in18 = _mm_adds_epi16(in18, final_rounding);
+ in19 = _mm_adds_epi16(in19, final_rounding);
+ in20 = _mm_adds_epi16(in20, final_rounding);
+ in21 = _mm_adds_epi16(in21, final_rounding);
+ in22 = _mm_adds_epi16(in22, final_rounding);
+ in23 = _mm_adds_epi16(in23, final_rounding);
+ in24 = _mm_adds_epi16(in24, final_rounding);
+ in25 = _mm_adds_epi16(in25, final_rounding);
+ in26 = _mm_adds_epi16(in26, final_rounding);
+ in27 = _mm_adds_epi16(in27, final_rounding);
+ in28 = _mm_adds_epi16(in28, final_rounding);
+ in29 = _mm_adds_epi16(in29, final_rounding);
+ in30 = _mm_adds_epi16(in30, final_rounding);
+ in31 = _mm_adds_epi16(in31, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 6);
+ in1 = _mm_srai_epi16(in1, 6);
+ in2 = _mm_srai_epi16(in2, 6);
+ in3 = _mm_srai_epi16(in3, 6);
+ in4 = _mm_srai_epi16(in4, 6);
+ in5 = _mm_srai_epi16(in5, 6);
+ in6 = _mm_srai_epi16(in6, 6);
+ in7 = _mm_srai_epi16(in7, 6);
+ in8 = _mm_srai_epi16(in8, 6);
+ in9 = _mm_srai_epi16(in9, 6);
+ in10 = _mm_srai_epi16(in10, 6);
+ in11 = _mm_srai_epi16(in11, 6);
+ in12 = _mm_srai_epi16(in12, 6);
+ in13 = _mm_srai_epi16(in13, 6);
+ in14 = _mm_srai_epi16(in14, 6);
+ in15 = _mm_srai_epi16(in15, 6);
+ in16 = _mm_srai_epi16(in16, 6);
+ in17 = _mm_srai_epi16(in17, 6);
+ in18 = _mm_srai_epi16(in18, 6);
+ in19 = _mm_srai_epi16(in19, 6);
+ in20 = _mm_srai_epi16(in20, 6);
+ in21 = _mm_srai_epi16(in21, 6);
+ in22 = _mm_srai_epi16(in22, 6);
+ in23 = _mm_srai_epi16(in23, 6);
+ in24 = _mm_srai_epi16(in24, 6);
+ in25 = _mm_srai_epi16(in25, 6);
+ in26 = _mm_srai_epi16(in26, 6);
+ in27 = _mm_srai_epi16(in27, 6);
+ in28 = _mm_srai_epi16(in28, 6);
+ in29 = _mm_srai_epi16(in29, 6);
+ in30 = _mm_srai_epi16(in30, 6);
+ in31 = _mm_srai_epi16(in31, 6);
+
+ RECON_AND_STORE(dest, in0);
+ RECON_AND_STORE(dest, in1);
+ RECON_AND_STORE(dest, in2);
+ RECON_AND_STORE(dest, in3);
+ RECON_AND_STORE(dest, in4);
+ RECON_AND_STORE(dest, in5);
+ RECON_AND_STORE(dest, in6);
+ RECON_AND_STORE(dest, in7);
+ RECON_AND_STORE(dest, in8);
+ RECON_AND_STORE(dest, in9);
+ RECON_AND_STORE(dest, in10);
+ RECON_AND_STORE(dest, in11);
+ RECON_AND_STORE(dest, in12);
+ RECON_AND_STORE(dest, in13);
+ RECON_AND_STORE(dest, in14);
+ RECON_AND_STORE(dest, in15);
+ RECON_AND_STORE(dest, in16);
+ RECON_AND_STORE(dest, in17);
+ RECON_AND_STORE(dest, in18);
+ RECON_AND_STORE(dest, in19);
+ RECON_AND_STORE(dest, in20);
+ RECON_AND_STORE(dest, in21);
+ RECON_AND_STORE(dest, in22);
+ RECON_AND_STORE(dest, in23);
+ RECON_AND_STORE(dest, in24);
+ RECON_AND_STORE(dest, in25);
+ RECON_AND_STORE(dest, in26);
+ RECON_AND_STORE(dest, in27);
+ RECON_AND_STORE(dest, in28);
+ RECON_AND_STORE(dest, in29);
+ RECON_AND_STORE(dest, in30);
+ RECON_AND_STORE(dest, in31);
+
+ dest += 8 - (stride * 32);
+ }
+ }
+}
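
The final-stage arithmetic above is plain round-to-nearest by 64: after the second 1-D pass the results carry a scale factor of 64, so each 16-bit lane gets 1 << 5 added (with saturation) and is then arithmetic-shifted right by 6 before RECON_AND_STORE adds it to the prediction. A scalar sketch of the per-lane step (the vector code applies it to eight lanes per register via _mm_adds_epi16/_mm_srai_epi16):

    /* Scalar equivalent of the final rounding/shift: round(x / 64).
     * (The SIMD version uses a saturating add; plain int is fine here.) */
    static int round_shift_6(int x) {
      return (x + (1 << 5)) >> 6;
    }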
+
+void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
@@ -3042,336 +3701,7 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
in28, in29, in30, in31);
}
- // Stage1
- {
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31);
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31);
- const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15);
- const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15);
-
- const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23);
- const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23);
- const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7);
- const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7);
-
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27);
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27);
- const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11);
- const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11);
-
- const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19);
- const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19);
- const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3);
- const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3);
-
- MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,
- stg1_1, stg1_2, stg1_3, stp1_16, stp1_31,
- stp1_17, stp1_30)
- MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4,
- stg1_5, stg1_6, stg1_7, stp1_18, stp1_29,
- stp1_19, stp1_28)
- MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,
- stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,
- stp1_21, stp1_26)
- MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,
- stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,
- stp1_23, stp1_24)
- }
-
- // Stage2
- {
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30);
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30);
- const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14);
- const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14);
-
- const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22);
- const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22);
- const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6);
- const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6);
-
- MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,
- stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,
- stp2_14)
- MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,
- stg2_5, stg2_6, stg2_7, stp2_10, stp2_13,
- stp2_11, stp2_12)
-
- stp2_16 = _mm_add_epi16(stp1_16, stp1_17);
- stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);
- stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);
- stp2_19 = _mm_add_epi16(stp1_19, stp1_18);
-
- stp2_20 = _mm_add_epi16(stp1_20, stp1_21);
- stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);
- stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);
- stp2_23 = _mm_add_epi16(stp1_23, stp1_22);
-
- stp2_24 = _mm_add_epi16(stp1_24, stp1_25);
- stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);
- stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);
- stp2_27 = _mm_add_epi16(stp1_27, stp1_26);
-
- stp2_28 = _mm_add_epi16(stp1_28, stp1_29);
- stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);
- stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);
- stp2_31 = _mm_add_epi16(stp1_31, stp1_30);
- }
-
- // Stage3
- {
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28);
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28);
- const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12);
- const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12);
-
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
-
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
-
- MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,
- stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,
- stp1_6)
-
- stp1_8 = _mm_add_epi16(stp2_8, stp2_9);
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
- stp1_12 = _mm_add_epi16(stp2_12, stp2_13);
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
-
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30,
- stp1_18, stp1_29)
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26,
- stp1_22, stp1_25)
-
- stp1_16 = stp2_16;
- stp1_31 = stp2_31;
- stp1_19 = stp2_19;
- stp1_20 = stp2_20;
- stp1_23 = stp2_23;
- stp1_24 = stp2_24;
- stp1_27 = stp2_27;
- stp1_28 = stp2_28;
- }
-
- // Stage4
- {
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16);
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16);
- const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24);
- const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24);
-
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
-
- MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0,
- stg4_1, stg4_2, stg4_3, stp2_0, stp2_1,
- stp2_2, stp2_3)
-
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14,
- stp2_10, stp2_13)
-
- stp2_8 = stp1_8;
- stp2_15 = stp1_15;
- stp2_11 = stp1_11;
- stp2_12 = stp1_12;
-
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19);
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18);
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21);
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20);
-
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27);
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26);
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30);
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31);
- }
-
- // Stage5
- {
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
-
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
-
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
-
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-
- stp1_5 = _mm_packs_epi32(tmp0, tmp1);
- stp1_6 = _mm_packs_epi32(tmp2, tmp3);
-
- stp1_4 = stp2_4;
- stp1_7 = stp2_7;
-
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11);
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12);
-
- stp1_16 = stp2_16;
- stp1_17 = stp2_17;
-
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29,
- stp1_19, stp1_28)
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27,
- stp1_21, stp1_26)
-
- stp1_22 = stp2_22;
- stp1_23 = stp2_23;
- stp1_24 = stp2_24;
- stp1_25 = stp2_25;
- stp1_30 = stp2_30;
- stp1_31 = stp2_31;
- }
-
- // Stage6
- {
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);
-
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7);
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4);
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);
-
- stp2_8 = stp1_8;
- stp2_9 = stp1_9;
- stp2_14 = stp1_14;
- stp2_15 = stp1_15;
-
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12,
- stg6_0, stg4_0, stg6_0, stg4_0, stp2_10,
- stp2_13, stp2_11, stp2_12)
-
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23);
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22);
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21);
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20);
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);
-
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28);
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29);
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30);
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31);
- }
-
- // Stage7
- {
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
-
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);
-
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14);
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13);
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12);
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11);
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10);
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9);
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8);
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);
-
- stp1_16 = stp2_16;
- stp1_17 = stp2_17;
- stp1_18 = stp2_18;
- stp1_19 = stp2_19;
-
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27,
- stp1_21, stp1_26)
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25,
- stp1_23, stp1_24)
-
- stp1_28 = stp2_28;
- stp1_29 = stp2_29;
- stp1_30 = stp2_30;
- stp1_31 = stp2_31;
- }
+ IDCT32_1D
// final stage
if (i < 4) {
@@ -3548,4 +3878,52 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
dest += 8 - (stride * 32);
}
}
+}  // NOLINT
+
+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a, i;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 6);
+
+ dc_value = _mm_set1_epi16(a);
+
+ for (i = 0; i < 4; ++i) {
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ dest += 8 - (stride * 32);
+ }
}
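
The DC-only variant above skips the transform entirely: a DC-only 32x32 block inverse-transforms to a constant, so a single value is derived from input[0] (one multiplication by cospi_16_64 per 1-D pass, then the same round-by-64) and added to all 1024 destination pixels. A scalar sketch of the same flow, assuming dct_const_round_shift(), ROUND_POWER_OF_TWO() and clip_pixel() behave as declared in vp9_idct.h and vp9_common.h:

    /* Scalar sketch of the DC-only path (helper semantics assumed above). */
    static void idct32x32_1_add_sketch(const int16_t *input, uint8_t *dest,
                                       int stride) {
      int r, c;
      int a = dct_const_round_shift(input[0] * cospi_16_64);  /* 1st 1-D pass */
      a = dct_const_round_shift(a * cospi_16_64);             /* 2nd 1-D pass */
      a = ROUND_POWER_OF_TWO(a, 6);                           /* round by 64 */
      for (r = 0; r < 32; ++r, dest += stride)
        for (c = 0; c < 32; ++c)
          dest[c] = clip_pixel(dest[c] + a);
    }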
diff --git a/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm b/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
index 980b8b9..69b07f6 100644
--- a/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
+++ b/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
@@ -19,12 +19,14 @@ pw_32: times 8 dw 32
SECTION .text
INIT_MMX sse
-cglobal dc_predictor_4x4, 4, 4, 2, dst, stride, above, left
+cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
pxor m1, m1
movd m0, [aboveq]
punpckldq m0, [leftq]
psadbw m0, m1
- paddw m0, [pw_4]
+ paddw m0, [GLOBAL(pw_4)]
psraw m0, 3
pshufw m0, m0, 0x0
packuswb m0, m0
@@ -33,10 +35,14 @@ cglobal dc_predictor_4x4, 4, 4, 2, dst, stride, above, left
lea dstq, [dstq+strideq*2]
movd [dstq ], m0
movd [dstq+strideq], m0
+
+ RESTORE_GOT
RET
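
A note on the pattern repeated throughout this file: GET_GOT materializes the address of the global offset table in the new goffset register (hence each cglobal declaration gains one register and a goffset argument), GLOBAL() turns the absolute references to constants such as pw_4 into GOT-relative ones, and RESTORE_GOT undoes the setup before returning. On non-PIC builds these macros should expand to nothing, so the change only affects position-independent (shared-library) builds.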
INIT_MMX sse
-cglobal dc_predictor_8x8, 4, 4, 3, dst, stride, above, left
+cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
pxor m1, m1
movq m0, [aboveq]
movq m2, [leftq]
@@ -45,7 +51,7 @@ cglobal dc_predictor_8x8, 4, 4, 3, dst, stride, above, left
psadbw m0, m1
psadbw m2, m1
paddw m0, m2
- paddw m0, [pw_8]
+ paddw m0, [GLOBAL(pw_8)]
psraw m0, 4
pshufw m0, m0, 0x0
packuswb m0, m0
@@ -58,10 +64,14 @@ cglobal dc_predictor_8x8, 4, 4, 3, dst, stride, above, left
movq [dstq+strideq ], m0
movq [dstq+strideq*2], m0
movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
RET
INIT_XMM sse2
-cglobal dc_predictor_16x16, 4, 4, 3, dst, stride, above, left
+cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
pxor m1, m1
mova m0, [aboveq]
mova m2, [leftq]
@@ -73,7 +83,7 @@ cglobal dc_predictor_16x16, 4, 4, 3, dst, stride, above, left
paddw m0, m2
movhlps m2, m0
paddw m0, m2
- paddw m0, [pw_16]
+ paddw m0, [GLOBAL(pw_16)]
psraw m0, 5
pshuflw m0, m0, 0x0
punpcklqdq m0, m0
@@ -86,10 +96,14 @@ cglobal dc_predictor_16x16, 4, 4, 3, dst, stride, above, left
lea dstq, [dstq+strideq*4]
dec lines4d
jnz .loop
+
+ RESTORE_GOT
REP_RET
INIT_XMM sse2
-cglobal dc_predictor_32x32, 4, 4, 5, dst, stride, above, left
+cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
pxor m1, m1
mova m0, [aboveq]
mova m2, [aboveq+16]
@@ -107,7 +121,7 @@ cglobal dc_predictor_32x32, 4, 4, 5, dst, stride, above, left
paddw m0, m4
movhlps m2, m0
paddw m0, m2
- paddw m0, [pw_32]
+ paddw m0, [GLOBAL(pw_32)]
psraw m0, 6
pshuflw m0, m0, 0x0
punpcklqdq m0, m0
@@ -124,6 +138,8 @@ cglobal dc_predictor_32x32, 4, 4, 5, dst, stride, above, left
lea dstq, [dstq+strideq*4]
dec lines4d
jnz .loop
+
+ RESTORE_GOT
REP_RET
INIT_MMX sse
diff --git a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
index 8ba26f3..88df9b2 100644
--- a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
+++ b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
@@ -13,27 +13,23 @@
SECTION_RODATA
pb_1: times 16 db 1
-pw_2: times 8 dw 2
-pb_7m1: times 8 db 7, -1
-pb_15: times 16 db 15
-
-sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7
-sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7
+sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b2w01234577: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 7, -1, 7, -1
-sh_b2w12345677: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1
-sh_b2w23456777: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1, 7, -1
-sh_b2w01234567: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1
-sh_b2w12345678: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1
-sh_b2w23456789: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1
-sh_b2w89abcdef: db 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1
-sh_b2w9abcdeff: db 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1
-sh_b2wabcdefff: db 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1, 15, -1
sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
+sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
SECTION .text
@@ -112,14 +108,16 @@ cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
REP_RET
INIT_MMX ssse3
-cglobal d45_predictor_4x4, 3, 3, 4, dst, stride, above
+cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
+ GET_GOT goffsetq
+
movq m0, [aboveq]
- pshufb m2, m0, [sh_b23456777]
- pshufb m1, m0, [sh_b01234577]
- pshufb m0, [sh_b12345677]
+ pshufb m2, m0, [GLOBAL(sh_b23456777)]
+ pshufb m1, m0, [GLOBAL(sh_b01234577)]
+ pshufb m0, [GLOBAL(sh_b12345677)]
pavgb m3, m2, m1
pxor m2, m1
- pand m2, [pb_1]
+ pand m2, [GLOBAL(pb_1)]
psubb m3, m2
pavgb m0, m3
@@ -132,19 +130,23 @@ cglobal d45_predictor_4x4, 3, 3, 4, dst, stride, above
movd [dstq ], m0
psrlq m0, 8
movd [dstq+strideq], m0
+
+ RESTORE_GOT
RET
INIT_MMX ssse3
-cglobal d45_predictor_8x8, 3, 3, 4, dst, stride, above
+cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
+ GET_GOT goffsetq
+
movq m0, [aboveq]
- mova m1, [sh_b12345677]
- DEFINE_ARGS dst, stride, stride3, line
+ mova m1, [GLOBAL(sh_b12345677)]
+ DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
- pshufb m2, m0, [sh_b23456777]
+ pshufb m2, m0, [GLOBAL(sh_b23456777)]
pavgb m3, m2, m0
pxor m2, m0
pshufb m0, m1
- pand m2, [pb_1]
+ pand m2, [GLOBAL(pb_1)]
psubb m3, m2
pavgb m0, m3
@@ -167,20 +169,24 @@ cglobal d45_predictor_8x8, 3, 3, 4, dst, stride, above
movq [dstq+strideq*2], m0
pshufb m0, m1
movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
RET
INIT_XMM ssse3
-cglobal d45_predictor_16x16, 3, 5, 4, dst, stride, above, dst8, line
+cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
+ GET_GOT goffsetq
+
mova m0, [aboveq]
DEFINE_ARGS dst, stride, stride3, dst8, line
lea stride3q, [strideq*3]
lea dst8q, [dstq+strideq*8]
- mova m1, [sh_b123456789abcdeff]
- pshufb m2, m0, [sh_b23456789abcdefff]
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
pavgb m3, m2, m0
pxor m2, m0
pshufb m0, m1
- pand m2, [pb_1]
+ pand m2, [GLOBAL(pb_1)]
psubb m3, m2
pavgb m0, m3
@@ -214,29 +220,33 @@ cglobal d45_predictor_16x16, 3, 5, 4, dst, stride, above, dst8, line
movhps [dstq+strideq +8], m0
movhps [dstq+strideq*2+8], m0
movhps [dstq+stride3q +8], m0
+
+ RESTORE_GOT
RET
INIT_XMM ssse3
-cglobal d45_predictor_32x32, 3, 5, 7, dst, stride, above, dst16, line
+cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
+ GET_GOT goffsetq
+
mova m0, [aboveq]
mova m4, [aboveq+16]
DEFINE_ARGS dst, stride, stride3, dst16, line
lea stride3q, [strideq*3]
lea dst16q, [dstq +strideq*8]
lea dst16q, [dst16q+strideq*8]
- mova m1, [sh_b123456789abcdeff]
- pshufb m2, m4, [sh_b23456789abcdefff]
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)]
pavgb m3, m2, m4
pxor m2, m4
palignr m5, m4, m0, 1
palignr m6, m4, m0, 2
pshufb m4, m1
- pand m2, [pb_1]
+ pand m2, [GLOBAL(pb_1)]
psubb m3, m2
pavgb m4, m3
pavgb m3, m0, m6
pxor m0, m6
- pand m0, [pb_1]
+ pand m0, [GLOBAL(pb_1)]
psubb m3, m0
pavgb m5, m3
@@ -288,4 +298,739 @@ cglobal d45_predictor_32x32, 3, 5, 7, dst, stride, above, dst16, line
mova [dstq +strideq +16], m4
mova [dstq +strideq*2+16], m4
mova [dstq +stride3q +16], m4
+
+ RESTORE_GOT
+ RET
+
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from Pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+ pavgb %4, %1, %3
+ pxor %3, %1
+ pand %3, [GLOBAL(pb_1)]
+ psubb %4, %3
+ pavgb %4, %2
+%endmacro
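
The identity the macro relies on can be checked exhaustively in scalar code. A minimal sketch, with avg() modeling pavgb's (a + b + 1) >> 1:

    /* Checks: (x + 2*y + z + 2) >> 2 == avg(avg(x, z) - ((x ^ z) & 1), y)
     * for all byte triples; subtracting the xor's low bit turns the
     * rounded average of x and z into the truncated one. */
    #include <assert.h>

    static int avg(int a, int b) { return (a + b + 1) >> 1; }

    static void check_avg_trick(void) {
      int x, y, z;
      for (x = 0; x < 256; ++x)
        for (y = 0; y < 256; ++y)
          for (z = 0; z < 256; ++z)
            assert(avg(avg(x, z) - ((x ^ z) & 1), y) ==
                   ((x + 2 * y + z + 2) >> 2));
    }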
+
+INIT_XMM ssse3
+cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
+ GET_GOT goffsetq
+
+ movq m3, [aboveq]
+ pshufb m1, m3, [GLOBAL(sh_b23456777)]
+ pshufb m2, m3, [GLOBAL(sh_b12345677)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
+ pavgb m3, m2
+
+ ; store 4 lines
+ movd [dstq ], m3
+ movd [dstq+strideq], m4
+ lea dstq, [dstq+strideq*2]
+ psrldq m3, 1
+ psrldq m4, 1
+ movd [dstq ], m3
+ movd [dstq+strideq], m4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
+ GET_GOT goffsetq
+
+ movq m3, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufb m1, m3, [GLOBAL(sh_b2345677777777777)]
+ pshufb m0, m3, [GLOBAL(sh_b0123456777777777)]
+ pshufb m2, m3, [GLOBAL(sh_b1234567777777777)]
+ pshufb m3, [GLOBAL(sh_b0123456777777777)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4
+ pavgb m3, m2
+
+ ; store 4 lines
+ movq [dstq ], m3
+ movq [dstq+strideq], m4
+ psrldq m3, 1
+ psrldq m4, 1
+ movq [dstq+strideq*2], m3
+ movq [dstq+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ psrldq m3, 1
+ psrldq m4, 1
+
+ ; store 4 lines
+ movq [dstq ], m3
+ movq [dstq+strideq], m4
+ psrldq m3, 1
+ psrldq m4, 1
+ movq [dstq+strideq*2], m3
+ movq [dstq+stride3q ], m4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, line
+ lea stride3q, [strideq*3]
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
+ pshufb m3, m0, m1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4
+ pavgb m0, m3
+
+ mov lined, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m4
+ pshufb m0, m1
+ pshufb m4, m1
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m4
+ pshufb m0, m1
+ pshufb m4, m1
+ lea dstq, [dstq+strideq*4]
+ dec lined
+ jnz .loop
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ mova m7, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, line
+ mova m1, [GLOBAL(sh_b123456789abcdeff)]
+ lea stride3q, [strideq*3]
+ pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)]
+ pshufb m3, m7, m1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4
+ palignr m6, m7, m0, 1
+ palignr m5, m7, m0, 2
+ pavgb m7, m3
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2
+ pavgb m0, m6
+
+ mov lined, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m7
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq +16], m4
+ palignr m3, m7, m0, 1
+ palignr m5, m4, m2, 1
+ pshufb m7, m1
+ pshufb m4, m1
+
+ mova [dstq+strideq*2 ], m3
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m5
+ mova [dstq+stride3q +16], m4
+ palignr m0, m7, m3, 1
+ palignr m2, m4, m5, 1
+ pshufb m7, m1
+ pshufb m4, m1
+ lea dstq, [dstq+strideq*4]
+ dec lined
+ jnz .loop
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ movd m0, [leftq] ; l1, l2, l3, l4
+ movd m1, [aboveq-1] ; tl, t1, t2, t3
+ punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3
+ pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
+ psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3
+ psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1
+ ; A2 B2 A1 B1
+ ; A3 B3 A2 B2
+ ; A4 B4 A3 B3
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1
+ pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1
+
+ punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+stride3q ], m3
+ psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+strideq*2], m3
+ psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+strideq ], m3
+ psrldq m3, 2 ; A1 B1 C1 D1 ..
+ movd [dstq ], m3
+ RESTORE_GOT
+ RET
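
Mapping the A/B labels in the comments back to scalars: column A is a 2-tap average running down the left edge, column B the corresponding 3-tap average, the rest of the top row is built from 3-tap averages of the above row, and every later row is the row above it shifted right by two. A scalar sketch of the 4x4 case, assuming AVG2(a, b) = (a + b + 1) >> 1, AVG3(a, b, c) = (a + 2 * b + c + 2) >> 2, and above[-1] being the top-left pixel:

    static void d153_4x4_sketch(uint8_t *dst, int stride,
                                const uint8_t *above, const uint8_t *left) {
      int r, c;
      dst[0] = AVG2(above[-1], left[0]);                   /* column A */
      for (r = 1; r < 4; ++r)
        dst[r * stride] = AVG2(left[r - 1], left[r]);
      dst[1] = AVG3(left[0], above[-1], above[0]);         /* column B */
      dst[stride + 1] = AVG3(above[-1], left[0], left[1]);
      for (r = 2; r < 4; ++r)
        dst[r * stride + 1] = AVG3(left[r - 2], left[r - 1], left[r]);
      for (c = 2; c < 4; ++c)                              /* C1, D1 */
        dst[c] = AVG3(above[c - 3], above[c - 2], above[c - 1]);
      for (r = 1; r < 4; ++r)                              /* shifted copies */
        for (c = 2; c < 4; ++c)
          dst[r * stride + c] = dst[(r - 1) * stride + c - 2];
    }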
+
+INIT_XMM ssse3
+cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ movq m0, [leftq] ; [0- 7] l1-8 [byte]
+ movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte]
+ pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word]
+ pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word]
+ pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word]
+ pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word]
+ psrldq m4, m0, 1 ; t1-7 [word]
+ psrldq m5, m0, 2 ; t2-7 [word]
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1 E1 F1 G1 H1
+ ; A2 B2 A1 B1 C1 D1 E1 F1
+ ; A3 B3 A2 B2 A1 B1 C1 D1
+ ; A4 B4 A3 B3 A2 B2 A1 B1
+ ; A5 B5 A4 B4 A3 B3 A2 B2
+ ; A6 B6 A5 B5 A4 B4 A3 B3
+ ; A7 B7 A6 B6 A5 B5 A4 B4
+ ; A8 B8 A7 B7 A6 B6 A5 B5
+ pavgb m6, m1, m2 ; 2-tap avg A8-A1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1
+
+ punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+
+ movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1
+ palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1
+ movq [dstq+strideq*2], m0
+ psrldq m0, 2 ; A-B2, A-B1, C-H1
+ movq [dstq+strideq ], m0
+ psrldq m0, 2 ; A-H1
+ movq [dstq ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5
+ psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4
+ movq [dstq+strideq*2], m6
+ psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3
+ movq [dstq+strideq ], m6
+ psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2
+ movq [dstq ], m6
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ mova m0, [leftq]
+ movu m7, [aboveq-1]
+ ; comments below are for a predictor like this
+ ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
+ ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
+ ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
+ ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
+ ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
+ ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
+ ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
+ ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
+ ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
+ ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
+ ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
+ ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
+ ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
+ ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
+ ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
+ ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
+ pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)]
+ palignr m5, m0, m6, 15
+ palignr m3, m0, m6, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
+ pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)]
+ pavgb m5, m0 ; A1 - Ag
+
+ punpcklbw m0, m4, m5 ; A-B8 ... A-B1
+ punpckhbw m4, m5 ; A-B9 ... A-Bg
+
+ pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1
+
+ pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ palignr m2, m1, m6, 14
+ mova [dstq ], m2
+ palignr m2, m1, m6, 12
+ mova [dstq+strideq ], m2
+ palignr m2, m1, m6, 10
+ mova [dstq+strideq*2], m2
+ palignr m2, m1, m6, 8
+ mova [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ palignr m2, m1, m6, 6
+ mova [dstq ], m2
+ palignr m2, m1, m6, 4
+ mova [dstq+strideq ], m2
+ palignr m2, m1, m6, 2
+ mova [dstq+strideq*2], m2
+ pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
+ mova [dstq+stride3q ], m6
+ lea dstq, [dstq+strideq*4]
+
+ palignr m2, m6, m4, 14
+ mova [dstq ], m2
+ palignr m2, m6, m4, 12
+ mova [dstq+strideq ], m2
+ palignr m2, m6, m4, 10
+ mova [dstq+strideq*2], m2
+ palignr m2, m6, m4, 8
+ mova [dstq+stride3q ], m2
+ lea dstq, [dstq+strideq*4]
+ palignr m2, m6, m4, 6
+ mova [dstq ], m2
+ palignr m2, m6, m4, 4
+ mova [dstq+strideq ], m2
+ palignr m2, m6, m4, 2
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+ mova m0, [leftq]
+ movu m7, [aboveq-1]
+ movu m1, [aboveq+15]
+
+ pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)]
+ pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high]
+
+ palignr m3, m1, m7, 1
+ palignr m5, m1, m7, 2
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low]
+
+ pshufb m7, [GLOBAL(sh_bfedcba9876543210)]
+ palignr m5, m0, m7, 15
+ palignr m3, m0, m7, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
+ pavgb m5, m0 ; A1 - Ag
+ punpcklbw m6, m4, m5 ; A-B8 ... A-B1
+ punpckhbw m4, m5 ; A-B9 ... A-Bg
+ pshufb m6, [GLOBAL(sh_bfedcba9876543210)]
+ pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
+
+ DEFINE_ARGS dst, stride, stride3, left, line
+ lea stride3q, [strideq*3]
+
+ palignr m5, m2, m1, 14
+ palignr m7, m1, m6, 14
+ mova [dstq ], m7
+ mova [dstq+16 ], m5
+ palignr m5, m2, m1, 12
+ palignr m7, m1, m6, 12
+ mova [dstq+strideq ], m7
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m2, m1, 10
+ palignr m7, m1, m6, 10
+ mova [dstq+strideq*2 ], m7
+ mova [dstq+strideq*2+16], m5
+ palignr m5, m2, m1, 8
+ palignr m7, m1, m6, 8
+ mova [dstq+stride3q ], m7
+ mova [dstq+stride3q+16 ], m5
+ lea dstq, [dstq+strideq*4]
+ palignr m5, m2, m1, 6
+ palignr m7, m1, m6, 6
+ mova [dstq ], m7
+ mova [dstq+16 ], m5
+ palignr m5, m2, m1, 4
+ palignr m7, m1, m6, 4
+ mova [dstq+strideq ], m7
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m2, m1, 2
+ palignr m7, m1, m6, 2
+ mova [dstq+strideq*2 ], m7
+ mova [dstq+strideq*2+16], m5
+ mova [dstq+stride3q ], m6
+ mova [dstq+stride3q+16 ], m1
+ lea dstq, [dstq+strideq*4]
+
+ palignr m5, m1, m6, 14
+ palignr m3, m6, m4, 14
+ mova [dstq ], m3
+ mova [dstq+16 ], m5
+ palignr m5, m1, m6, 12
+ palignr m3, m6, m4, 12
+ mova [dstq+strideq ], m3
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m1, m6, 10
+ palignr m3, m6, m4, 10
+ mova [dstq+strideq*2 ], m3
+ mova [dstq+strideq*2+16], m5
+ palignr m5, m1, m6, 8
+ palignr m3, m6, m4, 8
+ mova [dstq+stride3q ], m3
+ mova [dstq+stride3q+16 ], m5
+ lea dstq, [dstq+strideq*4]
+ palignr m5, m1, m6, 6
+ palignr m3, m6, m4, 6
+ mova [dstq ], m3
+ mova [dstq+16 ], m5
+ palignr m5, m1, m6, 4
+ palignr m3, m6, m4, 4
+ mova [dstq+strideq ], m3
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m1, m6, 2
+ palignr m3, m6, m4, 2
+ mova [dstq+strideq*2 ], m3
+ mova [dstq+strideq*2+16], m5
+ mova [dstq+stride3q ], m4
+ mova [dstq+stride3q+16 ], m6
+ lea dstq, [dstq+strideq*4]
+
+ mova m7, [leftq]
+ mova m3, [leftq+16]
+ palignr m5, m3, m7, 15
+ palignr m0, m3, m7, 14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh -
+ pavgb m5, m3 ; Ah -
+ punpcklbw m3, m2, m5 ; A-B8 ... A-B1
+ punpckhbw m2, m5 ; A-B9 ... A-Bg
+ pshufb m3, [GLOBAL(sh_bfedcba9876543210)]
+ pshufb m2, [GLOBAL(sh_bfedcba9876543210)]
+
+ palignr m7, m6, m4, 14
+ palignr m0, m4, m3, 14
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m6, m4, 12
+ palignr m0, m4, m3, 12
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m6, m4, 10
+ palignr m0, m4, m3, 10
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ palignr m7, m6, m4, 8
+ palignr m0, m4, m3, 8
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q+16 ], m7
+ lea dstq, [dstq+strideq*4]
+ palignr m7, m6, m4, 6
+ palignr m0, m4, m3, 6
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m6, m4, 4
+ palignr m0, m4, m3, 4
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m6, m4, 2
+ palignr m0, m4, m3, 2
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m3
+ mova [dstq+stride3q+16 ], m4
+ lea dstq, [dstq+strideq*4]
+
+ palignr m7, m4, m3, 14
+ palignr m0, m3, m2, 14
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m4, m3, 12
+ palignr m0, m3, m2, 12
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m4, m3, 10
+ palignr m0, m3, m2, 10
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ palignr m7, m4, m3, 8
+ palignr m0, m3, m2, 8
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q+16 ], m7
+ lea dstq, [dstq+strideq*4]
+ palignr m7, m4, m3, 6
+ palignr m0, m3, m2, 6
+ mova [dstq ], m0
+ mova [dstq+16 ], m7
+ palignr m7, m4, m3, 4
+ palignr m0, m3, m2, 4
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m4, m3, 2
+ palignr m0, m3, m2, 2
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m2
+ mova [dstq+stride3q+16 ], m3
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX ssse3
+cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
+ GET_GOT goffsetq
+ movd m0, [leftq] ; abcd [byte]
+ pshufb m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte]
+ pshufb m3, m0, [GLOBAL(sh_b2333)] ; cddd
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2
+ pavgb m1, m0 ; ab, bc, cd, d [byte]
+
+ punpcklbw m1, m2 ; ab, a2bc, bc, b2cd, cd, c3d, d, d
+ movd [dstq ], m1
+ psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d
+ movd [dstq+strideq], m1
+ lea dstq, [dstq+strideq*2]
+ psrlq m1, 16 ; cd, c3d, d, d
+ movd [dstq ], m1
+ pshufw m1, m1, q1111 ; d, d, d, d
+ movd [dstq+strideq], m1
+ RESTORE_GOT
+ RET
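
In scalar terms the byte comments above say: even output columns are 2-tap averages of consecutive left pixels, odd columns are the matching 3-tap averages, each row starts one pixel pair further down, and everything past the last left pixel repeats it. A sketch of the 4x4 case, with AVG2/AVG3 as in the d153 sketch earlier:

    static void d207_4x4_sketch(uint8_t *dst, int stride, const uint8_t *left) {
      /* One interleaved row: ab, a2bc, bc, b2cd, cd, c3d, d, d. */
      uint8_t row[8];
      int r, c;
      row[0] = AVG2(left[0], left[1]);
      row[1] = AVG3(left[0], left[1], left[2]);
      row[2] = AVG2(left[1], left[2]);
      row[3] = AVG3(left[1], left[2], left[3]);
      row[4] = AVG2(left[2], left[3]);
      row[5] = AVG3(left[2], left[3], left[3]);            /* "c3d" */
      row[6] = left[3];
      row[7] = left[3];
      for (r = 0; r < 4; ++r)                              /* slide 2 per row */
        for (c = 0; c < 4; ++c)
          dst[r * stride + c] = row[2 * r + c < 8 ? 2 * r + c : 7];
    }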
+
+INIT_XMM ssse3
+cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq
+ movq m3, [leftq] ; abcdefgh [byte]
+ lea stride3q, [strideq*3]
+
+ pshufb m1, m3, [GLOBAL(sh_b2345677777777777)]
+ pshufb m0, m3, [GLOBAL(sh_b0123456777777777)]
+ pshufb m2, m3, [GLOBAL(sh_b1234567777777777)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3
+ pavgb m0, m2
+ punpcklbw m0, m3 ; interleaved output
+
+ movq [dstq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq*2], m0
+ psrldq m0, 2
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
+ psrldq m0, 2
+ movq [dstq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq ], m0
+ psrldq m0, 2
+ movq [dstq+strideq*2], m0
+ psrldq m0, 2
+ movq [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq
+ lea stride3q, [strideq*3]
+ mova m0, [leftq] ; abcdefghijklmnop [byte]
+ pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
+ pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+ pavgb m1, m0 ; ab, bc, cd .. no, op, pp [byte]
+
+ punpckhbw m4, m1, m3 ; interleaved input
+ punpcklbw m1, m3 ; interleaved output
+ mova [dstq ], m1
+ palignr m3, m4, m1, 2
+ mova [dstq+strideq ], m3
+ palignr m3, m4, m1, 4
+ mova [dstq+strideq*2], m3
+ palignr m3, m4, m1, 6
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ palignr m3, m4, m1, 8
+ mova [dstq ], m3
+ palignr m3, m4, m1, 10
+ mova [dstq+strideq ], m3
+ palignr m3, m4, m1, 12
+ mova [dstq+strideq*2], m3
+ palignr m3, m4, m1, 14
+ mova [dstq+stride3q ], m3
+ DEFINE_ARGS dst, stride, stride3, line
+ mov lined, 2
+ mova m0, [GLOBAL(sh_b23456789abcdefff)]
+.loop:
+ lea dstq, [dstq+strideq*4]
+ mova [dstq ], m4
+ pshufb m4, m0
+ mova [dstq+strideq ], m4
+ pshufb m4, m0
+ mova [dstq+strideq*2], m4
+ pshufb m4, m0
+ mova [dstq+stride3q ], m4
+ pshufb m4, m0
+ dec lined
+ jnz .loop
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq
+ lea stride3q, [strideq*3]
+ mova m1, [leftq] ; 0-15 [byte]
+ mova m2, [leftq+16] ; 16-31 [byte]
+ pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)]
+ pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3
+ palignr m6, m2, m1, 1
+ palignr m5, m2, m1, 2
+ pavgb m2, m4 ; high 16px even lines
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0
+ pavgb m1, m6 ; low 16px even lines
+
+ punpckhbw m6, m1, m0 ; interleaved output 2
+ punpcklbw m1, m0 ; interleaved output 1
+
+ punpckhbw m7, m2, m3 ; interleaved output 4
+ punpcklbw m2, m3 ; interleaved output 3
+
+ ; output 1st 8 lines (and half of 2nd 8 lines)
+ DEFINE_ARGS dst, stride, stride3, dst8
+ lea dst8q, [dstq+strideq*8]
+ mova [dstq ], m1
+ mova [dstq +16], m6
+ mova [dst8q ], m6
+ palignr m0, m6, m1, 2
+ palignr m4, m2, m6, 2
+ mova [dstq +strideq ], m0
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m0, m6, m1, 4
+ palignr m4, m2, m6, 4
+ mova [dstq +strideq*2 ], m0
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m0, m6, m1, 6
+ palignr m4, m2, m6, 6
+ mova [dstq +stride3q ], m0
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq +strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ palignr m0, m6, m1, 8
+ palignr m4, m2, m6, 8
+ mova [dstq ], m0
+ mova [dstq +16], m4
+ mova [dst8q ], m4
+ palignr m0, m6, m1, 10
+ palignr m4, m2, m6, 10
+ mova [dstq +strideq ], m0
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m0, m6, m1, 12
+ palignr m4, m2, m6, 12
+ mova [dstq +strideq*2 ], m0
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m0, m6, m1, 14
+ palignr m4, m2, m6, 14
+ mova [dstq +stride3q ], m0
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+
+ ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
+ mova [dstq +16], m2
+ mova [dst8q ], m2
+ palignr m4, m7, m2, 2
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m4, m7, m2, 4
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m4, m7, m2, 6
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ palignr m4, m7, m2, 8
+ mova [dstq +16], m4
+ mova [dst8q ], m4
+ palignr m4, m7, m2, 10
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m4, m7, m2, 12
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m4, m7, m2, 14
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+
+ ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
+ mova m0, [GLOBAL(sh_b23456789abcdefff)]
+ mova [dstq +16], m7
+ mova [dst8q ], m7
+ pshufb m7, m0
+ mova [dstq +strideq +16], m7
+ mova [dst8q+strideq ], m7
+ pshufb m7, m0
+ mova [dstq +strideq*2+16], m7
+ mova [dst8q+strideq*2 ], m7
+ pshufb m7, m0
+ mova [dstq +stride3q +16], m7
+ mova [dst8q+stride3q ], m7
+ pshufb m7, m0
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ mova [dstq +16], m7
+ mova [dst8q ], m7
+ pshufb m7, m0
+ mova [dstq +strideq +16], m7
+ mova [dst8q+strideq ], m7
+ pshufb m7, m0
+ mova [dstq +strideq*2+16], m7
+ mova [dst8q+strideq*2 ], m7
+ pshufb m7, m0
+ mova [dstq +stride3q +16], m7
+ mova [dst8q+stride3q ], m7
+ pshufb m7, m0
+ lea dstq, [dstq+strideq*4]
+
+ ; output last half of 4th 8 lines
+ mova [dstq +16], m7
+ mova [dstq +strideq +16], m7
+ mova [dstq +strideq*2+16], m7
+ mova [dstq +stride3q +16], m7
+ lea dstq, [dstq+strideq*4]
+ mova [dstq +16], m7
+ mova [dstq +strideq +16], m7
+ mova [dstq +strideq*2+16], m7
+ mova [dstq +stride3q +16], m7
+
+ ; done!
+ RESTORE_GOT
RET
diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
new file mode 100644
index 0000000..3c5cb8f
--- /dev/null
+++ b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
@@ -0,0 +1,943 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> /* AVX2 */
+
+static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
+
+ const __m128i thresh = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _limit[0]));
+ const __m128i blimit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _blimit[0]));
+
+ q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
+ q4p4 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
+ q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
+ q3p3 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
+ q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
+ q2p2 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
+ q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
+ q1p1 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
+ q0p0 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+ _mm_subs_epu8(q0p0, q1p1));
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+ _mm_subs_epu8(p0q0, q0p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+ _mm_subs_epu8(p1q1, q1p1));
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+ _mm_subs_epu8(q1p1, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+ _mm_subs_epu8(q2p2, q3p3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
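
The commented-out scalar expressions above are the specification for this block: a column of pixels is filtered only when every neighboring difference stays within limit and the weighted p0/q0, p1/q1 difference stays within blimit. A scalar sketch of the equivalent mask, mirroring the form of the C reference (returns all-ones when filtering is allowed):

    #include <stdlib.h>  /* abs() */

    static signed char filter_mask_sketch(int limit, int blimit,
                                          int p3, int p2, int p1, int p0,
                                          int q0, int q1, int q2, int q3) {
      signed char mask = 0;
      mask |= (abs(p3 - p2) > limit) * -1;
      mask |= (abs(p2 - p1) > limit) * -1;
      mask |= (abs(p1 - p0) > limit) * -1;
      mask |= (abs(q1 - q0) > limit) * -1;
      mask |= (abs(q2 - q1) > limit) * -1;
      mask |= (abs(q3 - q2) > limit) * -1;
      mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
      return ~mask;  /* -1: filter this column, 0: leave it */
    }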
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
+
+ /* Filter1 >> 3 */
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(
+ _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ flat = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+ _mm_subs_epu8(q0p0, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+ _mm_subs_epu8(q0p0, q3p3)));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
+ q5p5 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5),
+ (__m64 *) (s + 5 * p)));
+
+ q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
+ q6p6 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6),
+ (__m64 *) (s + 6 * p)));
+
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
+ _mm_subs_epu8(q0p0, q4p4)),
+ _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
+ _mm_subs_epu8(q0p0, q5p5)));
+
+ q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
+ q7p7 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7),
+ (__m64 *) (s + 7 * p)));
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
+ _mm_subs_epu8(q0p0, q6p6)),
+ _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
+ _mm_subs_epu8(q0p0, q7p7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
+ _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
+ _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p = _mm_add_epi16(eight,
+ _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(four,
+ _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
+ 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
+ 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
+ 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
+ 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
+ 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
+ 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
+ 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
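+ // flat and flat2 were computed on the low 64 bits only, so
+ // _mm_shuffle_epi32(x, 68) (68 == 0b01000100) duplicates the low half
+ // into the high half and each mask covers both the q and p halves of
+ // a qXpX register.  Every output below is then a bitwise select:
+ // (filtered & mask) | (original & ~mask).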
+
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
+
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
+ _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));
+
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
+ _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));
+
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
+ _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));
+
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
+ _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));
+
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
+ _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));
+
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
+ _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));
+
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
+ _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
+ }
+}
+
+static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i p7, p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q5, q6, q7;
+
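+ // _mm_broadcastb_epi8 (AVX2 VPBROADCASTB) splats each scalar
+ // threshold byte across all 16 lanes.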
+ const __m128i thresh = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _limit[0]));
+ const __m128i blimit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _blimit[0]));
+
+ p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
+ p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
+ q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));
+
+ {
+ const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+ _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+ _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+ _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+ _mm_subs_epu8(q1, p1));
+ __m128i work;
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i te0 = _mm_set1_epi8(0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
+ flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
+ flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
+ flat_q2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ /* Filter1 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ /* Filter2 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+ _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
+ q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+ _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
+ q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+ _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+
+ p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
+ q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
+ _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
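+ // Here the 16 pixels are widened to 16-bit in 256-bit registers via
+ // _mm256_cvtepu8_epi16 so each running sum covers all lanes at once.
+ // After _mm256_packus_epi16(res, res), _mm256_permute4x64_epi64 with
+ // control 168 (0b10101000) gathers the two 64-bit partial results
+ // from both 128-bit lanes into the low 128 bits before the cast back
+ // to __m128i.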
+ {
+ const __m256i eight = _mm256_set1_epi16(8);
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+ q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+ p256_0, q256_0;
+ __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
+ pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
+ res_q;
+
+ p256_7 = _mm256_cvtepu8_epi16(p7);
+ p256_6 = _mm256_cvtepu8_epi16(p6);
+ p256_5 = _mm256_cvtepu8_epi16(p5);
+ p256_4 = _mm256_cvtepu8_epi16(p4);
+ p256_3 = _mm256_cvtepu8_epi16(p3);
+ p256_2 = _mm256_cvtepu8_epi16(p2);
+ p256_1 = _mm256_cvtepu8_epi16(p1);
+ p256_0 = _mm256_cvtepu8_epi16(p0);
+ q256_0 = _mm256_cvtepu8_epi16(q0);
+ q256_1 = _mm256_cvtepu8_epi16(q1);
+ q256_2 = _mm256_cvtepu8_epi16(q2);
+ q256_3 = _mm256_cvtepu8_epi16(q3);
+ q256_4 = _mm256_cvtepu8_epi16(q4);
+ q256_5 = _mm256_cvtepu8_epi16(q5);
+ q256_6 = _mm256_cvtepu8_epi16(q6);
+ q256_7 = _mm256_cvtepu8_epi16(q7);
+
+ pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
+ _mm256_add_epi16(p256_4, p256_3));
+ pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
+ _mm256_add_epi16(q256_4, q256_3));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
+ _mm256_add_epi16(p256_2, p256_1));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
+ _mm256_add_epi16(q256_2, q256_1));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+
+ pixelFilter_p = _mm256_add_epi16(eight,
+ _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(four,
+ _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(p256_7, p256_0)), 4);
+
+ flat2_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(q256_7, q256_0)), 4);
+
+ flat2_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(p256_3, p256_0)), 3);
+
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(q256_3, q256_0)), 3);
+
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(p256_7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(q256_7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(p256_3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(q256_3, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_1)), 4);
+
+ flat2_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_1)), 4);
+
+ flat2_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_1)), 3);
+
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_1)), 3);
+
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_2)), 4);
+
+ flat2_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_2)), 4);
+
+ flat2_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_2)), 3);
+
+ flat_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_2)), 3);
+
+ flat_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_3)), 4);
+
+ flat2_p3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_3)), 4);
+
+ flat2_q3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_4)), 4);
+
+ flat2_p4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_4)), 4);
+
+ flat2_q4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_5)), 4);
+
+ flat2_p5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_5)), 4);
+
+ flat2_q5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_6)), 4);
+
+ flat2_p6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_6)), 4);
+
+ flat2_q6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ p2 = _mm_andnot_si128(flat, p2);
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ p2 = _mm_or_si128(flat_p2, p2);
+
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(flat_q2, q2);
+
+ p6 = _mm_andnot_si128(flat2, p6);
+ flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+ p6 = _mm_or_si128(flat2_p6, p6);
+ _mm_storeu_si128((__m128i *) (s - 7 * p), p6);
+
+ p5 = _mm_andnot_si128(flat2, p5);
+ flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+ p5 = _mm_or_si128(flat2_p5, p5);
+ _mm_storeu_si128((__m128i *) (s - 6 * p), p5);
+
+ p4 = _mm_andnot_si128(flat2, p4);
+ flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+ p4 = _mm_or_si128(flat2_p4, p4);
+ _mm_storeu_si128((__m128i *) (s - 5 * p), p4);
+
+ p3 = _mm_andnot_si128(flat2, p3);
+ flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+ p3 = _mm_or_si128(flat2_p3, p3);
+ _mm_storeu_si128((__m128i *) (s - 4 * p), p3);
+
+ p2 = _mm_andnot_si128(flat2, p2);
+ flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+ p2 = _mm_or_si128(flat2_p2, p2);
+ _mm_storeu_si128((__m128i *) (s - 3 * p), p2);
+
+ p1 = _mm_andnot_si128(flat2, p1);
+ flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+ p1 = _mm_or_si128(flat2_p1, p1);
+ _mm_storeu_si128((__m128i *) (s - 2 * p), p1);
+
+ p0 = _mm_andnot_si128(flat2, p0);
+ flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+ p0 = _mm_or_si128(flat2_p0, p0);
+ _mm_storeu_si128((__m128i *) (s - 1 * p), p0);
+
+ q0 = _mm_andnot_si128(flat2, q0);
+ flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+ q0 = _mm_or_si128(flat2_q0, q0);
+ _mm_storeu_si128((__m128i *) (s - 0 * p), q0);
+
+ q1 = _mm_andnot_si128(flat2, q1);
+ flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+ q1 = _mm_or_si128(flat2_q1, q1);
+ _mm_storeu_si128((__m128i *) (s + 1 * p), q1);
+
+ q2 = _mm_andnot_si128(flat2, q2);
+ flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+ q2 = _mm_or_si128(flat2_q2, q2);
+ _mm_storeu_si128((__m128i *) (s + 2 * p), q2);
+
+ q3 = _mm_andnot_si128(flat2, q3);
+ flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+ q3 = _mm_or_si128(flat2_q3, q3);
+ _mm_storeu_si128((__m128i *) (s + 3 * p), q3);
+
+ q4 = _mm_andnot_si128(flat2, q4);
+ flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+ q4 = _mm_or_si128(flat2_q4, q4);
+ _mm_storeu_si128((__m128i *) (s + 4 * p), q4);
+
+ q5 = _mm_andnot_si128(flat2, q5);
+ flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+ q5 = _mm_or_si128(flat2_q5, q5);
+ _mm_storeu_si128((__m128i *) (s + 5 * p), q5);
+
+ q6 = _mm_andnot_si128(flat2, q6);
+ flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+ q6 = _mm_or_si128(flat2_q6, q6);
+ _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
+ }
+}
+
+void vp9_mb_lpf_horizontal_edge_w_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh, int count) {
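+ // count == 1 selects the 8-pixel-wide path (64-bit loads/stores);
+ // anything else takes the full 16-pixel-wide AVX2 path.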
+ if (count == 1)
+ mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
+ else
+ mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
+}
diff --git a/libvpx/vp9/common/x86/vp9_postproc_x86.h b/libvpx/vp9/common/x86/vp9_postproc_x86.h
index b0e8b18..8870215 100644
--- a/libvpx/vp9/common/x86/vp9_postproc_x86.h
+++ b/libvpx/vp9/common/x86/vp9_postproc_x86.h
@@ -61,4 +61,4 @@ extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt);
#endif
#endif
-#endif
+#endif // VP9_COMMON_X86_VP9_POSTPROC_X86_H_
diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm b/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm
new file mode 100644
index 0000000..9dc8d0a
--- /dev/null
+++ b/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm
@@ -0,0 +1,987 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after the other taps
+;to avoid overflow.
+
+%macro GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklqdq xmm0, xmm1
+ punpcklqdq xmm2, xmm3
+ punpcklqdq xmm5, xmm4
+ punpcklqdq xmm6, xmm7
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm2
+ movdqa k5k4, xmm5
+ movdqa k6k7, xmm6
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
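+;GET_FILTERS_4 broadcasts each tap to four words per quadword, e.g.
+;k0k1 = k0 k0 k0 k0 | k1 k1 k1 k1 (note k5k4 keeps k5 in the low
+;quadword), and krd holds the rounding constant 64 in every word to
+;pair with the psraw 7 at the end of the filter.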
+
+%macro APPLY_FILTER_4 1
+ punpckldq xmm0, xmm1 ;two row in one register
+ punpckldq xmm6, xmm7
+ punpckldq xmm2, xmm3
+ punpckldq xmm5, xmm4
+
+ punpcklbw xmm0, zero ;unpack to word
+ punpcklbw xmm6, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+
+ pmullw xmm0, k0k1 ;multiply the filter factors
+ pmullw xmm6, k6k7
+ pmullw xmm2, k2k3
+ pmullw xmm5, k5k4
+
+ paddsw xmm0, xmm6 ;sum
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm2
+ psrldq xmm2, 8
+ paddsw xmm0, xmm5
+ psrldq xmm5, 8
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+%endm
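+;APPLY_FILTER_4 accumulates the eight products with saturating adds
+;(paddsw), folding the k2k3/k5k4 terms in last per the note above, then
+;rounds with (sum + 64) >> 7 via krd/psraw before packing to bytes.
+;When the macro argument is 1 the result is averaged with the existing
+;destination pixels (pavgb), which implements the _avg variants.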
+
+%macro GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm0
+ punpcklwd xmm1, xmm1
+ punpcklwd xmm2, xmm2
+ punpcklwd xmm3, xmm3
+ punpckhwd xmm4, xmm4
+ punpckhwd xmm5, xmm5
+ punpckhwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movdqa k0, xmm0 ;store filter factors on stack
+ movdqa k1, xmm1
+ movdqa k2, xmm2
+ movdqa k3, xmm3
+ movdqa k4, xmm4
+ movdqa k5, xmm5
+ movdqa k6, xmm6
+ movdqa k7, xmm7
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
+%macro LOAD_VERT_8 1
+ movq xmm0, [rsi + %1] ;0
+ movq xmm1, [rsi + rax + %1] ;1
+ movq xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2 + %1] ;7
+ movq xmm2, [rsi + rax + %1] ;2
+ movq xmm3, [rsi + rax * 2 + %1] ;3
+ movq xmm4, [rsi + rdx + %1] ;4
+ movq xmm5, [rsi + rax * 4 + %1] ;5
+%endm
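+;LOAD_VERT_8 reaches rows 0..7 of the 8-tap window using rax (pitch)
+;and rdx (3 * pitch); rsi is advanced by one row inside the macro,
+;which also slides the window down for the next loop iteration.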
+
+%macro APPLY_FILTER_8 2
+ punpcklbw xmm0, zero
+ punpcklbw xmm1, zero
+ punpcklbw xmm6, zero
+ punpcklbw xmm7, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+ punpcklbw xmm3, zero
+ punpcklbw xmm4, zero
+
+ pmullw xmm0, k0
+ pmullw xmm1, k1
+ pmullw xmm6, k6
+ pmullw xmm7, k7
+ pmullw xmm2, k2
+ pmullw xmm5, k5
+ pmullw xmm3, k3
+ pmullw xmm4, k4
+
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+ paddsw xmm0, xmm3
+ paddsw xmm0, xmm4
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi + %2]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi + %2], xmm0
+%endm
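+;APPLY_FILTER_8 is the 8-wide counterpart: one stack constant per tap,
+;saturating accumulation, (sum + 64) >> 7 rounding, and an optional
+;pavgb with the destination when the first argument is 1; the second
+;argument is the byte offset of the 8-pixel group within the row.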
+
+;void vp9_filter_block1d4_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
+sym(vp9_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d8_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
+sym(vp9_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d16_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
+sym(vp9_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 0, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d4_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 1
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d8_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 1, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d16_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 1, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 1, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d4_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
+sym(vp9_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d8_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
+sym(vp9_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d16_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
+sym(vp9_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d4_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d8_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d16_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index bbf9888..7a5cca0 100644
--- a/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -534,6 +534,21 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
ret
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%macro HORIZx4_ROW 2
+ movdqa %2, %1
+ pshufb %1, [GLOBAL(shuf_t0t1)]
+ pshufb %2, [GLOBAL(shuf_t2t3)]
+ pmaddubsw %1, xmm6
+ pmaddubsw %2, xmm7
+
+ paddsw %1, %2
+ movdqa %2, %1
+ psrldq %2, 8
+ paddsw %1, %2
+ paddsw %1, xmm5
+ psraw %1, 7
+ packuswb %1, %1
+%endm
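+;HORIZx4_ROW filters one row of 4 output pixels: xmm6/xmm7 hold the
+;taps packed as k0_k1_k4_k5 and k2_k3_k6_k7, so two pshufb/pmaddubsw
+;pairs cover all eight taps; psrldq 8 folds the k4..k7 half onto the
+;k0..k3 half before the (sum + 64) >> 7 rounding in xmm5.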
%macro HORIZx4 1
mov rdx, arg(5) ;filter ptr
@@ -544,64 +559,84 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movdqa xmm4, [rdx] ;load filters
movq xmm5, rcx
packsswb xmm4, xmm4
- pshuflw xmm0, xmm4, 0b ;k0_k1
- pshuflw xmm1, xmm4, 01010101b ;k2_k3
- pshuflw xmm2, xmm4, 10101010b ;k4_k5
- pshuflw xmm3, xmm4, 11111111b ;k6_k7
-
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm1
- pshufd xmm5, xmm5, 0
- movdqa k4k5, xmm2
- movdqa k6k7, xmm3
- movdqa krd, xmm5
+ pshuflw xmm6, xmm4, 0b ;k0_k1
+ pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5
+ pshuflw xmm7, xmm4, 01010101b ;k2_k3
+ pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7
+ pshufd xmm5, xmm5, 0 ;rounding
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
-
+ shr rcx, 1
.loop:
- movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
-
- movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
- punpcklqdq xmm0, xmm3
+ ;Do two rows at once
+ movq xmm0, [rsi - 3] ;load src
+ movq xmm1, [rsi + 5]
+ movq xmm2, [rsi + rax - 3]
+ movq xmm3, [rsi + rax + 5]
+ punpcklqdq xmm0, xmm1
+ punpcklqdq xmm2, xmm3
+
+ HORIZx4_ROW xmm0, xmm1
+ HORIZx4_ROW xmm2, xmm3
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+ movd xmm3, [rdi + rdx]
+ pavgb xmm2, xmm3
+%endif
+ movd [rdi], xmm0
+ movd [rdi + rdx], xmm2
- movdqa xmm1, xmm0
- pshufb xmm0, [GLOBAL(shuf_t0t1)]
- pmaddubsw xmm0, k0k1
+ lea rsi, [rsi + rax]
+ prefetcht0 [rsi + 4 * rax - 3]
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + 2 * rdx]
+ prefetcht0 [rsi + 2 * rax - 3]
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pmaddubsw xmm1, k2k3
+ dec rcx
+ jnz .loop
- movdqa xmm4, xmm2
- pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pmaddubsw xmm2, k4k5
+ ; Do last row if output_height is odd
+ movsxd rcx, dword ptr arg(4) ;output_height
+ and rcx, 1
+ je .done
- pshufb xmm4, [GLOBAL(shuf_t6t7)]
- pmaddubsw xmm4, k6k7
+ movq xmm0, [rsi - 3] ; load src
+ movq xmm1, [rsi + 5]
+ punpcklqdq xmm0, xmm1
- paddsw xmm0, xmm1
- paddsw xmm0, xmm4
- paddsw xmm0, xmm2
- paddsw xmm0, krd
- psraw xmm0, 7
- packuswb xmm0, xmm0
+ HORIZx4_ROW xmm0, xmm1
%if %1
movd xmm1, [rdi]
pavgb xmm0, xmm1
%endif
- lea rsi, [rsi + rax]
movd [rdi], xmm0
+.done:
+%endm
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
+%macro HORIZx8_ROW 4
+ movdqa %2, %1
+ movdqa %3, %1
+ movdqa %4, %1
+
+ pshufb %1, [GLOBAL(shuf_t0t1)]
+ pshufb %2, [GLOBAL(shuf_t2t3)]
+ pshufb %3, [GLOBAL(shuf_t4t5)]
+ pshufb %4, [GLOBAL(shuf_t6t7)]
+
+ pmaddubsw %1, k0k1
+ pmaddubsw %2, k2k3
+ pmaddubsw %3, k4k5
+ pmaddubsw %4, k6k7
+
+ paddsw %1, %2
+ paddsw %1, %4
+ paddsw %1, %3
+ paddsw %1, krd
+ psraw %1, 7
+ packuswb %1, %1
%endm
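+;HORIZx8_ROW filters one row of 8 output pixels using the
+;stack-resident k0k1..k6k7/krd constants; the three scratch arguments
+;let the caller keep two rows in flight per loop iteration.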
%macro HORIZx8 1
@@ -633,45 +668,51 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
+ shr rcx, 1
.loop:
- movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
-
- movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
+ movq xmm0, [rsi - 3] ;load src
+ movq xmm3, [rsi + 5]
+ movq xmm4, [rsi + rax - 3]
+ movq xmm7, [rsi + rax + 5]
punpcklqdq xmm0, xmm3
+ punpcklqdq xmm4, xmm7
- movdqa xmm1, xmm0
- pshufb xmm0, [GLOBAL(shuf_t0t1)]
- pmaddubsw xmm0, k0k1
+ HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
+ HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
+%if %1
+ movq xmm1, [rdi]
+ movq xmm2, [rdi + rdx]
+ pavgb xmm0, xmm1
+ pavgb xmm4, xmm2
+%endif
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm4
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pmaddubsw xmm1, k2k3
+ lea rsi, [rsi + rax]
+ prefetcht0 [rsi + 4 * rax - 3]
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + 2 * rdx]
+ prefetcht0 [rsi + 2 * rax - 3]
+ dec rcx
+ jnz .loop
- movdqa xmm4, xmm2
- pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pmaddubsw xmm2, k4k5
+ ;Do last row if output_height is odd
+ movsxd rcx, dword ptr arg(4) ;output_height
+ and rcx, 1
+ je .done
- pshufb xmm4, [GLOBAL(shuf_t6t7)]
- pmaddubsw xmm4, k6k7
+ movq xmm0, [rsi - 3]
+ movq xmm3, [rsi + 5]
+ punpcklqdq xmm0, xmm3
- paddsw xmm0, xmm1
- paddsw xmm0, xmm4
- paddsw xmm0, xmm2
- paddsw xmm0, krd
- psraw xmm0, 7
- packuswb xmm0, xmm0
+ HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
%if %1
movq xmm1, [rdi]
pavgb xmm0, xmm1
%endif
-
- lea rsi, [rsi + rax]
movq [rdi], xmm0
-
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
+.done:
%endm
%macro HORIZx16 1
@@ -705,60 +746,53 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movsxd rcx, dword ptr arg(4) ;output_height
.loop:
- movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
+ prefetcht0 [rsi + 2 * rax - 3]
- movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
- punpcklqdq xmm0, xmm3
+ movq xmm0, [rsi - 3] ;load src data
+ movq xmm4, [rsi + 5]
+ movq xmm7, [rsi + 13]
+ punpcklqdq xmm0, xmm4
+ punpcklqdq xmm4, xmm7
movdqa xmm1, xmm0
- pshufb xmm0, [GLOBAL(shuf_t0t1)]
- pmaddubsw xmm0, k0k1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
- movdqa xmm2, xmm1
+ pshufb xmm0, [GLOBAL(shuf_t0t1)]
pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pmaddubsw xmm1, k2k3
-
- movdqa xmm4, xmm2
pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pmaddubsw xmm2, k4k5
+ pshufb xmm3, [GLOBAL(shuf_t6t7)]
+ pshufb xmm4, [GLOBAL(shuf_t0t1)]
+ pshufb xmm5, [GLOBAL(shuf_t2t3)]
+ pshufb xmm6, [GLOBAL(shuf_t4t5)]
+ pshufb xmm7, [GLOBAL(shuf_t6t7)]
- pshufb xmm4, [GLOBAL(shuf_t6t7)]
- pmaddubsw xmm4, k6k7
+ pmaddubsw xmm0, k0k1
+ pmaddubsw xmm1, k2k3
+ pmaddubsw xmm2, k4k5
+ pmaddubsw xmm3, k6k7
+ pmaddubsw xmm4, k0k1
+ pmaddubsw xmm5, k2k3
+ pmaddubsw xmm6, k4k5
+ pmaddubsw xmm7, k6k7
paddsw xmm0, xmm1
- paddsw xmm0, xmm4
+ paddsw xmm0, xmm3
paddsw xmm0, xmm2
+ paddsw xmm4, xmm5
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm6
+
paddsw xmm0, krd
+ paddsw xmm4, krd
psraw xmm0, 7
+ psraw xmm4, 7
packuswb xmm0, xmm0
-
-
- movq xmm3, [rsi + 5]
- movq xmm7, [rsi + 13]
- punpcklqdq xmm3, xmm7
-
- movdqa xmm1, xmm3
- pshufb xmm3, [GLOBAL(shuf_t0t1)]
- pmaddubsw xmm3, k0k1
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pmaddubsw xmm1, k2k3
-
- movdqa xmm4, xmm2
- pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pmaddubsw xmm2, k4k5
-
- pshufb xmm4, [GLOBAL(shuf_t6t7)]
- pmaddubsw xmm4, k6k7
-
- paddsw xmm3, xmm1
- paddsw xmm3, xmm4
- paddsw xmm3, xmm2
- paddsw xmm3, krd
- psraw xmm3, 7
- packuswb xmm3, xmm3
- punpcklqdq xmm0, xmm3
+ packuswb xmm4, xmm4
+ punpcklqdq xmm0, xmm4
%if %1
movdqa xmm1, [rdi]
pavgb xmm0, xmm1
@@ -792,19 +826,8 @@ sym(vp9_filter_block1d4_h8_ssse3):
push rdi
; end prolog
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
HORIZx4 0
- add rsp, 16*5
- pop rsp
-
; begin epilog
pop rdi
pop rsi
@@ -909,19 +932,8 @@ sym(vp9_filter_block1d4_h8_avg_ssse3):
push rdi
; end prolog
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
HORIZx4 1
- add rsp, 16*5
- pop rsp
-
; begin epilog
pop rdi
pop rsi
diff --git a/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm b/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm
deleted file mode 100644
index 174e747..0000000
--- a/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm
+++ /dev/null
@@ -1,230 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_add_constant_residual_8x8_neon|
- EXPORT |vp9_add_constant_residual_16x16_neon|
- EXPORT |vp9_add_constant_residual_32x32_neon|
- ARM
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- MACRO
- LD_16x8 $src, $stride
- vld1.8 {q8}, [$src], $stride
- vld1.8 {q9}, [$src], $stride
- vld1.8 {q10}, [$src], $stride
- vld1.8 {q11}, [$src], $stride
- vld1.8 {q12}, [$src], $stride
- vld1.8 {q13}, [$src], $stride
- vld1.8 {q14}, [$src], $stride
- vld1.8 {q15}, [$src], $stride
- MEND
-
- MACRO
- ADD_DIFF_16x8 $diff
- vqadd.u8 q8, q8, $diff
- vqadd.u8 q9, q9, $diff
- vqadd.u8 q10, q10, $diff
- vqadd.u8 q11, q11, $diff
- vqadd.u8 q12, q12, $diff
- vqadd.u8 q13, q13, $diff
- vqadd.u8 q14, q14, $diff
- vqadd.u8 q15, q15, $diff
- MEND
-
- MACRO
- SUB_DIFF_16x8 $diff
- vqsub.u8 q8, q8, $diff
- vqsub.u8 q9, q9, $diff
- vqsub.u8 q10, q10, $diff
- vqsub.u8 q11, q11, $diff
- vqsub.u8 q12, q12, $diff
- vqsub.u8 q13, q13, $diff
- vqsub.u8 q14, q14, $diff
- vqsub.u8 q15, q15, $diff
- MEND
-
- MACRO
- ST_16x8 $dst, $stride
- vst1.8 {q8}, [$dst], $stride
- vst1.8 {q9}, [$dst], $stride
- vst1.8 {q10}, [$dst], $stride
- vst1.8 {q11}, [$dst], $stride
- vst1.8 {q12}, [$dst], $stride
- vst1.8 {q13}, [$dst], $stride
- vst1.8 {q14}, [$dst], $stride
- vst1.8 {q15}, [$dst], $stride
- MEND
-
-; void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
-; int width, int height) {
-; int r, c;
-;
-; for (r = 0; r < height; r++) {
-; for (c = 0; c < width; c++)
-; dest[c] = clip_pixel(diff + dest[c]);
-;
-; dest += stride;
-; }
-;}
-;void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest,
-; int stride) {
-; add_constant_residual(diff, dest, stride, 8, 8);
-;}
-; r0 : const int16_t diff
-; r1 : const uint8_t *dest
-; r2 : int stride
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp9_add_constant_residual_8x8_neon| PROC
- mov r3, r1 ; r3: save dest to r3
- vld1.8 {d0}, [r1], r2
- vld1.8 {d1}, [r1], r2
- vld1.8 {d2}, [r1], r2
- vld1.8 {d3}, [r1], r2
- vld1.8 {d4}, [r1], r2
- vld1.8 {d5}, [r1], r2
- vld1.8 {d6}, [r1], r2
- vld1.8 {d7}, [r1], r2
- cmp r0, #0
- bge DIFF_POSITIVE_8x8
-
-DIFF_NEGATIVE_8x8 ; diff < 0
- neg r0, r0
- usat r0, #8, r0
- vdup.u8 q8, r0
-
- vqsub.u8 q0, q0, q8
- vqsub.u8 q1, q1, q8
- vqsub.u8 q2, q2, q8
- vqsub.u8 q3, q3, q8
- b DIFF_SAVE_8x8
-
-DIFF_POSITIVE_8x8 ; diff >= 0
- usat r0, #8, r0
- vdup.u8 q8, r0
-
- vqadd.u8 q0, q0, q8
- vqadd.u8 q1, q1, q8
- vqadd.u8 q2, q2, q8
- vqadd.u8 q3, q3, q8
-
-DIFF_SAVE_8x8
- vst1.8 {d0}, [r3], r2
- vst1.8 {d1}, [r3], r2
- vst1.8 {d2}, [r3], r2
- vst1.8 {d3}, [r3], r2
- vst1.8 {d4}, [r3], r2
- vst1.8 {d5}, [r3], r2
- vst1.8 {d6}, [r3], r2
- vst1.8 {d7}, [r3], r2
-
- bx lr
- ENDP
-
-;void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest,
-; int stride) {
-; add_constant_residual(diff, dest, stride, 16, 16);
-;}
-; r0 : const int16_t diff
-; r1 : const uint8_t *dest
-; r2 : int stride
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp9_add_constant_residual_16x16_neon| PROC
- mov r3, r1
- LD_16x8 r1, r2
- cmp r0, #0
- bge DIFF_POSITIVE_16x16
-
-|DIFF_NEGATIVE_16x16|
- neg r0, r0
- usat r0, #8, r0
- vdup.u8 q0, r0
-
- SUB_DIFF_16x8 q0
- ST_16x8 r3, r2
- LD_16x8 r1, r2
- SUB_DIFF_16x8 q0
- b DIFF_SAVE_16x16
-
-|DIFF_POSITIVE_16x16|
- usat r0, #8, r0
- vdup.u8 q0, r0
-
- ADD_DIFF_16x8 q0
- ST_16x8 r3, r2
- LD_16x8 r1, r2
- ADD_DIFF_16x8 q0
-
-|DIFF_SAVE_16x16|
- ST_16x8 r3, r2
- bx lr
- ENDP
-
-;void vp9_add_constant_residual_32x32_c(const int16_t diff, uint8_t *dest,
-; int stride) {
-; add_constant_residual(diff, dest, stride, 32, 32);
-;}
-; r0 : const int16_t diff
-; r1 : const uint8_t *dest
-; r2 : int stride
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp9_add_constant_residual_32x32_neon| PROC
- push {r4,lr}
- pld [r1]
- mov r3, r1
- add r4, r1, #16 ; r4 dest + 16 for second loop
- cmp r0, #0
- bge DIFF_POSITIVE_32x32
-
-|DIFF_NEGATIVE_32x32|
- neg r0, r0
- usat r0, #8, r0
- vdup.u8 q0, r0
- mov r0, #4
-
-|DIFF_NEGATIVE_32x32_LOOP|
- sub r0, #1
- LD_16x8 r1, r2
- SUB_DIFF_16x8 q0
- ST_16x8 r3, r2
-
- LD_16x8 r1, r2
- SUB_DIFF_16x8 q0
- ST_16x8 r3, r2
- cmp r0, #2
- moveq r1, r4
- moveq r3, r4
- cmp r0, #0
- bne DIFF_NEGATIVE_32x32_LOOP
- pop {r4,pc}
-
-|DIFF_POSITIVE_32x32|
- usat r0, #8, r0
- vdup.u8 q0, r0
- mov r0, #4
-
-|DIFF_POSITIVE_32x32_LOOP|
- sub r0, #1
- LD_16x8 r1, r2
- ADD_DIFF_16x8 q0
- ST_16x8 r3, r2
-
- LD_16x8 r1, r2
- ADD_DIFF_16x8 q0
- ST_16x8 r3, r2
- cmp r0, #2
- moveq r1, r4
- moveq r3, r4
- cmp r0, #0
- bne DIFF_POSITIVE_32x32_LOOP
- pop {r4,pc}
- ENDP
-
- END
diff --git a/libvpx/vp9/decoder/vp9_dboolhuff.h b/libvpx/vp9/decoder/vp9_dboolhuff.h
index c864516..fd8e74c 100644
--- a/libvpx/vp9/decoder/vp9_dboolhuff.h
+++ b/libvpx/vp9/decoder/vp9_dboolhuff.h
@@ -44,7 +44,7 @@ static int vp9_read(vp9_reader *br, int probability) {
VP9_BD_VALUE bigsplit;
int count;
unsigned int range;
- unsigned int split = 1 + (((br->range - 1) * probability) >> 8);
+ unsigned int split = ((br->range * probability) + (256 - probability)) >> 8;
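+ // Algebraically identical to the previous expression:
+ // range * prob + (256 - prob) == (range - 1) * prob + 256, and adding
+ // 256 before the >> 8 is the same as adding 1 afterwards.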
if (br->count < 0)
vp9_reader_fill(br);
diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c
index 84a29b1..9792d2c 100644
--- a/libvpx/vp9/decoder/vp9_decodemv.c
+++ b/libvpx/vp9/decoder/vp9_decodemv.c
@@ -23,18 +23,36 @@
#include "vp9/decoder/vp9_decodemv.h"
#include "vp9/decoder/vp9_decodframe.h"
#include "vp9/decoder/vp9_onyxd_int.h"
-#include "vp9/decoder/vp9_dsubexp.h"
#include "vp9/decoder/vp9_treereader.h"
static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p);
}
+static MB_PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r,
+ int size_group) {
+ const MB_PREDICTION_MODE y_mode = read_intra_mode(r,
+ cm->fc.y_mode_prob[size_group]);
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.y_mode[size_group][y_mode];
+ return y_mode;
+}
+
+static MB_PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r,
+ MB_PREDICTION_MODE y_mode) {
+ const MB_PREDICTION_MODE uv_mode = read_intra_mode(r,
+ cm->fc.uv_mode_prob[y_mode]);
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.uv_mode[y_mode][uv_mode];
+ return uv_mode;
+}
+
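+// Counts are only accumulated when frame-parallel decoding is off:
+// backward probability adaptation consumes them, and it is disabled
+// in that mode.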
static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r,
uint8_t context) {
- MB_PREDICTION_MODE mode = treed_read(r, vp9_inter_mode_tree,
- cm->fc.inter_mode_probs[context]);
- ++cm->counts.inter_mode[context][inter_mode_offset(mode)];
+ const MB_PREDICTION_MODE mode = treed_read(r, vp9_inter_mode_tree,
+ cm->fc.inter_mode_probs[context]);
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.inter_mode[context][inter_mode_offset(mode)];
return mode;
}
@@ -53,33 +71,28 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
tx_size += vp9_read(r, tx_probs[2]);
}
- update_tx_counts(bsize, context, tx_size, &cm->counts.tx);
+ if (!cm->frame_parallel_decoding_mode)
+ ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size];
return tx_size;
}
-static TX_SIZE read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode,
- BLOCK_SIZE bsize, int allow_select,
+static TX_SIZE read_tx_size(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ TX_MODE tx_mode, BLOCK_SIZE bsize, int allow_select,
vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
-
- if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
+ if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) {
return read_selected_tx_size(cm, xd, bsize, r);
- else if (tx_mode >= ALLOW_32X32 && bsize >= BLOCK_32X32)
- return TX_32X32;
- else if (tx_mode >= ALLOW_16X16 && bsize >= BLOCK_16X16)
- return TX_16X16;
- else if (tx_mode >= ALLOW_8X8 && bsize >= BLOCK_8X8)
- return TX_8X8;
- else
- return TX_4X4;
+ } else {
+ const TX_SIZE max_tx_size_block = max_txsize_lookup[bsize];
+ const TX_SIZE max_tx_size_txmode = tx_mode_to_biggest_tx_size[tx_mode];
+ return MIN(max_tx_size_block, max_tx_size_txmode);
+ }
}
static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
int mi_row, int mi_col, int segment_id) {
const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = 1 << mi_width_log2(bsize);
- const int bh = 1 << mi_height_log2(bsize);
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
int x, y;
@@ -91,11 +104,11 @@ static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
cm->last_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
}
-static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
+static int read_intra_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ int mi_row, int mi_col,
vp9_reader *r) {
- MACROBLOCKD *const xd = &pbi->mb;
- struct segmentation *const seg = &pbi->common.seg;
- const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type;
+ struct segmentation *const seg = &cm->seg;
+ const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type;
int segment_id;
if (!seg->enabled)
@@ -105,16 +118,14 @@ static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
return 0;
segment_id = read_segment_id(r, seg);
- set_segment_id(&pbi->common, bsize, mi_row, mi_col, segment_id);
+ set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
return segment_id;
}
-static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
- vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ int mi_row, int mi_col, vp9_reader *r) {
struct segmentation *const seg = &cm->seg;
- const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type;
+ const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type;
int pred_segment_id, segment_id;
if (!seg->enabled)
@@ -138,37 +149,37 @@ static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
return segment_id;
}
-static uint8_t read_skip_coeff(VP9D_COMP *pbi, int segment_id, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
- int skip_coeff = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
- if (!skip_coeff) {
+static int read_skip_coeff(VP9_COMMON *cm, const MACROBLOCKD *xd,
+ int segment_id, vp9_reader *r) {
+ if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
const int ctx = vp9_get_pred_context_mbskip(xd);
- skip_coeff = vp9_read(r, vp9_get_pred_prob_mbskip(cm, xd));
- cm->counts.mbskip[ctx][skip_coeff]++;
+ const int skip = vp9_read(r, cm->fc.mbskip_probs[ctx]);
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.mbskip[ctx][skip];
+ return skip;
}
- return skip_coeff;
}
-static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m,
+static void read_intra_frame_mode_info(VP9_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ MODE_INFO *const m,
int mi_row, int mi_col, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
MB_MODE_INFO *const mbmi = &m->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
const MODE_INFO *above_mi = xd->mi_8x8[-cm->mode_info_stride];
- const MODE_INFO *left_mi = xd->mi_8x8[-1];
+ const MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
- mbmi->segment_id = read_intra_segment_id(pbi, mi_row, mi_col, r);
- mbmi->skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r);
- mbmi->tx_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r);
+ mbmi->segment_id = read_intra_segment_id(cm, xd, mi_row, mi_col, r);
+ mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r);
+ mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, bsize, 1, r);
mbmi->ref_frame[0] = INTRA_FRAME;
mbmi->ref_frame[1] = NONE;
if (bsize >= BLOCK_8X8) {
const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0);
- const MB_PREDICTION_MODE L = xd->left_available ?
- left_block_mode(m, left_mi, 0) : DC_PRED;
+ const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, 0);
mbmi->mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]);
} else {
// Only 4x4, 4x8, 8x4 blocks
@@ -180,8 +191,7 @@ static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m,
for (idx = 0; idx < 2; idx += num_4x4_w) {
const int ib = idy * 2 + idx;
const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, ib);
- const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
- left_block_mode(m, left_mi, ib) : DC_PRED;
+ const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, ib);
const MB_PREDICTION_MODE b_mode = read_intra_mode(r,
vp9_kf_y_mode_prob[A][L]);
m->bmi[ib].as_mode = b_mode;
@@ -200,7 +210,6 @@ static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m,
static int read_mv_component(vp9_reader *r,
const nmv_component *mvcomp, int usehp) {
-
int mag, d, fr, hp;
const int sign = vp9_read(r, mvcomp->sign);
const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
@@ -251,56 +260,10 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
mv->col = ref->col + diff.col;
}
-static void update_mv(vp9_reader *r, vp9_prob *p) {
- if (vp9_read(r, NMV_UPDATE_PROB))
- *p = (vp9_read_literal(r, 7) << 1) | 1;
-}
-
-static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int allow_hp) {
- int i, j, k;
-
- for (j = 0; j < MV_JOINTS - 1; ++j)
- update_mv(r, &mvc->joints[j]);
-
- for (i = 0; i < 2; ++i) {
- nmv_component *const comp = &mvc->comps[i];
-
- update_mv(r, &comp->sign);
-
- for (j = 0; j < MV_CLASSES - 1; ++j)
- update_mv(r, &comp->classes[j]);
-
- for (j = 0; j < CLASS0_SIZE - 1; ++j)
- update_mv(r, &comp->class0[j]);
-
- for (j = 0; j < MV_OFFSET_BITS; ++j)
- update_mv(r, &comp->bits[j]);
- }
-
- for (i = 0; i < 2; ++i) {
- nmv_component *const comp = &mvc->comps[i];
-
- for (j = 0; j < CLASS0_SIZE; ++j)
- for (k = 0; k < 3; ++k)
- update_mv(r, &comp->class0_fp[j][k]);
-
- for (j = 0; j < 3; ++j)
- update_mv(r, &comp->fp[j]);
- }
-
- if (allow_hp) {
- for (i = 0; i < 2; ++i) {
- update_mv(r, &mvc->comps[i].class0_hp);
- update_mv(r, &mvc->comps[i].hp);
- }
- }
-}
-
 // Read the reference frame
-static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r,
+static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ vp9_reader *r,
int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
FRAME_CONTEXT *const fc = &cm->fc;
FRAME_COUNTS *const counts = &cm->counts;
@@ -313,7 +276,8 @@ static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r,
if (cm->comp_pred_mode == HYBRID_PREDICTION) {
is_comp = vp9_read(r, fc->comp_inter_prob[comp_ctx]);
- counts->comp_inter[comp_ctx][is_comp]++;
+ if (!cm->frame_parallel_decoding_mode)
+ ++counts->comp_inter[comp_ctx][is_comp];
} else {
is_comp = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
}
@@ -323,18 +287,21 @@ static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r,
const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
const int ref_ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
const int b = vp9_read(r, fc->comp_ref_prob[ref_ctx]);
- counts->comp_ref[ref_ctx][b]++;
+ if (!cm->frame_parallel_decoding_mode)
+ ++counts->comp_ref[ref_ctx][b];
ref_frame[fix_ref_idx] = cm->comp_fixed_ref;
ref_frame[!fix_ref_idx] = cm->comp_var_ref[b];
} else {
const int ctx0 = vp9_get_pred_context_single_ref_p1(xd);
const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]);
- ++counts->single_ref[ctx0][0][bit0];
+ if (!cm->frame_parallel_decoding_mode)
+ ++counts->single_ref[ctx0][0][bit0];
if (bit0) {
const int ctx1 = vp9_get_pred_context_single_ref_p2(xd);
const int bit1 = vp9_read(r, fc->single_ref_prob[ctx1][1]);
ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
- ++counts->single_ref[ctx1][1][bit1];
+ if (!cm->frame_parallel_decoding_mode)
+ ++counts->single_ref[ctx1][1][bit1];
} else {
ref_frame[0] = LAST_FRAME;
}
@@ -344,43 +311,19 @@ static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r,
}
}
-static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
- int i, j;
- for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j)
- for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
-}
-static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
- int i, j;
- for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
- for (j = 0; j < INTER_MODES - 1; ++j)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
-}
-
-static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
- COMPPREDMODE_TYPE mode = vp9_read_bit(r);
- if (mode)
- mode += vp9_read_bit(r);
- return mode;
-}
-
-static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type(
- VP9D_COMP *pbi, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static INLINE INTERPOLATION_TYPE read_switchable_filter_type(
+ VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r) {
const int ctx = vp9_get_pred_context_switchable_interp(xd);
const int type = treed_read(r, vp9_switchable_interp_tree,
cm->fc.switchable_interp_prob[ctx]);
- ++cm->counts.switchable_interp[ctx][type];
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.switchable_interp[ctx][type];
return type;
}
-static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
- vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
+static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi,
+ vp9_reader *r) {
MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mi->mbmi.sb_type;
@@ -388,9 +331,7 @@ static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
mbmi->ref_frame[1] = NONE;
if (bsize >= BLOCK_8X8) {
- const int size_group = size_group_lookup[bsize];
- mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[size_group]);
- cm->counts.y_mode[size_group][mbmi->mode]++;
+ mbmi->mode = read_intra_mode_y(cm, r, size_group_lookup[bsize]);
} else {
// Only 4x4, 4x8, 8x4 blocks
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2
@@ -400,10 +341,8 @@ static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
for (idy = 0; idy < 2; idy += num_4x4_h) {
for (idx = 0; idx < 2; idx += num_4x4_w) {
const int ib = idy * 2 + idx;
- const int b_mode = read_intra_mode(r, cm->fc.y_mode_prob[0]);
+ const int b_mode = read_intra_mode_y(cm, r, 0);
mi->bmi[ib].as_mode = b_mode;
- cm->counts.y_mode[0][b_mode]++;
-
if (num_4x4_h == 2)
mi->bmi[ib + 2].as_mode = b_mode;
if (num_4x4_w == 2)
@@ -413,55 +352,98 @@ static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
mbmi->mode = mi->bmi[3].as_mode;
}
- mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
- cm->counts.uv_mode[mbmi->mode][mbmi->uv_mode]++;
+ mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
}
-static int read_is_inter_block(VP9D_COMP *pbi, int segment_id, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
+ int_mv mv[2], int_mv best_mv[2],
+ int_mv nearest_mv[2], int_mv near_mv[2],
+ int is_compound, int allow_hp, vp9_reader *r) {
+ int i;
+ int ret = 1;
+
+ switch (mode) {
+ case NEWMV: {
+ nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ?
+ NULL : &cm->counts.mv;
+ read_mv(r, &mv[0].as_mv, &best_mv[0].as_mv,
+ &cm->fc.nmvc, mv_counts, allow_hp);
+ if (is_compound)
+ read_mv(r, &mv[1].as_mv, &best_mv[1].as_mv,
+ &cm->fc.nmvc, mv_counts, allow_hp);
+ for (i = 0; i < 1 + is_compound; ++i) {
+ ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW;
+ ret = ret && mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW;
+ }
+ break;
+ }
+ case NEARESTMV: {
+ mv[0].as_int = nearest_mv[0].as_int;
+ if (is_compound) mv[1].as_int = nearest_mv[1].as_int;
+ break;
+ }
+ case NEARMV: {
+ mv[0].as_int = near_mv[0].as_int;
+ if (is_compound) mv[1].as_int = near_mv[1].as_int;
+ break;
+ }
+ case ZEROMV: {
+ mv[0].as_int = 0;
+ if (is_compound) mv[1].as_int = 0;
+ break;
+ }
+ default: {
+ return 0;
+ }
+ }
+ return ret;
+}
+static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ int segment_id, vp9_reader *r) {
if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
return vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) !=
INTRA_FRAME;
} else {
const int ctx = vp9_get_pred_context_intra_inter(xd);
const int is_inter = vp9_read(r, vp9_get_pred_prob_intra_inter(cm, xd));
- ++cm->counts.intra_inter[ctx][is_inter];
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.intra_inter[ctx][is_inter];
return is_inter;
}
}
-static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
+static void read_inter_block_mode_info(VP9_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ MODE_INFO *const mi,
int mi_row, int mi_col, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
- nmv_context *const nmvc = &cm->fc.nmvc;
MB_MODE_INFO *const mbmi = &mi->mbmi;
- int_mv *const mv0 = &mbmi->mv[0];
- int_mv *const mv1 = &mbmi->mv[1];
const BLOCK_SIZE bsize = mbmi->sb_type;
- const int allow_hp = xd->allow_high_precision_mv;
+ const int allow_hp = cm->allow_high_precision_mv;
- int_mv nearest, nearby, best_mv;
- int_mv nearest_second, nearby_second, best_mv_second;
+ int_mv nearest[2], nearmv[2], best[2];
uint8_t inter_mode_ctx;
MV_REFERENCE_FRAME ref0;
int is_compound;
mbmi->uv_mode = DC_PRED;
- read_ref_frames(pbi, r, mbmi->segment_id, mbmi->ref_frame);
+ read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
ref0 = mbmi->ref_frame[0];
is_compound = has_second_ref(mbmi);
- vp9_find_mv_refs(cm, xd, mi, xd->last_mi, ref0, mbmi->ref_mvs[ref0],
+ vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, ref0, mbmi->ref_mvs[ref0],
mi_row, mi_col);
inter_mode_ctx = mbmi->mode_context[ref0];
if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
mbmi->mode = ZEROMV;
- assert(bsize >= BLOCK_8X8);
+ if (bsize < BLOCK_8X8) {
+ vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Invalid usage of segment feature on small blocks");
+ return;
+ }
} else {
if (bsize >= BLOCK_8X8)
mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx);
@@ -469,222 +451,119 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
// nearest, nearby
if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
- vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref0], &nearest, &nearby);
- best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int;
+ vp9_find_best_ref_mvs(xd, allow_hp,
+ mbmi->ref_mvs[ref0], &nearest[0], &nearmv[0]);
+ best[0].as_int = nearest[0].as_int;
}
if (is_compound) {
const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
- vp9_find_mv_refs(cm, xd, mi, xd->last_mi,
+ vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi,
ref1, mbmi->ref_mvs[ref1], mi_row, mi_col);
if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
- vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref1],
- &nearest_second, &nearby_second);
- best_mv_second.as_int = mbmi->ref_mvs[ref1][0].as_int;
+ vp9_find_best_ref_mvs(xd, allow_hp,
+ mbmi->ref_mvs[ref1], &nearest[1], &nearmv[1]);
+ best[1].as_int = nearest[1].as_int;
}
}
- mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
- ? read_switchable_filter_type(pbi, r)
- : cm->mcomp_filter_type;
+ mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE)
+ ? read_switchable_filter_type(cm, xd, r)
+ : cm->mcomp_filter_type;
if (bsize < BLOCK_8X8) {
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2
const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2
int idx, idy;
+ int b_mode;
for (idy = 0; idy < 2; idy += num_4x4_h) {
for (idx = 0; idx < 2; idx += num_4x4_w) {
- int_mv blockmv, secondmv;
+ int_mv block[2];
const int j = idy * 2 + idx;
- const int b_mode = read_inter_mode(cm, r, inter_mode_ctx);
+ b_mode = read_inter_mode(cm, r, inter_mode_ctx);
if (b_mode == NEARESTMV || b_mode == NEARMV) {
- vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0,
+ vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[0],
+ &nearmv[0], j, 0,
mi_row, mi_col);
if (is_compound)
- vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest_second,
- &nearby_second, j, 1,
- mi_row, mi_col);
+ vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[1],
+ &nearmv[1], j, 1,
+ mi_row, mi_col);
}
- switch (b_mode) {
- case NEWMV:
- read_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc,
- &cm->counts.mv, allow_hp);
-
- if (is_compound)
- read_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
- &cm->counts.mv, allow_hp);
- break;
- case NEARESTMV:
- blockmv.as_int = nearest.as_int;
- if (is_compound)
- secondmv.as_int = nearest_second.as_int;
- break;
- case NEARMV:
- blockmv.as_int = nearby.as_int;
- if (is_compound)
- secondmv.as_int = nearby_second.as_int;
- break;
- case ZEROMV:
- blockmv.as_int = 0;
- if (is_compound)
- secondmv.as_int = 0;
- break;
- default:
- assert(!"Invalid inter mode value");
- }
- mi->bmi[j].as_mv[0].as_int = blockmv.as_int;
+ if (!assign_mv(cm, b_mode, block, best, nearest, nearmv,
+ is_compound, allow_hp, r)) {
+ xd->corrupted |= 1;
+ break;
+      }
+
+ mi->bmi[j].as_mv[0].as_int = block[0].as_int;
if (is_compound)
- mi->bmi[j].as_mv[1].as_int = secondmv.as_int;
+ mi->bmi[j].as_mv[1].as_int = block[1].as_int;
if (num_4x4_h == 2)
mi->bmi[j + 2] = mi->bmi[j];
if (num_4x4_w == 2)
mi->bmi[j + 1] = mi->bmi[j];
- mi->mbmi.mode = b_mode;
}
}
- mv0->as_int = mi->bmi[3].as_mv[0].as_int;
- mv1->as_int = mi->bmi[3].as_mv[1].as_int;
- } else {
- switch (mbmi->mode) {
- case NEARMV:
- mv0->as_int = nearby.as_int;
- if (is_compound)
- mv1->as_int = nearby_second.as_int;
- break;
+ mi->mbmi.mode = b_mode;
- case NEARESTMV:
- mv0->as_int = nearest.as_int;
- if (is_compound)
- mv1->as_int = nearest_second.as_int;
- break;
-
- case ZEROMV:
- mv0->as_int = 0;
- if (is_compound)
- mv1->as_int = 0;
- break;
-
- case NEWMV:
- read_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->counts.mv, allow_hp);
- if (is_compound)
- read_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc, &cm->counts.mv,
- allow_hp);
- break;
- default:
- assert(!"Invalid inter mode value");
- }
+ mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+ } else {
+ xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv,
+ best, nearest, nearmv,
+ is_compound, allow_hp, r);
}
}
-static void read_inter_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
+static void read_inter_frame_mode_info(VP9_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ MODE_INFO *const mi,
int mi_row, int mi_col, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
MB_MODE_INFO *const mbmi = &mi->mbmi;
int inter_block;
mbmi->mv[0].as_int = 0;
mbmi->mv[1].as_int = 0;
- mbmi->segment_id = read_inter_segment_id(pbi, mi_row, mi_col, r);
- mbmi->skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r);
- inter_block = read_is_inter_block(pbi, mbmi->segment_id, r);
- mbmi->tx_size = read_tx_size(pbi, cm->tx_mode, mbmi->sb_type,
+ mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
+ mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r);
+ inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
+ mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type,
!mbmi->skip_coeff || !inter_block, r);
if (inter_block)
- read_inter_block_mode_info(pbi, mi, mi_row, mi_col, r);
+ read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
else
- read_intra_block_mode_info(pbi, mi, r);
+ read_intra_block_mode_info(cm, mi, r);
}
-static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) {
- int i;
-
- cm->comp_pred_mode = cm->allow_comp_inter_inter ? read_comp_pred_mode(r)
- : SINGLE_PREDICTION_ONLY;
-
- if (cm->comp_pred_mode == HYBRID_PREDICTION)
- for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
-
- if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
- for (i = 0; i < REF_CONTEXTS; i++) {
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
- }
-
- if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
- for (i = 0; i < REF_CONTEXTS; i++)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
-}
-
-void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- int k;
-
- // TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove.
- // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs));
- for (k = 0; k < MBSKIP_CONTEXTS; ++k)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]);
-
- if (cm->frame_type != KEY_FRAME && !cm->intra_only) {
- nmv_context *const nmvc = &pbi->common.fc.nmvc;
- MACROBLOCKD *const xd = &pbi->mb;
- int i, j;
-
- read_inter_mode_probs(&cm->fc, r);
-
- if (cm->mcomp_filter_type == SWITCHABLE)
- read_switchable_interp_probs(&cm->fc, r);
-
- for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]);
-
- read_comp_pred(cm, r);
-
- for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
- for (i = 0; i < INTRA_MODES - 1; ++i)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.y_mode_prob[j][i]);
-
- for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j)
- for (i = 0; i < PARTITION_TYPES - 1; ++i)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &cm->fc.partition_prob[INTER_FRAME][j][i]);
-
- read_mv_probs(r, nmvc, xd->allow_high_precision_mv);
- }
-}
-
-void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
- MODE_INFO *mi = xd->this_mi;
+void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *const tile,
+ int mi_row, int mi_col, vp9_reader *r) {
+ MODE_INFO *const mi = xd->mi_8x8[0];
const BLOCK_SIZE bsize = mi->mbmi.sb_type;
- const int bw = 1 << mi_width_log2(bsize);
- const int bh = 1 << mi_height_log2(bsize);
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
const int y_mis = MIN(bh, cm->mi_rows - mi_row);
const int x_mis = MIN(bw, cm->mi_cols - mi_col);
int x, y, z;
- if (cm->frame_type == KEY_FRAME || cm->intra_only)
- read_intra_frame_mode_info(pbi, mi, mi_row, mi_col, r);
+ if (frame_is_intra_only(cm))
+ read_intra_frame_mode_info(cm, xd, mi, mi_row, mi_col, r);
else
- read_inter_frame_mode_info(pbi, mi, mi_row, mi_col, r);
+ read_inter_frame_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
- for (y = 0, z = 0; y < y_mis; y++, z += cm->mode_info_stride)
+ for (y = 0, z = 0; y < y_mis; y++, z += cm->mode_info_stride) {
for (x = !y; x < x_mis; x++) {
- xd->mi_8x8[z + x] = mi;
- }
+ xd->mi_8x8[z + x] = mi;
+ }
+ }
}
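The change running through this whole file gates every counter update behind cm->frame_parallel_decoding_mode: in frame-parallel decoding, backward probability adaptation is disabled, so the counts are never consumed and updating them from concurrent frame threads would only introduce data races. The recurring pattern, in sketch form (names taken from the diff; an illustration that assumes the decoder's headers, not additional libvpx code):

/* Decode one boolean symbol, accumulating adaptation counts only when
 * backward adaptation is actually in use. */
static int read_and_count(VP9_COMMON *cm, vp9_reader *r,
                          vp9_prob prob, unsigned int counts[2]) {
  const int bit = vp9_read(r, prob);
  if (!cm->frame_parallel_decoding_mode)
    ++counts[bit];
  return bit;
}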
diff --git a/libvpx/vp9/decoder/vp9_decodemv.h b/libvpx/vp9/decoder/vp9_decodemv.h
index 462d2e3..8e9ae4a 100644
--- a/libvpx/vp9/decoder/vp9_decodemv.h
+++ b/libvpx/vp9/decoder/vp9_decodemv.h
@@ -14,8 +14,10 @@
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_dboolhuff.h"
-void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r);
+struct TileInfo;
-void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r);
+void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
+ const struct TileInfo *const tile,
+ int mi_row, int mi_col, vp9_reader *r);
#endif // VP9_DECODER_VP9_DECODEMV_H_
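Note the forward declaration: the prototype only takes a pointer to TileInfo, so an incomplete type suffices and the header avoids including TileInfo's definition. A minimal illustration with hypothetical names:

struct Widget;                        /* incomplete type: pointers only */
void frob(const struct Widget *w);    /* compiles without the definition */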
diff --git a/libvpx/vp9/decoder/vp9_decodframe.c b/libvpx/vp9/decoder/vp9_decodframe.c
index dbba28e..4746a3a 100644
--- a/libvpx/vp9/decoder/vp9_decodframe.c
+++ b/libvpx/vp9/decoder/vp9_decodframe.c
@@ -19,6 +19,7 @@
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_extend.h"
+#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_reconintra.h"
@@ -31,16 +32,49 @@
#include "vp9/decoder/vp9_detokenize.h"
#include "vp9/decoder/vp9_decodemv.h"
#include "vp9/decoder/vp9_dsubexp.h"
-#include "vp9/decoder/vp9_idct_blk.h"
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_read_bit_buffer.h"
#include "vp9/decoder/vp9_thread.h"
#include "vp9/decoder/vp9_treereader.h"
+typedef struct TileWorkerData {
+ VP9_COMMON *cm;
+ vp9_reader bit_reader;
+ DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+ DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
+} TileWorkerData;
+
static int read_be32(const uint8_t *p) {
return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
}
+static int is_compound_prediction_allowed(const VP9_COMMON *cm) {
+ int i;
+ for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i)
+ if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1])
+ return 1;
+
+ return 0;
+}
+
+static void setup_compound_prediction(VP9_COMMON *cm) {
+ if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+ cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
+ cm->comp_fixed_ref = ALTREF_FRAME;
+ cm->comp_var_ref[0] = LAST_FRAME;
+ cm->comp_var_ref[1] = GOLDEN_FRAME;
+ } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+ cm->ref_frame_sign_bias[ALTREF_FRAME]) {
+ cm->comp_fixed_ref = GOLDEN_FRAME;
+ cm->comp_var_ref[0] = LAST_FRAME;
+ cm->comp_var_ref[1] = ALTREF_FRAME;
+ } else {
+ cm->comp_fixed_ref = LAST_FRAME;
+ cm->comp_var_ref[0] = GOLDEN_FRAME;
+ cm->comp_var_ref[1] = ALTREF_FRAME;
+ }
+}
+
// len == 0 is not allowed
static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
return start + len > start && start + len <= end;
@@ -63,18 +97,105 @@ static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) {
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
for (j = 0; j < TX_SIZES - 3; ++j)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]);
+ vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]);
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
for (j = 0; j < TX_SIZES - 2; ++j)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]);
+ vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]);
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
for (j = 0; j < TX_SIZES - 1; ++j)
- if (vp9_read(r, MODE_UPDATE_PROB))
- vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
+ vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
+}
+
+static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
+ int i, j;
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+ for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
+ vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
+}
+
+static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
+ int i, j;
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ for (j = 0; j < INTER_MODES - 1; ++j)
+ vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
+}
+
+static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
+ COMPPREDMODE_TYPE mode = vp9_read_bit(r);
+ if (mode)
+ mode += vp9_read_bit(r);
+ return mode;
+}
+
+static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) {
+ int i;
+
+ const int compound_allowed = is_compound_prediction_allowed(cm);
+ cm->comp_pred_mode = compound_allowed ? read_comp_pred_mode(r)
+ : SINGLE_PREDICTION_ONLY;
+ if (compound_allowed)
+ setup_compound_prediction(cm);
+
+ if (cm->comp_pred_mode == HYBRID_PREDICTION)
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+ vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
+
+ if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
+ for (i = 0; i < REF_CONTEXTS; i++) {
+ vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
+ vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
+ }
+
+ if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+ for (i = 0; i < REF_CONTEXTS; i++)
+ vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
+}
+
+static void update_mv(vp9_reader *r, vp9_prob *p) {
+ if (vp9_read(r, NMV_UPDATE_PROB))
+ *p = (vp9_read_literal(r, 7) << 1) | 1;
+}
+
+static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int allow_hp) {
+ int i, j, k;
+
+ for (j = 0; j < MV_JOINTS - 1; ++j)
+ update_mv(r, &mvc->joints[j]);
+
+ for (i = 0; i < 2; ++i) {
+ nmv_component *const comp = &mvc->comps[i];
+
+ update_mv(r, &comp->sign);
+
+ for (j = 0; j < MV_CLASSES - 1; ++j)
+ update_mv(r, &comp->classes[j]);
+
+ for (j = 0; j < CLASS0_SIZE - 1; ++j)
+ update_mv(r, &comp->class0[j]);
+
+ for (j = 0; j < MV_OFFSET_BITS; ++j)
+ update_mv(r, &comp->bits[j]);
+ }
+
+ for (i = 0; i < 2; ++i) {
+ nmv_component *const comp = &mvc->comps[i];
+
+ for (j = 0; j < CLASS0_SIZE; ++j)
+ for (k = 0; k < 3; ++k)
+ update_mv(r, &comp->class0_fp[j][k]);
+
+ for (j = 0; j < 3; ++j)
+ update_mv(r, &comp->fp[j]);
+ }
+
+ if (allow_hp) {
+ for (i = 0; i < 2; ++i) {
+ update_mv(r, &mvc->comps[i].class0_hp);
+ update_mv(r, &mvc->comps[i].hp);
+ }
+ }
}
static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
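update_mv() stores (literal << 1) | 1, so a 7-bit update encodes exactly the odd probabilities 1, 3, ..., 255, and MV probabilities always keep their low bit set. A standalone check of that mapping (illustration only):

#include <assert.h>

int main(void) {
  int k;
  for (k = 0; k < 128; ++k) {
    const int prob = (k << 1) | 1;  /* the mapping used by update_mv() */
    assert(prob >= 1 && prob <= 255 && (prob & 1) == 1);
  }
  return 0;
}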
@@ -85,47 +206,110 @@ static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
xd->plane[i].dequant = cm->uv_dequant[q_index];
}
-static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
- MACROBLOCKD* const xd = arg;
+// Allocate storage for each tile column.
+// TODO(jzern): when max_threads <= 1 the same storage could be used for each
+// tile.
+static void alloc_tile_storage(VP9D_COMP *pbi, int tile_cols) {
+ VP9_COMMON *const cm = &pbi->common;
+ const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ int i, tile_col;
+
+ CHECK_MEM_ERROR(cm, pbi->mi_streams,
+ vpx_realloc(pbi->mi_streams, tile_cols *
+ sizeof(*pbi->mi_streams)));
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileInfo tile;
+
+ vp9_tile_init(&tile, cm, 0, tile_col);
+ pbi->mi_streams[tile_col] =
+ &cm->mi[cm->mi_rows * tile.mi_col_start];
+ }
+
+ // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
+ // block where mi unit size is 8x8.
+ CHECK_MEM_ERROR(cm, pbi->above_context[0],
+ vpx_realloc(pbi->above_context[0],
+ sizeof(*pbi->above_context[0]) * MAX_MB_PLANE *
+ 2 * aligned_mi_cols));
+ for (i = 1; i < MAX_MB_PLANE; ++i) {
+ pbi->above_context[i] = pbi->above_context[0] +
+ i * sizeof(*pbi->above_context[0]) *
+ 2 * aligned_mi_cols;
+ }
+
+ // This is sized based on the entire frame. Each tile operates within its
+ // column bounds.
+ CHECK_MEM_ERROR(cm, pbi->above_seg_context,
+ vpx_realloc(pbi->above_seg_context,
+ sizeof(*pbi->above_seg_context) *
+ aligned_mi_cols));
+}
+
+static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
+ int16_t* const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const int stride = pd->dst.stride;
const int eob = pd->eobs[block];
- const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
- block);
- uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
- pd->dst.buf, stride);
- switch (tx_size) {
- case TX_4X4: {
- const TX_TYPE tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block);
- if (tx_type == DCT_DCT)
- xd->itxm_add(qcoeff, dst, stride, eob);
+ if (eob > 0) {
+ TX_TYPE tx_type;
+ const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
+ block);
+ uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
+ pd->dst.buf, stride);
+ switch (tx_size) {
+ case TX_4X4:
+ tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block);
+ if (tx_type == DCT_DCT)
+ xd->itxm_add(dqcoeff, dst, stride, eob);
+ else
+ vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type);
+ break;
+ case TX_8X8:
+ tx_type = get_tx_type_8x8(pd->plane_type, xd);
+ vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_16X16:
+ tx_type = get_tx_type_16x16(pd->plane_type, xd);
+ vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_32X32:
+ tx_type = DCT_DCT;
+ vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+ break;
+ default:
+ assert(!"Invalid transform size");
+ }
+
+ if (eob == 1) {
+ vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
+ } else {
+ if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
+ vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+ else if (tx_size == TX_32X32 && eob <= 34)
+ vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
else
- vp9_iht_add_c(tx_type, qcoeff, dst, stride, eob);
- break;
+ vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
}
- case TX_8X8:
- vp9_iht_add_8x8_c(get_tx_type_8x8(pd->plane_type, xd), qcoeff, dst,
- stride, eob);
- break;
- case TX_16X16:
- vp9_iht_add_16x16_c(get_tx_type_16x16(pd->plane_type, xd), qcoeff, dst,
- stride, eob);
- break;
- case TX_32X32:
- vp9_idct_add_32x32(qcoeff, dst, stride, eob);
- break;
- default:
- assert(!"Invalid transform size");
}
}
-static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
- MACROBLOCKD* const xd = arg;
+struct intra_args {
+ VP9_COMMON *cm;
+ MACROBLOCKD *xd;
+ vp9_reader *r;
+ unsigned char* token_cache;
+};
+
+static void predict_and_reconstruct_intra_block(int plane, int block,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct intra_args *const args = arg;
+ VP9_COMMON *const cm = args->cm;
+ MACROBLOCKD *const xd = args->xd;
+
struct macroblockd_plane *const pd = &xd->plane[plane];
- MODE_INFO *const mi = xd->this_mi;
+ MODE_INFO *const mi = xd->mi_8x8[0];
const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
block);
uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
@@ -142,32 +326,37 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
b_width_log2(plane_bsize), tx_size, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
- if (!mi->mbmi.skip_coeff)
- decode_block(plane, block, plane_bsize, tx_size, arg);
+ if (!mi->mbmi.skip_coeff) {
+ vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size,
+ args->r, args->token_cache);
+ inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
+ }
}
-static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE bsize, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
-
- if (mbmi->skip_coeff) {
- reset_skip_context(xd, bsize);
- return -1;
- } else {
- if (cm->seg.enabled)
- setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
- cm->base_qindex));
-
- // TODO(dkovalev) if (!vp9_reader_has_error(r))
- return vp9_decode_tokens(pbi, r, bsize);
- }
+struct inter_args {
+ VP9_COMMON *cm;
+ MACROBLOCKD *xd;
+ vp9_reader *r;
+ int *eobtotal;
+ unsigned char* token_cache;
+};
+
+static void reconstruct_inter_block(int plane, int block,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct inter_args *args = arg;
+ VP9_COMMON *const cm = args->cm;
+ MACROBLOCKD *const xd = args->xd;
+
+ *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
+ plane_bsize, tx_size,
+ args->r, args->token_cache);
+ inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
}
-static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize,
- int mi_row, int mi_col) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int bh = num_8x8_blocks_high_lookup[bsize];
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int offset = mi_row * cm->mode_info_stride + mi_col;
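The eob-based memsets in inverse_transform_block() above clear only the coefficients the scan order could have populated: 2 values when eob == 1 (DC only), 4 * (4 << tx_size) for small eobs in DCT_DCT blocks up to 16x16, 256 for a 32x32 with eob <= 34, and the full 16 << (tx_size << 1) otherwise. A quick sanity check of those sizes (assuming tx_size runs 0..3 for 4x4..32x32; illustration only):

#include <assert.h>

int main(void) {
  int tx_size;
  for (tx_size = 0; tx_size <= 3; ++tx_size) {
    const int full = 16 << (tx_size << 1);   /* 16, 64, 256, 1024 coeffs */
    const int partial = 4 * (4 << tx_size);  /* 16, 32, 64, 128 coeffs */
    assert(partial <= full);                 /* partial clear never overruns */
  }
  return 0;
}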
@@ -178,178 +367,187 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize,
xd->prev_mi_8x8 = cm->prev_mi_grid_visible + offset;
// we are using the mode info context stream here
- xd->this_mi =
- xd->mi_8x8[0] = xd->mic_stream_ptr;
- xd->this_mi->mbmi.sb_type = bsize;
- xd->mic_stream_ptr++;
+ xd->mi_8x8[0] = xd->mi_stream;
+ xd->mi_8x8[0]->mbmi.sb_type = bsize;
+ ++xd->mi_stream;
// Special case: if prev_mi is NULL, the previous mode info context
// cannot be used.
xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL;
- set_skip_context(cm, xd, mi_row, mi_col);
- set_partition_seg_context(cm, xd, mi_row, mi_col);
+ set_skip_context(xd, xd->above_context, xd->left_context, mi_row, mi_col);
// Distance of Mb to the various image edges. These are specified to 8th pel
// as they are always compared to values that are in 1/8th pel units
- set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
- setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], mi_row, mi_col);
+ setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col);
}
-static void set_ref(VP9D_COMP *pbi, int i, int mi_row, int mi_col) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
- const int ref = mbmi->ref_frame[i] - LAST_FRAME;
- const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[ref]];
- const struct scale_factors *sf = &cm->active_ref_scale[ref];
- if (!vp9_is_valid_scale(sf))
+static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ int idx, int mi_row, int mi_col) {
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+ const int ref = mbmi->ref_frame[idx] - LAST_FRAME;
+ const YV12_BUFFER_CONFIG *cfg = get_frame_ref_buffer(cm, ref);
+ const struct scale_factors_common *sfc = &cm->active_ref_scale_comm[ref];
+ if (!vp9_is_valid_scale(sfc))
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid scale factors");
- xd->scale_factor[i] = *sf;
- setup_pre_planes(xd, i, cfg, mi_row, mi_col, sf);
+ xd->scale_factor[idx].sfc = sfc;
+ setup_pre_planes(xd, idx, cfg, mi_row, mi_col, &xd->scale_factor[idx]);
xd->corrupted |= cfg->corrupted;
}
-static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
- vp9_reader *r, BLOCK_SIZE bsize) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ int mi_row, int mi_col,
+ vp9_reader *r, BLOCK_SIZE bsize,
+ unsigned char *token_cache) {
const int less8x8 = bsize < BLOCK_8X8;
MB_MODE_INFO *mbmi;
- if (less8x8)
- if (xd->ab_index > 0)
- return;
-
- set_offsets(pbi, bsize, mi_row, mi_col);
- vp9_read_mode_info(pbi, mi_row, mi_col, r);
+ set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
+ vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
if (less8x8)
bsize = BLOCK_8X8;
// Has to be called after set_offsets
- mbmi = &xd->this_mi->mbmi;
+ mbmi = &xd->mi_8x8[0]->mbmi;
- if (!is_inter_block(mbmi)) {
- // Intra reconstruction
- decode_tokens(pbi, bsize, r);
- foreach_transformed_block(xd, bsize, decode_block_intra, xd);
+ if (mbmi->skip_coeff) {
+ reset_skip_context(xd, bsize);
} else {
- // Inter reconstruction
- int eobtotal;
+ if (cm->seg.enabled)
+ setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
+ cm->base_qindex));
+ }
- set_ref(pbi, 0, mi_row, mi_col);
+ if (!is_inter_block(mbmi)) {
+ struct intra_args arg = { cm, xd, r, token_cache };
+ foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
+ &arg);
+ } else {
+ // Setup
+ set_ref(cm, xd, 0, mi_row, mi_col);
if (has_second_ref(mbmi))
- set_ref(pbi, 1, mi_row, mi_col);
+ set_ref(cm, xd, 1, mi_row, mi_col);
- vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+ xd->subpix.filter_x = xd->subpix.filter_y =
+ vp9_get_filter_kernel(mbmi->interp_filter);
+
+ // Prediction
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
- eobtotal = decode_tokens(pbi, bsize, r);
- if (less8x8) {
- if (eobtotal >= 0)
- foreach_transformed_block(xd, bsize, decode_block, xd);
- } else {
- assert(mbmi->sb_type == bsize);
- if (eobtotal == 0)
- // skip loopfilter
- vp9_set_pred_flag_mbskip(xd, bsize, 1);
- else if (eobtotal > 0)
- foreach_transformed_block(xd, bsize, decode_block, xd);
+
+ // Reconstruction
+ if (!mbmi->skip_coeff) {
+ int eobtotal = 0;
+ struct inter_args arg = { cm, xd, r, &eobtotal, token_cache };
+ foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
+ if (!less8x8 && eobtotal == 0)
+ mbmi->skip_coeff = 1; // skip loopfilter
}
}
+
xd->corrupted |= vp9_reader_has_error(r);
}
-static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col,
- vp9_reader* r, BLOCK_SIZE bsize) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ vp9_reader *r) {
+ const int ctx = partition_plane_context(xd->above_seg_context,
+ xd->left_seg_context,
+ mi_row, mi_col, bsize);
+ const vp9_prob *const probs = get_partition_probs(cm, ctx);
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+ PARTITION_TYPE p;
+
+ if (has_rows && has_cols)
+ p = treed_read(r, vp9_partition_tree, probs);
+ else if (!has_rows && has_cols)
+ p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
+ else if (has_rows && !has_cols)
+ p = vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
+ else
+ p = PARTITION_SPLIT;
+
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.partition[ctx][p];
+
+ return p;
+}
+
+static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ int mi_row, int mi_col,
+ vp9_reader* r, BLOCK_SIZE bsize,
+ unsigned char *token_cache) {
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
- PARTITION_TYPE partition = PARTITION_NONE;
+ PARTITION_TYPE partition;
BLOCK_SIZE subsize;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- if (bsize < BLOCK_8X8) {
- if (xd->ab_index != 0)
- return;
- } else {
- int pl;
- const int idx = check_bsize_coverage(hbs, cm->mi_rows, cm->mi_cols,
- mi_row, mi_col);
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
-
- if (idx == 0)
- partition = treed_read(r, vp9_partition_tree,
- cm->fc.partition_prob[cm->frame_type][pl]);
- else if (idx > 0 &&
- !vp9_read(r, cm->fc.partition_prob[cm->frame_type][pl][idx]))
- partition = (idx == 1) ? PARTITION_HORZ : PARTITION_VERT;
- else
- partition = PARTITION_SPLIT;
-
- cm->counts.partition[pl][partition]++;
- }
-
+ partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
subsize = get_subsize(bsize, partition);
- *get_sb_index(xd, subsize) = 0;
-
- switch (partition) {
- case PARTITION_NONE:
- decode_modes_b(pbi, mi_row, mi_col, r, subsize);
- break;
- case PARTITION_HORZ:
- decode_modes_b(pbi, mi_row, mi_col, r, subsize);
- *get_sb_index(xd, subsize) = 1;
- if (mi_row + hbs < cm->mi_rows)
- decode_modes_b(pbi, mi_row + hbs, mi_col, r, subsize);
- break;
- case PARTITION_VERT:
- decode_modes_b(pbi, mi_row, mi_col, r, subsize);
- *get_sb_index(xd, subsize) = 1;
- if (mi_col + hbs < cm->mi_cols)
- decode_modes_b(pbi, mi_row, mi_col + hbs, r, subsize);
- break;
- case PARTITION_SPLIT: {
- int n;
- for (n = 0; n < 4; n++) {
- const int j = n >> 1, i = n & 1;
- *get_sb_index(xd, subsize) = n;
- decode_modes_sb(pbi, mi_row + j * hbs, mi_col + i * hbs, r, subsize);
- }
- } break;
- default:
- assert(!"Invalid partition type");
+ if (subsize < BLOCK_8X8) {
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ break;
+ case PARTITION_HORZ:
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ if (mi_row + hbs < cm->mi_rows)
+ decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
+ token_cache);
+ break;
+ case PARTITION_VERT:
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ if (mi_col + hbs < cm->mi_cols)
+ decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
+ token_cache);
+ break;
+ case PARTITION_SPLIT:
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize,
+ token_cache);
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
+ token_cache);
+ decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
+ token_cache);
+ decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize,
+ token_cache);
+ break;
+ default:
+ assert(!"Invalid partition type");
+ }
}
// update partition context
if (bsize >= BLOCK_8X8 &&
- (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- update_partition_context(xd, subsize, bsize);
- }
+ (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
+ update_partition_context(xd->above_seg_context, xd->left_seg_context,
+ mi_row, mi_col, subsize, bsize);
}
-static void setup_token_decoder(VP9D_COMP *pbi,
- const uint8_t *data, size_t read_size,
+static void setup_token_decoder(const uint8_t *data,
+ const uint8_t *data_end,
+ size_t read_size,
+ struct vpx_internal_error_info *error_info,
vp9_reader *r) {
- VP9_COMMON *cm = &pbi->common;
- const uint8_t *data_end = pbi->source + pbi->source_sz;
-
// Validate the calculated partition length. If the buffer
// described by the partition can't be fully read, then restrict
// it to the portion that can be (for EC mode) or throw an error.
if (!read_is_valid(data, read_size, data_end))
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt tile length");
if (vp9_reader_init(r, data, read_size))
- vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR,
"Failed to allocate bool decoder %d", 1);
}
@@ -364,22 +562,15 @@ static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs,
for (l = 0; l < PREV_COEF_CONTEXTS; l++)
if (k > 0 || l < 3)
for (m = 0; m < UNCONSTRAINED_NODES; m++)
- if (vp9_read(r, VP9_COEF_UPDATE_PROB))
- vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]);
+ vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]);
}
static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
vp9_reader *r) {
- read_coef_probs_common(fc->coef_probs[TX_4X4], r);
-
- if (tx_mode > ONLY_4X4)
- read_coef_probs_common(fc->coef_probs[TX_8X8], r);
-
- if (tx_mode > ALLOW_8X8)
- read_coef_probs_common(fc->coef_probs[TX_16X16], r);
-
- if (tx_mode > ALLOW_16X16)
- read_coef_probs_common(fc->coef_probs[TX_32X32], r);
+ const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ TX_SIZE tx_size;
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ read_coef_probs_common(fc->coef_probs[tx_size], r);
}
static void setup_segmentation(struct segmentation *seg,
@@ -436,7 +627,6 @@ static void setup_segmentation(struct segmentation *seg,
static void setup_loopfilter(struct loopfilter *lf,
struct vp9_read_bit_buffer *rb) {
-
lf->filter_level = vp9_rb_read_literal(rb, 6);
lf->sharpness_level = vp9_rb_read_literal(rb, 3);
@@ -467,9 +657,8 @@ static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) {
return old != *delta_q;
}
-static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
- MACROBLOCKD *const xd = &pbi->mb;
- VP9_COMMON *const cm = &pbi->common;
+static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ struct vp9_read_bit_buffer *rb) {
int update = 0;
cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS);
@@ -484,16 +673,15 @@ static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
- xd->itxm_add = xd->lossless ? vp9_idct_add_lossless_c
- : vp9_idct_add;
+ xd->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
}
-static INTERPOLATIONFILTERTYPE read_interp_filter_type(
- struct vp9_read_bit_buffer *rb) {
- const INTERPOLATIONFILTERTYPE literal_to_type[] = { EIGHTTAP_SMOOTH,
- EIGHTTAP,
- EIGHTTAP_SHARP,
- BILINEAR };
+static INTERPOLATION_TYPE read_interp_filter_type(
+ struct vp9_read_bit_buffer *rb) {
+ const INTERPOLATION_TYPE literal_to_type[] = { EIGHTTAP_SMOOTH,
+ EIGHTTAP,
+ EIGHTTAP_SHARP,
+ BILINEAR };
return vp9_rb_read_bit(rb) ? SWITCHABLE
: literal_to_type[vp9_rb_read_literal(rb, 2)];
}
@@ -539,7 +727,7 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) {
vp9_update_frame_size(cm);
}
- vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx], cm->width, cm->height,
+ vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
VP9BORDERINPIXELS);
}
@@ -560,7 +748,7 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi,
int found = 0, i;
for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
if (vp9_rb_read_bit(rb)) {
- YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[i]];
+ YV12_BUFFER_CONFIG *const cfg = get_frame_ref_buffer(cm, i);
width = cfg->y_crop_width;
height = cfg->y_crop_height;
found = 1;
@@ -579,67 +767,73 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi,
setup_display_size(cm, rb);
}
-static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) {
+static void setup_tile_context(VP9D_COMP *const pbi, MACROBLOCKD *const xd,
+ int tile_col) {
+ int i;
+ xd->mi_stream = pbi->mi_streams[tile_col];
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ xd->above_context[i] = pbi->above_context[i];
+ }
+ // see note in alloc_tile_storage().
+ xd->above_seg_context = pbi->above_seg_context;
+}
+
+static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile,
+ vp9_reader *r) {
const int num_threads = pbi->oxcf.max_threads;
VP9_COMMON *const cm = &pbi->common;
int mi_row, mi_col;
- YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[cm->new_fb_idx];
+ MACROBLOCKD *xd = &pbi->mb;
if (pbi->do_loopfilter_inline) {
- if (num_threads > 1) {
- LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
- lf_data->frame_buffer = fb;
- lf_data->cm = cm;
- lf_data->xd = pbi->mb;
- lf_data->stop = 0;
- lf_data->y_only = 0;
- }
+ LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+ lf_data->frame_buffer = get_frame_new_buffer(cm);
+ lf_data->cm = cm;
+ lf_data->xd = pbi->mb;
+ lf_data->stop = 0;
+ lf_data->y_only = 0;
vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
}
- for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end;
+ for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += MI_BLOCK_SIZE) {
    // For a SB there are 2 left contexts, each pertaining to a MB row within the SB
- vp9_zero(cm->left_context);
- vp9_zero(cm->left_seg_context);
- for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ vp9_zero(xd->left_context);
+ vp9_zero(xd->left_seg_context);
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE)
- decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_64X64);
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64,
+ pbi->token_cache);
if (pbi->do_loopfilter_inline) {
- // delay the loopfilter by 1 macroblock row.
const int lf_start = mi_row - MI_BLOCK_SIZE;
- if (lf_start < 0) continue;
+ LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
- if (num_threads > 1) {
- LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+ // delay the loopfilter by 1 macroblock row.
+ if (lf_start < 0) continue;
- // decoding has completed: finish up the loop filter in this thread.
- if (mi_row + MI_BLOCK_SIZE >= cm->cur_tile_mi_row_end) continue;
+ // decoding has completed: finish up the loop filter in this thread.
+ if (mi_row + MI_BLOCK_SIZE >= tile->mi_row_end) continue;
- vp9_worker_sync(&pbi->lf_worker);
- lf_data->start = lf_start;
- lf_data->stop = mi_row;
- pbi->lf_worker.hook = vp9_loop_filter_worker;
+ vp9_worker_sync(&pbi->lf_worker);
+ lf_data->start = lf_start;
+ lf_data->stop = mi_row;
+ if (num_threads > 1) {
vp9_worker_launch(&pbi->lf_worker);
} else {
- vp9_loop_filter_rows(fb, cm, &pbi->mb, lf_start, mi_row, 0);
+ vp9_worker_execute(&pbi->lf_worker);
}
}
}
if (pbi->do_loopfilter_inline) {
- int lf_start;
- if (num_threads > 1) {
- LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+ LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
- vp9_worker_sync(&pbi->lf_worker);
- lf_start = lf_data->stop;
- } else {
- lf_start = mi_row - MI_BLOCK_SIZE;
- }
- vp9_loop_filter_rows(fb, cm, &pbi->mb,
- lf_start, cm->mi_rows, 0);
+ vp9_worker_sync(&pbi->lf_worker);
+ lf_data->start = lf_data->stop;
+ lf_data->stop = cm->mi_rows;
+ vp9_worker_execute(&pbi->lf_worker);
}
}
@@ -659,10 +853,32 @@ static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
cm->log2_tile_rows += vp9_rb_read_bit(rb);
}
+// Returns the size of the next tile, advancing '*data' past the 4-byte
+// length field; the last tile ('is_last') spans the remainder of the buffer.
+static size_t get_tile(const uint8_t *const data_end,
+ int is_last,
+ struct vpx_internal_error_info *error_info,
+ const uint8_t **data) {
+ size_t size;
+
+ if (!is_last) {
+ if (!read_is_valid(*data, 4, data_end))
+ vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile length");
+
+ size = read_be32(*data);
+ *data += 4;
+ } else {
+ size = data_end - *data;
+ }
+ return size;
+}
+
static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
vp9_reader residual_bc;
VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
const uint8_t *const data_end = pbi->source + pbi->source_sz;
const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
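get_tile() above defines the tile stream framing: every tile except the last is preceded by a 4-byte big-endian length, and the last tile runs to the end of the buffer. A self-contained walk over a made-up two-tile buffer (illustration only; read_be32() is copied from earlier in this file):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static int read_be32(const uint8_t *p) {
  return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
}

int main(void) {
  /* one 3-byte tile with its length prefix, then a final 2-byte tile */
  const uint8_t buf[] = { 0, 0, 0, 3, 0xaa, 0xbb, 0xcc, 0xdd, 0xee };
  const uint8_t *data = buf;
  const uint8_t *const data_end = buf + sizeof(buf);
  size_t size;

  size = (size_t)read_be32(data);    /* not the last tile: explicit length */
  data += 4;
  assert(size == 3);
  data += size;

  size = (size_t)(data_end - data);  /* last tile: whatever bytes remain */
  assert(size == 2);
  return 0;
}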
@@ -672,70 +888,57 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(cm->above_context[0], 0,
- sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * (2 * aligned_mi_cols));
+ vpx_memset(pbi->above_context[0], 0,
+ sizeof(*pbi->above_context[0]) * MAX_MB_PLANE *
+ 2 * aligned_mi_cols);
- vpx_memset(cm->above_seg_context, 0,
- sizeof(PARTITION_CONTEXT) * aligned_mi_cols);
+ vpx_memset(pbi->above_seg_context, 0,
+ sizeof(*pbi->above_seg_context) * aligned_mi_cols);
if (pbi->oxcf.inv_tile_order) {
const uint8_t *data_ptr2[4][1 << 6];
vp9_reader bc_bak = {0};
- // pre-initialize the offsets, we're going to read in inverse order
+    // pre-initialize the offsets; we're going to decode in inverse order
data_ptr2[0][0] = data;
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- if (tile_row) {
- const int size = read_be32(data_ptr2[tile_row - 1][tile_cols - 1]);
- data_ptr2[tile_row - 1][tile_cols - 1] += 4;
- data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][tile_cols - 1] + size;
- }
-
- for (tile_col = 1; tile_col < tile_cols; tile_col++) {
- const int size = read_be32(data_ptr2[tile_row][tile_col - 1]);
- data_ptr2[tile_row][tile_col - 1] += 4;
- data_ptr2[tile_row][tile_col] =
- data_ptr2[tile_row][tile_col - 1] + size;
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ const int last_tile =
+ tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
+ const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
+ data_ptr2[tile_row][tile_col] = data;
+ data += size;
}
}
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(cm, tile_row);
for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) {
- vp9_get_tile_col_offsets(cm, tile_col);
- setup_token_decoder(pbi, data_ptr2[tile_row][tile_col],
+ TileInfo tile;
+
+ vp9_tile_init(&tile, cm, tile_row, tile_col);
+ setup_token_decoder(data_ptr2[tile_row][tile_col], data_end,
data_end - data_ptr2[tile_row][tile_col],
- &residual_bc);
- decode_tile(pbi, &residual_bc);
+ &cm->error, &residual_bc);
+ setup_tile_context(pbi, xd, tile_col);
+ decode_tile(pbi, &tile, &residual_bc);
if (tile_row == tile_rows - 1 && tile_col == tile_cols - 1)
bc_bak = residual_bc;
}
}
residual_bc = bc_bak;
} else {
- int has_more;
-
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(cm, tile_row);
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
- size_t size;
-
- vp9_get_tile_col_offsets(cm, tile_col);
+ const int last_tile =
+ tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
+ const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
+ TileInfo tile;
- has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1;
- if (has_more) {
- if (!read_is_valid(data, 4, data_end))
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Truncated packet or corrupt tile length");
+ vp9_tile_init(&tile, cm, tile_row, tile_col);
- size = read_be32(data);
- data += 4;
- } else {
- size = data_end - data;
- }
-
- setup_token_decoder(pbi, data, size, &residual_bc);
- decode_tile(pbi, &residual_bc);
+ setup_token_decoder(data, data_end, size, &cm->error, &residual_bc);
+ setup_tile_context(pbi, xd, tile_col);
+ decode_tile(pbi, &tile, &residual_bc);
data += size;
}
}
@@ -744,10 +947,113 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
return vp9_reader_find_end(&residual_bc);
}
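
In inverse tile order the decoder first walks the buffer once to record each tile's start, then decodes the columns right-to-left. A sketch of that precompute pass under the same framing (a 4-byte big-endian size precedes every tile except the last); read_be32_sketch() is the helper sketched above, and bounds checks are omitted:

  // Sketch: collect per-tile payload pointers from a size-framed buffer.
  static void collect_tiles(const uint8_t *data, const uint8_t *data_end,
                            int n_tiles, const uint8_t *starts[],
                            size_t sizes[]) {
    int t;
    for (t = 0; t < n_tiles; ++t) {
      if (t < n_tiles - 1) {
        sizes[t] = read_be32_sketch(data);       // framed tile
        data += 4;
      } else {
        sizes[t] = (size_t)(data_end - data);    // last tile: remaining bytes
      }
      starts[t] = data;
      data += sizes[t];
    }
  }
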
+static int tile_worker_hook(void *arg1, void *arg2) {
+ TileWorkerData *tile_data = (TileWorkerData*)arg1;
+ const TileInfo *const tile = (TileInfo*)arg2;
+ int mi_row, mi_col;
+
+ for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+ mi_row += MI_BLOCK_SIZE) {
+ vp9_zero(tile_data->xd.left_context);
+ vp9_zero(tile_data->xd.left_seg_context);
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+ mi_col += MI_BLOCK_SIZE) {
+ decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
+ mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64,
+ tile_data->token_cache);
+ }
+ }
+ return !tile_data->xd.corrupted;
+}
+
+static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) {
+ VP9_COMMON *const cm = &pbi->common;
+ const uint8_t *const data_end = pbi->source + pbi->source_sz;
+ const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols);
+ int tile_col = 0;
+
+ assert(tile_rows == 1);
+ (void)tile_rows;
+
+ if (num_workers > pbi->num_tile_workers) {
+ int i;
+ CHECK_MEM_ERROR(cm, pbi->tile_workers,
+ vpx_realloc(pbi->tile_workers,
+ num_workers * sizeof(*pbi->tile_workers)));
+ for (i = pbi->num_tile_workers; i < num_workers; ++i) {
+ VP9Worker *const worker = &pbi->tile_workers[i];
+ ++pbi->num_tile_workers;
+
+ vp9_worker_init(worker);
+ worker->hook = (VP9WorkerHook)tile_worker_hook;
+ CHECK_MEM_ERROR(cm, worker->data1,
+ vpx_memalign(32, sizeof(TileWorkerData)));
+ CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
+ if (i < num_workers - 1 && !vp9_worker_reset(worker)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Tile decoder thread creation failed");
+ }
+ }
+ }
+
+ // Note: this memset assumes above_context[0], [1] and [2]
+ // are allocated as part of the same buffer.
+ vpx_memset(pbi->above_context[0], 0,
+ sizeof(*pbi->above_context[0]) * MAX_MB_PLANE *
+ 2 * aligned_mi_cols);
+ vpx_memset(pbi->above_seg_context, 0,
+ sizeof(*pbi->above_seg_context) * aligned_mi_cols);
+
+ while (tile_col < tile_cols) {
+ int i;
+ for (i = 0; i < num_workers && tile_col < tile_cols; ++i) {
+ VP9Worker *const worker = &pbi->tile_workers[i];
+ TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
+ TileInfo *const tile = (TileInfo*)worker->data2;
+ const size_t size =
+ get_tile(data_end, tile_col == tile_cols - 1, &cm->error, &data);
+
+ tile_data->cm = cm;
+ tile_data->xd = pbi->mb;
+ tile_data->xd.corrupted = 0;
+ vp9_tile_init(tile, tile_data->cm, 0, tile_col);
+
+ setup_token_decoder(data, data_end, size, &cm->error,
+ &tile_data->bit_reader);
+ setup_tile_context(pbi, &tile_data->xd, tile_col);
+
+ worker->had_error = 0;
+ if (i == num_workers - 1 || tile_col == tile_cols - 1) {
+ vp9_worker_execute(worker);
+ } else {
+ vp9_worker_launch(worker);
+ }
+
+ data += size;
+ ++tile_col;
+ }
+
+ for (; i > 0; --i) {
+ VP9Worker *const worker = &pbi->tile_workers[i - 1];
+ pbi->mb.corrupted |= !vp9_worker_sync(worker);
+ }
+ }
+
+ {
+ const int final_worker = (tile_cols + num_workers - 1) % num_workers;
+ TileWorkerData *const tile_data =
+ (TileWorkerData*)pbi->tile_workers[final_worker].data1;
+ return vp9_reader_find_end(&tile_data->bit_reader);
+ }
+}
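
decode_tiles_mt() deals tiles to workers in rounds of num_workers, running each round's last job on the calling thread (vp9_worker_execute) so no core idles, then syncing the round before reusing the workers. The same pattern in isolation; filling data1/data2 stands in for the per-tile setup above:

  // Sketch of the batch-and-sync scheduling used by decode_tiles_mt().
  static int run_jobs(VP9Worker *workers, int n_workers, int n_jobs) {
    int ok = 1, job = 0;
    while (job < n_jobs) {
      int i;
      for (i = 0; i < n_workers && job < n_jobs; ++i, ++job) {
        workers[i].had_error = 0;
        // ... point workers[i].data1/data2 at this job's inputs ...
        if (i == n_workers - 1 || job == n_jobs - 1)
          vp9_worker_execute(&workers[i]);  // run in the calling thread
        else
          vp9_worker_launch(&workers[i]);   // run in the worker's thread
      }
      for (; i > 0; --i)
        ok &= vp9_worker_sync(&workers[i - 1]);
    }
    return ok;
  }
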
+
static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
- if (vp9_rb_read_literal(rb, 8) != SYNC_CODE_0 ||
- vp9_rb_read_literal(rb, 8) != SYNC_CODE_1 ||
- vp9_rb_read_literal(rb, 8) != SYNC_CODE_2) {
+ if (vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_0 ||
+ vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_1 ||
+ vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_2) {
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid frame sync code");
}
@@ -758,34 +1064,6 @@ static void error_handler(void *data, size_t bit_offset) {
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
}
-static void setup_inter_inter(VP9_COMMON *cm) {
- int i;
-
- cm->allow_comp_inter_inter = 0;
- for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i)
- cm->allow_comp_inter_inter |=
- cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1];
-
- if (cm->allow_comp_inter_inter) {
- // which one is always-on in comp inter-inter?
- if (cm->ref_frame_sign_bias[LAST_FRAME] ==
- cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
- cm->comp_fixed_ref = ALTREF_FRAME;
- cm->comp_var_ref[0] = LAST_FRAME;
- cm->comp_var_ref[1] = GOLDEN_FRAME;
- } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
- cm->ref_frame_sign_bias[ALTREF_FRAME]) {
- cm->comp_fixed_ref = GOLDEN_FRAME;
- cm->comp_var_ref[0] = LAST_FRAME;
- cm->comp_var_ref[1] = ALTREF_FRAME;
- } else {
- cm->comp_fixed_ref = LAST_FRAME;
- cm->comp_var_ref[0] = GOLDEN_FRAME;
- cm->comp_var_ref[1] = ALTREF_FRAME;
- }
- }
-}
-
#define RESERVED \
if (vp9_rb_read_bit(rb)) \
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, \
@@ -794,12 +1072,12 @@ static void setup_inter_inter(VP9_COMMON *cm) {
static size_t read_uncompressed_header(VP9D_COMP *pbi,
struct vp9_read_bit_buffer *rb) {
VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+ size_t sz;
int i;
cm->last_frame_type = cm->frame_type;
- if (vp9_rb_read_literal(rb, 2) != 0x2)
+ if (vp9_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid frame marker");
@@ -820,12 +1098,10 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
cm->error_resilient_mode = vp9_rb_read_bit(rb);
if (cm->frame_type == KEY_FRAME) {
- int csp;
-
check_sync_code(cm, rb);
- csp = vp9_rb_read_literal(rb, 3); // colorspace
- if (csp != 7) { // != sRGB
+ cm->color_space = vp9_rb_read_literal(rb, 3); // colorspace
+ if (cm->color_space != SRGB) {
vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range
if (cm->version == 1) {
cm->subsampling_x = vp9_rb_read_bit(rb);
@@ -872,13 +1148,11 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
setup_frame_size_with_refs(pbi, rb);
- xd->allow_high_precision_mv = vp9_rb_read_bit(rb);
+ cm->allow_high_precision_mv = vp9_rb_read_bit(rb);
cm->mcomp_filter_type = read_interp_filter_type(rb);
for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
vp9_setup_scale_factors(cm, i);
-
- setup_inter_inter(cm);
}
}
@@ -890,25 +1164,34 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
cm->frame_parallel_decoding_mode = 1;
}
+ // This flag will be overridden by the call to vp9_setup_past_independence
+ // below, forcing the use of context 0 for those frame types.
cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LOG2);
- if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->intra_only)
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode)
vp9_setup_past_independence(cm);
setup_loopfilter(&cm->lf, rb);
- setup_quantization(pbi, rb);
+ setup_quantization(cm, &pbi->mb, rb);
setup_segmentation(&cm->seg, rb);
setup_tile_info(cm, rb);
+ sz = vp9_rb_read_literal(rb, 16);
+
+ if (sz == 0)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Invalid header size");
- return vp9_rb_read_literal(rb, 16);
+ return sz;
}
static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
size_t partition_size) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
+ FRAME_CONTEXT *const fc = &cm->fc;
vp9_reader r;
+ int k;
if (vp9_reader_init(&r, data, partition_size))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
@@ -916,10 +1199,36 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
cm->tx_mode = xd->lossless ? ONLY_4X4 : read_tx_mode(&r);
if (cm->tx_mode == TX_MODE_SELECT)
- read_tx_probs(&cm->fc.tx_probs, &r);
- read_coef_probs(&cm->fc, cm->tx_mode, &r);
+ read_tx_probs(&fc->tx_probs, &r);
+ read_coef_probs(fc, cm->tx_mode, &r);
+
+ for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+ vp9_diff_update_prob(&r, &fc->mbskip_probs[k]);
+
+ if (!frame_is_intra_only(cm)) {
+ nmv_context *const nmvc = &fc->nmvc;
+ int i, j;
+
+ read_inter_mode_probs(fc, &r);
+
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ read_switchable_interp_probs(fc, &r);
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+ vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]);
+
+ read_comp_pred(cm, &r);
+
+ for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
+ for (i = 0; i < INTRA_MODES - 1; ++i)
+ vp9_diff_update_prob(&r, &fc->y_mode_prob[j][i]);
+
+ for (j = 0; j < PARTITION_CONTEXTS; ++j)
+ for (i = 0; i < PARTITION_TYPES - 1; ++i)
+ vp9_diff_update_prob(&r, &fc->partition_prob[j][i]);
- vp9_prepare_read_mode_info(pbi, &r);
+ read_mv_probs(&r, nmvc, cm->allow_high_precision_mv);
+ }
return vp9_reader_has_error(&r);
}
@@ -936,59 +1245,109 @@ void vp9_init_dequantizer(VP9_COMMON *cm) {
}
}
+#ifdef NDEBUG
+#define debug_check_frame_counts(cm) (void)0
+#else // !NDEBUG
+// Counts should only be incremented when frame_parallel_decoding_mode and
+// error_resilient_mode are disabled.
+static void debug_check_frame_counts(const VP9_COMMON *const cm) {
+ FRAME_COUNTS zero_counts;
+ vp9_zero(zero_counts);
+ assert(cm->frame_parallel_decoding_mode || cm->error_resilient_mode);
+ assert(!memcmp(cm->counts.y_mode, zero_counts.y_mode,
+ sizeof(cm->counts.y_mode)));
+ assert(!memcmp(cm->counts.uv_mode, zero_counts.uv_mode,
+ sizeof(cm->counts.uv_mode)));
+ assert(!memcmp(cm->counts.partition, zero_counts.partition,
+ sizeof(cm->counts.partition)));
+ assert(!memcmp(cm->counts.coef, zero_counts.coef,
+ sizeof(cm->counts.coef)));
+ assert(!memcmp(cm->counts.eob_branch, zero_counts.eob_branch,
+ sizeof(cm->counts.eob_branch)));
+ assert(!memcmp(cm->counts.switchable_interp, zero_counts.switchable_interp,
+ sizeof(cm->counts.switchable_interp)));
+ assert(!memcmp(cm->counts.inter_mode, zero_counts.inter_mode,
+ sizeof(cm->counts.inter_mode)));
+ assert(!memcmp(cm->counts.intra_inter, zero_counts.intra_inter,
+ sizeof(cm->counts.intra_inter)));
+ assert(!memcmp(cm->counts.comp_inter, zero_counts.comp_inter,
+ sizeof(cm->counts.comp_inter)));
+ assert(!memcmp(cm->counts.single_ref, zero_counts.single_ref,
+ sizeof(cm->counts.single_ref)));
+ assert(!memcmp(cm->counts.comp_ref, zero_counts.comp_ref,
+ sizeof(cm->counts.comp_ref)));
+ assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx)));
+ assert(!memcmp(cm->counts.mbskip, zero_counts.mbskip,
+ sizeof(cm->counts.mbskip)));
+ assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv)));
+}
+#endif // NDEBUG
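
The NDEBUG guard compiles the checker out of release builds while leaving call sites untouched: the macro stub replaces every call with a no-op expression. The idiom in isolation:

  #include <assert.h>

  #ifdef NDEBUG
  #define debug_check(invariant) (void)0       // release: calls vanish
  #else   // !NDEBUG
  static void debug_check(int invariant) {     // debug: a real function
    assert(invariant);
  }
  #endif  // NDEBUG
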
+
int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
int i;
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
const uint8_t *data = pbi->source;
- const uint8_t *data_end = pbi->source + pbi->source_sz;
+ const uint8_t *const data_end = pbi->source + pbi->source_sz;
- struct vp9_read_bit_buffer rb = { data, data_end, 0,
- cm, error_handler };
+ struct vp9_read_bit_buffer rb = { data, data_end, 0, cm, error_handler };
const size_t first_partition_size = read_uncompressed_header(pbi, &rb);
const int keyframe = cm->frame_type == KEY_FRAME;
- YV12_BUFFER_CONFIG *new_fb = &cm->yv12_fb[cm->new_fb_idx];
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
if (!first_partition_size) {
- // showing a frame directly
- *p_data_end = data + 1;
- return 0;
+ // showing a frame directly
+ *p_data_end = data + 1;
+ return 0;
}
- data += vp9_rb_bytes_read(&rb);
- xd->corrupted = 0;
- new_fb->corrupted = 0;
- pbi->do_loopfilter_inline =
- (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level;
if (!pbi->decoded_key_frame && !keyframe)
return -1;
+ data += vp9_rb_bytes_read(&rb);
if (!read_is_valid(data, first_partition_size, data_end))
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt header length");
- setup_plane_dequants(cm, &pbi->mb, cm->base_qindex);
+ pbi->do_loopfilter_inline =
+ (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level;
+ if (pbi->do_loopfilter_inline && pbi->lf_worker.data1 == NULL) {
+ CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_malloc(sizeof(LFWorkerData)));
+ pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
+ if (pbi->oxcf.max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Loop filter thread creation failed");
+ }
+ }
+
+ alloc_tile_storage(pbi, tile_cols);
xd->mi_8x8 = cm->mi_grid_visible;
- xd->mic_stream_ptr = cm->mi;
xd->mode_info_stride = cm->mode_info_stride;
+ set_prev_mi(cm);
- cm->fc = cm->frame_contexts[cm->frame_context_idx];
-
- vp9_zero(cm->counts);
-
- new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size);
-
+ setup_plane_dequants(cm, xd, cm->base_qindex);
setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y);
- // clear out the coeff buffer
+ cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ vp9_zero(cm->counts);
for (i = 0; i < MAX_MB_PLANE; ++i)
- vp9_zero(xd->plane[i].qcoeff);
+ vp9_zero(xd->plane[i].dqcoeff);
- set_prev_mi(cm);
+ xd->corrupted = 0;
+ new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
- *p_data_end = decode_tiles(pbi, data + first_partition_size);
+ // TODO(jzern): remove frame_parallel_decoding_mode restriction for
+ // single-frame tile decoding.
+ if (pbi->oxcf.max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
+ cm->frame_parallel_decoding_mode) {
+ *p_data_end = decode_tiles_mt(pbi, data + first_partition_size);
+ } else {
+ *p_data_end = decode_tiles(pbi, data + first_partition_size);
+ }
cm->last_width = cm->width;
cm->last_height = cm->height;
@@ -1006,10 +1365,12 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
vp9_adapt_coef_probs(cm);
- if (!keyframe && !cm->intra_only) {
+ if (!frame_is_intra_only(cm)) {
vp9_adapt_mode_probs(cm);
- vp9_adapt_mv_probs(cm, xd->allow_high_precision_mv);
+ vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv);
}
+ } else {
+ debug_check_frame_counts(cm);
}
if (cm->refresh_frame_context)
diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c
index cd74a0b..b8d670b 100644
--- a/libvpx/vp9/decoder/vp9_detokenize.c
+++ b/libvpx/vp9/decoder/vp9_detokenize.c
@@ -61,53 +61,55 @@ static const vp9_prob cat6_prob[15] = {
254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
};
-DECLARE_ALIGNED(16, extern const uint8_t,
- vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-#define INCREMENT_COUNT(token) \
- do { \
- coef_counts[type][ref][band][pt] \
- [token >= TWO_TOKEN ? \
- (token == DCT_EOB_TOKEN ? DCT_EOB_MODEL_TOKEN : TWO_TOKEN) : \
- token]++; \
- token_cache[scan[c]] = vp9_pt_energy_class[token]; \
- } while (0)
+static const int token_to_counttoken[MAX_ENTROPY_TOKENS] = {
+ ZERO_TOKEN, ONE_TOKEN, TWO_TOKEN, TWO_TOKEN,
+ TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, TWO_TOKEN,
+ TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, DCT_EOB_MODEL_TOKEN
+};
+
+#define INCREMENT_COUNT(token) \
+ do { \
+ if (!cm->frame_parallel_decoding_mode) { \
+ ++coef_counts[band][pt][token_to_counttoken[token]]; \
+ } \
+ } while (0);
#define WRITE_COEF_CONTINUE(val, token) \
{ \
- qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \
+ dqcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \
dq[c > 0] / (1 + (tx_size == TX_32X32)); \
INCREMENT_COUNT(token); \
+ token_cache[scan[c]] = vp9_pt_energy_class[token]; \
c++; \
continue; \
}
-#define ADJUST_COEF(prob, bits_count) \
- do { \
- if (vp9_read(r, prob)) \
- val += 1 << bits_count; \
+#define ADJUST_COEF(prob, bits_count) \
+ do { \
+ val += (vp9_read(r, prob) << bits_count); \
} while (0);
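
Both macro rewrites above are behavior-preserving: token_to_counttoken[] tabulates the old nested ternary (everything from TWO_TOKEN up collapses to TWO_TOKEN, except the EOB token, which maps to DCT_EOB_MODEL_TOKEN), and ADJUST_COEF replaces the branch with a shift of the 0/1 bit read from the stream. A standalone check of both claims; the token values are written as plain integers since the real enum lives in vp9_entropy.h and is assumed here:

  #include <assert.h>

  // Assumed token numbering (mirrors vp9_entropy.h): ZERO_TOKEN=0,
  // ONE_TOKEN=1, TWO_TOKEN=2, ..., DCT_EOB_TOKEN=11,
  // DCT_EOB_MODEL_TOKEN=3, MAX_ENTROPY_TOKENS=12.
  static int old_map(int t) {                 // the removed nested ternary
    return t >= 2 ? (t == 11 ? 3 : 2) : t;
  }

  static const int token_to_counttoken[12] = {
    0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3
  };

  int main(void) {
    int t, bit, shift;
    for (t = 0; t < 12; ++t)                  // table matches the old ternary
      assert(old_map(t) == token_to_counttoken[t]);
    for (bit = 0; bit <= 1; ++bit)            // ADJUST_COEF: branch vs. shift
      for (shift = 0; shift < 14; ++shift) {
        int with_branch = 0, branchless = 0;
        if (bit)
          with_branch += 1 << shift;          // old form
        branchless += bit << shift;           // new form
        assert(with_branch == branchless);
      }
    return 0;
  }
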
static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
vp9_reader *r, int block_idx,
- PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr,
- TX_SIZE tx_size, const int16_t *dq,
- ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
- FRAME_CONTEXT *const fc = &cm->fc;
+ PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr,
+ TX_SIZE tx_size, const int16_t *dq, int pt,
+ uint8_t *token_cache) {
+ const FRAME_CONTEXT *const fc = &cm->fc;
FRAME_COUNTS *const counts = &cm->counts;
- const int ref = is_inter_block(&xd->this_mi->mbmi);
+ const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
int band, c = 0;
- vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES] =
+ const vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES] =
fc->coef_probs[tx_size][type][ref];
vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
uint8_t load_map[COEF_BANDS][PREV_COEF_CONTEXTS] = { { 0 } };
- vp9_prob *prob;
- vp9_coeff_count_model *coef_counts = counts->coef[tx_size];
+ const vp9_prob *prob;
+ unsigned int (*coef_counts)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES + 1] =
+ counts->coef[tx_size][type][ref];
+ unsigned int (*eob_branch_count)[PREV_COEF_CONTEXTS] =
+ counts->eob_branch[tx_size][type][ref];
const int16_t *scan, *nb;
- const uint8_t *band_translate;
- uint8_t token_cache[1024];
- int pt = get_entropy_context(xd, tx_size, type, block_idx, A, L,
- &scan, &band_translate);
- nb = vp9_get_coef_neighbors_handle(scan);
+ const uint8_t *const band_translate = get_band_translate(tx_size);
+ get_scan(xd, tx_size, type, block_idx, &scan, &nb);
while (1) {
int val;
@@ -118,11 +120,12 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
pt = get_coef_context(nb, token_cache, c);
band = get_coef_band(band_translate, c);
prob = coef_probs[band][pt];
- counts->eob_branch[tx_size][type][ref][band][pt]++;
+ if (!cm->frame_parallel_decoding_mode)
+ ++eob_branch_count[band][pt];
if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
break;
-SKIP_START:
+ SKIP_START:
if (c >= seg_eob)
break;
if (c)
@@ -132,6 +135,7 @@ SKIP_START:
if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
INCREMENT_COUNT(ZERO_TOKEN);
+ token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN];
++c;
goto SKIP_START;
}
@@ -203,47 +207,34 @@ SKIP_START:
WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6);
}
- if (c < seg_eob)
- coef_counts[type][ref][band][pt][DCT_EOB_MODEL_TOKEN]++;
-
+ if (c < seg_eob) {
+ if (!cm->frame_parallel_decoding_mode)
+ ++coef_counts[band][pt][DCT_EOB_MODEL_TOKEN];
+ }
return c;
}
-struct decode_block_args {
- VP9D_COMP *pbi;
- vp9_reader *r;
- int *eobtotal;
-};
-
-static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *argv) {
- const struct decode_block_args* const arg = argv;
-
- // find the maximum eob for this transform size, adjusted by segment
- MACROBLOCKD *xd = &arg->pbi->mb;
- struct segmentation *seg = &arg->pbi->common.seg;
- struct macroblockd_plane* pd = &xd->plane[plane];
- const int segment_id = xd->this_mi->mbmi.segment_id;
- const int seg_eob = get_tx_eob(seg, segment_id, tx_size);
- int aoff, loff, eob;
-
+int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, vp9_reader *r,
+ uint8_t *token_cache) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
+ tx_size);
+ int aoff, loff, eob, pt;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
+ pt = get_entropy_context(tx_size, pd->above_context + aoff,
+ pd->left_context + loff);
- eob = decode_coefs(&arg->pbi->common, xd, arg->r, block,
- pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block),
- tx_size, pd->dequant,
- pd->above_context + aoff, pd->left_context + loff);
+ eob = decode_coefs(cm, xd, r, block,
+ pd->plane_type, seg_eob, BLOCK_OFFSET(pd->dqcoeff, block),
+ tx_size, pd->dequant, pt, token_cache);
set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff);
pd->eobs[block] = eob;
- *arg->eobtotal += eob;
+ return eob;
}
-int vp9_decode_tokens(VP9D_COMP *pbi, vp9_reader *r, BLOCK_SIZE bsize) {
- int eobtotal = 0;
- struct decode_block_args args = {pbi, r, &eobtotal};
- foreach_transformed_block(&pbi->mb, bsize, decode_block, &args);
- return eobtotal;
-}
+
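
With vp9_decode_tokens() removed, callers now walk a block's transform blocks themselves and sum the per-block eob returned by vp9_decode_block_tokens(). A hedged sketch of such a caller; the loop bound and names are hypothetical, and the real iteration lives in vp9_decodframe.c:

  // Sketch: accumulate eobtotal the way the old vp9_decode_tokens() did.
  int eobtotal = 0, block;
  for (block = 0; block < num_blocks; ++block)  // 'num_blocks' hypothetical
    eobtotal += vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize,
                                        tx_size, r, token_cache);
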
diff --git a/libvpx/vp9/decoder/vp9_detokenize.h b/libvpx/vp9/decoder/vp9_detokenize.h
index cf07c56..04939ea 100644
--- a/libvpx/vp9/decoder/vp9_detokenize.h
+++ b/libvpx/vp9/decoder/vp9_detokenize.h
@@ -15,6 +15,9 @@
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_dboolhuff.h"
-int vp9_decode_tokens(VP9D_COMP* pbi, vp9_reader *r, BLOCK_SIZE bsize);
+int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, vp9_reader *r,
+ uint8_t *token_cache);
#endif // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/libvpx/vp9/decoder/vp9_dsubexp.c b/libvpx/vp9/decoder/vp9_dsubexp.c
index 8cc64f7..fcca017 100644
--- a/libvpx/vp9/decoder/vp9_dsubexp.c
+++ b/libvpx/vp9/decoder/vp9_dsubexp.c
@@ -48,8 +48,6 @@ static int merge_index(int v, int n, int modulus) {
static int inv_remap_prob(int v, int m) {
static int inv_map_table[MAX_PROB - 1] = {
- // generated by:
- // inv_map_table[j] = merge_index(j, MAX_PROB - 1, MODULUS_PARAM);
6, 19, 32, 45, 58, 71, 84, 97, 110, 123, 136, 149, 162, 175, 188,
201, 214, 227, 240, 253, 0, 1, 2, 3, 4, 5, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26,
@@ -66,10 +64,11 @@ static int inv_remap_prob(int v, int m) {
190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203, 204, 205,
206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221,
222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
- 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
-
+ 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252
};
- // v = merge_index(v, MAX_PROBS - 1, MODULUS_PARAM);
+ // The clamp is not necessary for a conforming VP9 stream; it is added to
+ // prevent out-of-bounds access on bad input data.
+ v = clamp(v, 0, 253);
v = inv_map_table[v];
m--;
if ((m << 1) <= MAX_PROB) {
@@ -101,6 +100,8 @@ static int decode_term_subexp(vp9_reader *r, int k, int num_syms) {
}
void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) {
- int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
- *p = (vp9_prob)inv_remap_prob(delp, *p);
+ if (vp9_read(r, DIFF_UPDATE_PROB)) {
+ const int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
+ *p = (vp9_prob)inv_remap_prob(delp, *p);
+ }
}
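
The bound 253 is the table's last valid index: inv_map_table[] is declared with MAX_PROB - 1 entries, and for 8-bit probabilities MAX_PROB is 255, giving indices 0..253. A one-line check of that arithmetic; MAX_PROB's value is an assumption here, defined elsewhere in the codec:

  #include <assert.h>
  #define MAX_PROB 255                    // assumed: 8-bit probability range
  int main(void) {
    assert((MAX_PROB - 1) - 1 == 253);    // last index of inv_map_table[]
    return 0;
  }
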
diff --git a/libvpx/vp9/decoder/vp9_idct_blk.c b/libvpx/vp9/decoder/vp9_idct_blk.c
deleted file mode 100644
index 395e636..0000000
--- a/libvpx/vp9/decoder/vp9_idct_blk.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9_rtcd.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/decoder/vp9_idct_blk.h"
-
-static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
- int width, int height) {
- int r, c;
-
- for (r = 0; r < height; r++) {
- for (c = 0; c < width; c++)
- dest[c] = clip_pixel(diff + dest[c]);
-
- dest += stride;
- }
-}
-
-void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest,
- int stride) {
- add_constant_residual(diff, dest, stride, 8, 8);
-}
-
-void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest,
- int stride) {
- add_constant_residual(diff, dest, stride, 16, 16);
-}
-
-void vp9_add_constant_residual_32x32_c(const int16_t diff, uint8_t *dest,
- int stride) {
- add_constant_residual(diff, dest, stride, 32, 32);
-}
-
-void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
- int eob) {
- if (tx_type == DCT_DCT) {
- vp9_idct_add(input, dest, stride, eob);
- } else {
- vp9_short_iht4x4_add(input, dest, stride, tx_type);
- vpx_memset(input, 0, 32);
- }
-}
-
-void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
- int stride, int eob) {
- if (tx_type == DCT_DCT) {
- vp9_idct_add_8x8(input, dest, stride, eob);
- } else {
- if (eob > 0) {
- vp9_short_iht8x8_add(input, dest, stride, tx_type);
- vpx_memset(input, 0, 128);
- }
- }
-}
-
-void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
- if (eob > 1) {
- vp9_short_idct4x4_add(input, dest, stride);
- vpx_memset(input, 0, 32);
- } else {
- vp9_short_idct4x4_1_add(input, dest, stride);
- ((int *)input)[0] = 0;
- }
-}
-
-void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride,
- int eob) {
- if (eob > 1) {
- vp9_short_iwalsh4x4_add(input, dest, stride);
- vpx_memset(input, 0, 32);
- } else {
- vp9_short_iwalsh4x4_1_add_c(input, dest, stride);
- ((int *)input)[0] = 0;
- }
-}
-
-void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
- // If dc is 1, then input[0] is the reconstructed value, do not need
- // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
-
- // The calculation can be simplified if there are not many non-zero dct
- // coefficients. Use eobs to decide what to do.
- // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
- // Combine that with code here.
- if (eob) {
- if (eob == 1) {
- // DC only DCT coefficient
- vp9_short_idct8x8_1_add(input, dest, stride);
- input[0] = 0;
- } else if (eob <= 10) {
- vp9_short_idct10_8x8_add(input, dest, stride);
- vpx_memset(input, 0, 128);
- } else {
- vp9_short_idct8x8_add(input, dest, stride);
- vpx_memset(input, 0, 128);
- }
- }
-}
-
-void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
- int stride, int eob) {
- if (tx_type == DCT_DCT) {
- vp9_idct_add_16x16(input, dest, stride, eob);
- } else {
- if (eob > 0) {
- vp9_short_iht16x16_add(input, dest, stride, tx_type);
- vpx_memset(input, 0, 512);
- }
- }
-}
-
-void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
- /* The calculation can be simplified if there are not many non-zero dct
- * coefficients. Use eobs to separate different cases. */
- if (eob) {
- if (eob == 1) {
- /* DC only DCT coefficient. */
- vp9_short_idct16x16_1_add(input, dest, stride);
- input[0] = 0;
- } else if (eob <= 10) {
- vp9_short_idct10_16x16_add(input, dest, stride);
- vpx_memset(input, 0, 512);
- } else {
- vp9_short_idct16x16_add(input, dest, stride);
- vpx_memset(input, 0, 512);
- }
- }
-}
-
-void vp9_idct_add_32x32_c(int16_t *input, uint8_t *dest, int stride, int eob) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 1024);
-
- if (eob) {
- if (eob == 1) {
- vp9_short_idct1_32x32(input, output);
- vp9_add_constant_residual_32x32(output[0], dest, stride);
- input[0] = 0;
- } else {
- vp9_short_idct32x32_add(input, dest, stride);
- vpx_memset(input, 0, 2048);
- }
- }
-}
-
diff --git a/libvpx/vp9/decoder/vp9_idct_blk.h b/libvpx/vp9/decoder/vp9_idct_blk.h
deleted file mode 100644
index 1810bd0..0000000
--- a/libvpx/vp9/decoder/vp9_idct_blk.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_DECODER_VP9_IDCT_BLK_H_
-#define VP9_DECODER_VP9_IDCT_BLK_H_
-
-#include "vp9/common/vp9_blockd.h"
-
-
-void vp9_idct_add_lossless_c(int16_t *input, unsigned char *dest, int stride,
- int eob);
-
-void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest,
- int stride, int eob);
-
-void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest,
- int stride, int eob);
-
-void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest,
- int stride, int eob);
-
-#endif // VP9_DECODER_VP9_IDCT_BLK_H_
diff --git a/libvpx/vp9/decoder/vp9_onyxd.h b/libvpx/vp9/decoder/vp9_onyxd.h
index cd5b750..a4b9c24 100644
--- a/libvpx/vp9/decoder/vp9_onyxd.h
+++ b/libvpx/vp9/decoder/vp9_onyxd.h
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VP9_COMMON_VP9_ONYXD_H_
-#define VP9_COMMON_VP9_ONYXD_H_
+#ifndef VP9_DECODER_VP9_ONYXD_H_
+#define VP9_DECODER_VP9_ONYXD_H_
#ifdef __cplusplus
extern "C" {
@@ -40,7 +40,7 @@ typedef enum {
void vp9_initialize_dec();
int vp9_receive_compressed_data(VP9D_PTR comp,
- uint64_t size, const uint8_t **dest,
+ size_t size, const uint8_t **dest,
int64_t time_stamp);
int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
@@ -66,4 +66,4 @@ void vp9_remove_decompressor(VP9D_PTR comp);
}
#endif
-#endif // VP9_COMMON_VP9_ONYXD_H_
+#endif // VP9_DECODER_VP9_ONYXD_H_
diff --git a/libvpx/vp9/decoder/vp9_onyxd_if.c b/libvpx/vp9/decoder/vp9_onyxd_if.c
index 17d5def..5f970a3 100644
--- a/libvpx/vp9/decoder/vp9_onyxd_if.c
+++ b/libvpx/vp9/decoder/vp9_onyxd_if.c
@@ -65,13 +65,12 @@ static void recon_write_yuv_frame(const char *name,
#endif
#if WRITE_RECON_BUFFER == 2
void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-
// write the frame
FILE *yframe;
int i;
char filename[255];
- sprintf(filename, "dx\\y%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename)-1, "dx\\y%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->y_height; i++)
@@ -79,7 +78,7 @@ void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
frame->y_width, 1, yframe);
fclose(yframe);
- sprintf(filename, "dx\\u%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename)-1, "dx\\u%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->uv_height; i++)
@@ -87,7 +86,7 @@ void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
frame->uv_width, 1, yframe);
fclose(yframe);
- sprintf(filename, "dx\\v%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename)-1, "dx\\v%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->uv_height; i++)
@@ -142,20 +141,13 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
cm->error.setjmp = 0;
pbi->decoded_key_frame = 0;
- if (pbi->oxcf.max_threads > 1) {
- vp9_worker_init(&pbi->lf_worker);
- pbi->lf_worker.data1 = vpx_malloc(sizeof(LFWorkerData));
- pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
- if (pbi->lf_worker.data1 == NULL || !vp9_worker_reset(&pbi->lf_worker)) {
- vp9_remove_decompressor(pbi);
- return NULL;
- }
- }
+ vp9_worker_init(&pbi->lf_worker);
return pbi;
}
void vp9_remove_decompressor(VP9D_PTR ptr) {
+ int i;
VP9D_COMP *const pbi = (VP9D_COMP *)ptr;
if (!pbi)
@@ -164,6 +156,16 @@ void vp9_remove_decompressor(VP9D_PTR ptr) {
vp9_remove_common(&pbi->common);
vp9_worker_end(&pbi->lf_worker);
vpx_free(pbi->lf_worker.data1);
+ for (i = 0; i < pbi->num_tile_workers; ++i) {
+ VP9Worker *const worker = &pbi->tile_workers[i];
+ vp9_worker_end(worker);
+ vpx_free(worker->data1);
+ vpx_free(worker->data2);
+ }
+ vpx_free(pbi->tile_workers);
+ vpx_free(pbi->mi_streams);
+ vpx_free(pbi->above_context[0]);
+ vpx_free(pbi->above_seg_context);
vpx_free(pbi);
}
@@ -177,7 +179,6 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr,
YV12_BUFFER_CONFIG *sd) {
VP9D_COMP *pbi = (VP9D_COMP *) ptr;
VP9_COMMON *cm = &pbi->common;
- int ref_fb_idx;
/* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
* encoder is using the frame buffers for. This is just a stub to keep the
@@ -185,18 +186,15 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr,
* later commit that adds VP9-specific controls for this functionality.
*/
if (ref_frame_flag == VP9_LAST_FLAG) {
- ref_fb_idx = cm->ref_frame_map[0];
+ YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[0]];
+ if (!equal_dimensions(cfg, sd))
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ else
+ vp8_yv12_copy_frame(cfg, sd);
} else {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Invalid reference frame");
- return cm->error.error_code;
- }
-
- if (!equal_dimensions(&cm->yv12_fb[ref_fb_idx], sd)) {
- vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
- "Incorrect buffer dimensions");
- } else {
- vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd);
}
return cm->error.error_code;
@@ -214,13 +212,13 @@ vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
* vpxenc --test-decode functionality working, and will be replaced in a
* later commit that adds VP9-specific controls for this functionality.
*/
- if (ref_frame_flag == VP9_LAST_FLAG)
+ if (ref_frame_flag == VP9_LAST_FLAG) {
ref_fb_ptr = &pbi->common.active_ref_idx[0];
- else if (ref_frame_flag == VP9_GOLD_FLAG)
+ } else if (ref_frame_flag == VP9_GOLD_FLAG) {
ref_fb_ptr = &pbi->common.active_ref_idx[1];
- else if (ref_frame_flag == VP9_ALT_FLAG)
+ } else if (ref_frame_flag == VP9_ALT_FLAG) {
ref_fb_ptr = &pbi->common.active_ref_idx[2];
- else {
+ } else {
vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
"Invalid reference frame");
return pbi->common.error.error_code;
@@ -268,7 +266,7 @@ static void swap_frame_buffers(VP9D_COMP *pbi) {
++ref_index;
}
- cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+ cm->frame_to_show = get_frame_new_buffer(cm);
cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
// Invalidate these references until the next frame starts.
@@ -277,7 +275,7 @@ static void swap_frame_buffers(VP9D_COMP *pbi) {
}
int vp9_receive_compressed_data(VP9D_PTR ptr,
- uint64_t size, const uint8_t **psource,
+ size_t size, const uint8_t **psource,
int64_t time_stamp) {
VP9D_COMP *pbi = (VP9D_COMP *) ptr;
VP9_COMMON *cm = &pbi->common;
@@ -306,7 +304,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
* thing to do here.
*/
if (cm->active_ref_idx[0] != INT_MAX)
- cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1;
+ get_frame_ref_buffer(cm, 0)->corrupted = 1;
}
cm->new_fb_idx = get_free_fb(cm);
@@ -323,7 +321,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
* thing to do here.
*/
if (cm->active_ref_idx[0] != INT_MAX)
- cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1;
+ get_frame_ref_buffer(cm, 0)->corrupted = 1;
if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
@@ -343,36 +341,33 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
return retcode;
}
- {
- swap_frame_buffers(pbi);
+ swap_frame_buffers(pbi);
#if WRITE_RECON_BUFFER == 2
- if (cm->show_frame)
- write_dx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame);
- else
- write_dx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame + 1000);
+ if (cm->show_frame)
+ write_dx_frame_to_file(cm->frame_to_show,
+ cm->current_video_frame);
+ else
+ write_dx_frame_to_file(cm->frame_to_show,
+ cm->current_video_frame + 1000);
#endif
- if (!pbi->do_loopfilter_inline) {
- /* Apply the loop filter if appropriate. */
- vp9_loop_filter_frame(cm, &pbi->mb, pbi->common.lf.filter_level, 0, 0);
- }
+ if (!pbi->do_loopfilter_inline) {
+ vp9_loop_filter_frame(cm, &pbi->mb, pbi->common.lf.filter_level, 0, 0);
+ }
#if WRITE_RECON_BUFFER == 2
- if (cm->show_frame)
- write_dx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame + 2000);
- else
- write_dx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame + 3000);
+ if (cm->show_frame)
+ write_dx_frame_to_file(cm->frame_to_show,
+ cm->current_video_frame + 2000);
+ else
+ write_dx_frame_to_file(cm->frame_to_show,
+ cm->current_video_frame + 3000);
#endif
- vp9_extend_frame_inner_borders(cm->frame_to_show,
- cm->subsampling_x,
- cm->subsampling_y);
- }
+ vp9_extend_frame_inner_borders(cm->frame_to_show,
+ cm->subsampling_x,
+ cm->subsampling_y);
#if WRITE_RECON_BUFFER == 1
if (cm->show_frame)
@@ -398,6 +393,9 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1;
cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1;
+ pbi->mb.mi_8x8 = cm->mi_grid_visible;
+ pbi->mb.mi_8x8[0] = cm->mi;
+
cm->current_video_frame++;
}
diff --git a/libvpx/vp9/decoder/vp9_onyxd_int.h b/libvpx/vp9/decoder/vp9_onyxd_int.h
index a051971..7c4c9db 100644
--- a/libvpx/vp9/decoder/vp9_onyxd_int.h
+++ b/libvpx/vp9/decoder/vp9_onyxd_int.h
@@ -25,7 +25,7 @@ typedef struct VP9Decompressor {
VP9D_CONFIG oxcf;
const uint8_t *source;
- uint32_t source_sz;
+ size_t source_sz;
int64_t last_time_stamp;
int ready_for_new_data;
@@ -39,6 +39,18 @@ typedef struct VP9Decompressor {
int do_loopfilter_inline; // apply loopfilter to available rows immediately
VP9Worker lf_worker;
+
+ VP9Worker *tile_workers;
+ int num_tile_workers;
+
+ /* Each tile column has its own MODE_INFO stream. This array indexes them by
+ tile column index. */
+ MODE_INFO **mi_streams;
+
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ PARTITION_CONTEXT *above_seg_context;
+
+ DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
} VP9D_COMP;
-#endif // VP9_DECODER_VP9_TREEREADER_H_
+#endif // VP9_DECODER_VP9_ONYXD_INT_H_
diff --git a/libvpx/vp9/decoder/vp9_read_bit_buffer.h b/libvpx/vp9/decoder/vp9_read_bit_buffer.h
index c7fa3aa..41a6868 100644
--- a/libvpx/vp9/decoder/vp9_read_bit_buffer.h
+++ b/libvpx/vp9/decoder/vp9_read_bit_buffer.h
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VP9_READ_BIT_BUFFER_
-#define VP9_READ_BIT_BUFFER_
+#ifndef VP9_DECODER_VP9_READ_BIT_BUFFER_H_
+#define VP9_DECODER_VP9_READ_BIT_BUFFER_H_
#include <limits.h>
@@ -57,4 +57,4 @@ static int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb,
return vp9_rb_read_bit(rb) ? -value : value;
}
-#endif // VP9_READ_BIT_BUFFER_
+#endif // VP9_DECODER_VP9_READ_BIT_BUFFER_H_
diff --git a/libvpx/vp9/decoder/vp9_thread.c b/libvpx/vp9/decoder/vp9_thread.c
index dc3b681..d953e72 100644
--- a/libvpx/vp9/decoder/vp9_thread.c
+++ b/libvpx/vp9/decoder/vp9_thread.c
@@ -29,7 +29,7 @@ extern "C" {
//------------------------------------------------------------------------------
// simplistic pthread emulation layer
-#include <process.h>
+#include <process.h> // NOLINT
// _beginthreadex requires __stdcall
#define THREADFN unsigned int __stdcall
@@ -145,9 +145,7 @@ static THREADFN thread_loop(void *ptr) { // thread loop
pthread_cond_wait(&worker->condition_, &worker->mutex_);
}
if (worker->status_ == WORK) {
- if (worker->hook) {
- worker->had_error |= !worker->hook(worker->data1, worker->data2);
- }
+ vp9_worker_execute(worker);
worker->status_ = OK;
} else if (worker->status_ == NOT_OK) { // finish the worker
done = 1;
@@ -178,7 +176,7 @@ static void change_state(VP9Worker* const worker,
pthread_mutex_unlock(&worker->mutex_);
}
-#endif
+#endif // CONFIG_MULTITHREAD
//------------------------------------------------------------------------------
@@ -218,12 +216,17 @@ int vp9_worker_reset(VP9Worker* const worker) {
return ok;
}
+void vp9_worker_execute(VP9Worker* const worker) {
+ if (worker->hook != NULL) {
+ worker->had_error |= !worker->hook(worker->data1, worker->data2);
+ }
+}
+
void vp9_worker_launch(VP9Worker* const worker) {
#if CONFIG_MULTITHREAD
change_state(worker, WORK);
#else
- if (worker->hook)
- worker->had_error |= !worker->hook(worker->data1, worker->data2);
+ vp9_worker_execute(worker);
#endif
}
diff --git a/libvpx/vp9/decoder/vp9_thread.h b/libvpx/vp9/decoder/vp9_thread.h
index a8f7e04..a624f3c 100644
--- a/libvpx/vp9/decoder/vp9_thread.h
+++ b/libvpx/vp9/decoder/vp9_thread.h
@@ -17,7 +17,7 @@
#ifndef VP9_DECODER_VP9_THREAD_H_
#define VP9_DECODER_VP9_THREAD_H_
-#include "vpx_config.h"
+#include "./vpx_config.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
@@ -27,7 +27,7 @@ extern "C" {
#if defined(_WIN32)
-#include <windows.h>
+#include <windows.h> // NOLINT
typedef HANDLE pthread_t;
typedef CRITICAL_SECTION pthread_mutex_t;
typedef struct {
@@ -38,7 +38,7 @@ typedef struct {
#else
-#include <pthread.h>
+#include <pthread.h> // NOLINT
#endif /* _WIN32 */
#endif /* CONFIG_MULTITHREAD */
@@ -80,6 +80,11 @@ int vp9_worker_sync(VP9Worker* const worker);
// hook/data1/data2 can be changed at any time before calling this function,
// but not be changed afterward until the next call to vp9_worker_sync().
void vp9_worker_launch(VP9Worker* const worker);
+// This function is similar to vp9_worker_launch() except that it calls the
+// hook directly instead of using a thread, which is convenient for bypassing
+// the thread mechanism while still using the VP9Worker structs.
+// vp9_worker_sync() must still be called afterward (for error reporting).
+void vp9_worker_execute(VP9Worker* const worker);
// Kill the thread and terminate the object. To use the object again, one
// must call vp9_worker_reset() again.
void vp9_worker_end(VP9Worker* const worker);
@@ -90,4 +95,4 @@ void vp9_worker_end(VP9Worker* const worker);
} // extern "C"
#endif
-#endif /* VP9_DECODER_VP9_THREAD_H_ */
+#endif // VP9_DECODER_VP9_THREAD_H_
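
A minimal usage sketch of the synchronous path; my_hook is a hypothetical stand-in, and a real hook returns nonzero on success, as tile_worker_hook does above:

  static int my_hook(void *data1, void *data2) {
    (void)data1; (void)data2;
    return 1;                           // hypothetical: report success
  }

  static int run_synchronously(void) {
    VP9Worker worker;
    vp9_worker_init(&worker);
    worker.hook = my_hook;
    worker.had_error = 0;
    vp9_worker_execute(&worker);        // hook runs on this thread
    return vp9_worker_sync(&worker);    // still required for error reporting
  }
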
diff --git a/libvpx/vp9/decoder/vp9_treereader.h b/libvpx/vp9/decoder/vp9_treereader.h
index 710cc4c..f612497 100644
--- a/libvpx/vp9/decoder/vp9_treereader.h
+++ b/libvpx/vp9/decoder/vp9_treereader.h
@@ -23,7 +23,8 @@ static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
const vp9_prob *const p) {
register vp9_tree_index i = 0;
- while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0);
+ while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0)
+ continue;
return -i;
}
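
treed_read() walks a vp9_tree stored as an array of index pairs: a read bit selects the left or right entry of the current pair, positive entries point at further pairs, and non-positive entries are negated leaf tokens. A self-contained toy walk over a three-symbol tree with scripted bits; the tree encoding mirrors the convention above, and the bit source is a stand-in for vp9_read():

  #include <assert.h>

  typedef signed char tree_index;

  // Toy 3-symbol tree: node 0 -> {leaf A, node 2}, node 2 -> {leaf B, leaf C}.
  static const tree_index tree[4] = { -0, 2, -1, -2 };  // tokens A=0, B=1, C=2

  static int next_bit(const int **bits) { return *(*bits)++; }  // scripted

  static int toy_treed_read(const int **bits) {
    tree_index i = 0;
    while ((i = tree[i + next_bit(bits)]) > 0)
      continue;
    return -i;
  }

  int main(void) {
    const int b0[] = { 0 }, b1[] = { 1, 0 }, b2[] = { 1, 1 };
    const int *p;
    p = b0; assert(toy_treed_read(&p) == 0);  // bit 0    -> token A
    p = b1; assert(toy_treed_read(&p) == 1);  // bits 1,0 -> token B
    p = b2; assert(toy_treed_read(&p) == 2);  // bits 1,1 -> token C
    return 0;
  }
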
diff --git a/libvpx/vp9/decoder/x86/vp9_dequantize_sse2.c b/libvpx/vp9/decoder/x86/vp9_dequantize_sse2.c
deleted file mode 100644
index 54ec67f..0000000
--- a/libvpx/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-#include <emmintrin.h> // SSE2
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_idct.h"
-
-void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
- int stride) {
- uint8_t abs_diff;
- __m128i d;
-
- // Prediction data.
- __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
- __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
- __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
- __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
- __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
- __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
- __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
- __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));
-
- p0 = _mm_unpacklo_epi64(p0, p1);
- p2 = _mm_unpacklo_epi64(p2, p3);
- p4 = _mm_unpacklo_epi64(p4, p5);
- p6 = _mm_unpacklo_epi64(p6, p7);
-
- // Clip diff value to [0, 255] range. Then, do addition or subtraction
- // according to its sign.
- if (diff >= 0) {
- abs_diff = (diff > 255) ? 255 : diff;
- d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
-
- p0 = _mm_adds_epu8(p0, d);
- p2 = _mm_adds_epu8(p2, d);
- p4 = _mm_adds_epu8(p4, d);
- p6 = _mm_adds_epu8(p6, d);
- } else {
- abs_diff = (diff < -255) ? 255 : -diff;
- d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
-
- p0 = _mm_subs_epu8(p0, d);
- p2 = _mm_subs_epu8(p2, d);
- p4 = _mm_subs_epu8(p4, d);
- p6 = _mm_subs_epu8(p6, d);
- }
-
- _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
- p0 = _mm_srli_si128(p0, 8);
- _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);
-
- _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
- p2 = _mm_srli_si128(p2, 8);
- _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);
-
- _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
- p4 = _mm_srli_si128(p4, 8);
- _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);
-
- _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
- p6 = _mm_srli_si128(p6, 8);
- _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
-}
-
-void vp9_add_constant_residual_16x16_sse2(const int16_t diff, uint8_t *dest,
- int stride) {
- uint8_t abs_diff;
- __m128i d;
-
- // Prediction data.
- __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
- __m128i p1 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
- __m128i p2 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
- __m128i p3 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
- __m128i p4 = _mm_load_si128((const __m128i *)(dest + 4 * stride));
- __m128i p5 = _mm_load_si128((const __m128i *)(dest + 5 * stride));
- __m128i p6 = _mm_load_si128((const __m128i *)(dest + 6 * stride));
- __m128i p7 = _mm_load_si128((const __m128i *)(dest + 7 * stride));
- __m128i p8 = _mm_load_si128((const __m128i *)(dest + 8 * stride));
- __m128i p9 = _mm_load_si128((const __m128i *)(dest + 9 * stride));
- __m128i p10 = _mm_load_si128((const __m128i *)(dest + 10 * stride));
- __m128i p11 = _mm_load_si128((const __m128i *)(dest + 11 * stride));
- __m128i p12 = _mm_load_si128((const __m128i *)(dest + 12 * stride));
- __m128i p13 = _mm_load_si128((const __m128i *)(dest + 13 * stride));
- __m128i p14 = _mm_load_si128((const __m128i *)(dest + 14 * stride));
- __m128i p15 = _mm_load_si128((const __m128i *)(dest + 15 * stride));
-
- // Clip diff value to [0, 255] range. Then, do addition or subtraction
- // according to its sign.
- if (diff >= 0) {
- abs_diff = (diff > 255) ? 255 : diff;
- d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
-
- p0 = _mm_adds_epu8(p0, d);
- p1 = _mm_adds_epu8(p1, d);
- p2 = _mm_adds_epu8(p2, d);
- p3 = _mm_adds_epu8(p3, d);
- p4 = _mm_adds_epu8(p4, d);
- p5 = _mm_adds_epu8(p5, d);
- p6 = _mm_adds_epu8(p6, d);
- p7 = _mm_adds_epu8(p7, d);
- p8 = _mm_adds_epu8(p8, d);
- p9 = _mm_adds_epu8(p9, d);
- p10 = _mm_adds_epu8(p10, d);
- p11 = _mm_adds_epu8(p11, d);
- p12 = _mm_adds_epu8(p12, d);
- p13 = _mm_adds_epu8(p13, d);
- p14 = _mm_adds_epu8(p14, d);
- p15 = _mm_adds_epu8(p15, d);
- } else {
- abs_diff = (diff < -255) ? 255 : -diff;
- d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
-
- p0 = _mm_subs_epu8(p0, d);
- p1 = _mm_subs_epu8(p1, d);
- p2 = _mm_subs_epu8(p2, d);
- p3 = _mm_subs_epu8(p3, d);
- p4 = _mm_subs_epu8(p4, d);
- p5 = _mm_subs_epu8(p5, d);
- p6 = _mm_subs_epu8(p6, d);
- p7 = _mm_subs_epu8(p7, d);
- p8 = _mm_subs_epu8(p8, d);
- p9 = _mm_subs_epu8(p9, d);
- p10 = _mm_subs_epu8(p10, d);
- p11 = _mm_subs_epu8(p11, d);
- p12 = _mm_subs_epu8(p12, d);
- p13 = _mm_subs_epu8(p13, d);
- p14 = _mm_subs_epu8(p14, d);
- p15 = _mm_subs_epu8(p15, d);
- }
-
- // Store results
- _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
- _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
- _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
- _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
- _mm_store_si128((__m128i *)(dest + 4 * stride), p4);
- _mm_store_si128((__m128i *)(dest + 5 * stride), p5);
- _mm_store_si128((__m128i *)(dest + 6 * stride), p6);
- _mm_store_si128((__m128i *)(dest + 7 * stride), p7);
- _mm_store_si128((__m128i *)(dest + 8 * stride), p8);
- _mm_store_si128((__m128i *)(dest + 9 * stride), p9);
- _mm_store_si128((__m128i *)(dest + 10 * stride), p10);
- _mm_store_si128((__m128i *)(dest + 11 * stride), p11);
- _mm_store_si128((__m128i *)(dest + 12 * stride), p12);
- _mm_store_si128((__m128i *)(dest + 13 * stride), p13);
- _mm_store_si128((__m128i *)(dest + 14 * stride), p14);
- _mm_store_si128((__m128i *)(dest + 15 * stride), p15);
-}
-
-void vp9_add_constant_residual_32x32_sse2(const int16_t diff, uint8_t *dest,
- int stride) {
- uint8_t abs_diff;
- __m128i d;
- int i = 8;
-
- if (diff >= 0) {
- abs_diff = (diff > 255) ? 255 : diff;
- d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
- } else {
- abs_diff = (diff < -255) ? 255 : -diff;
- d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
- }
-
- do {
- // Prediction data.
- __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
- __m128i p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
- __m128i p2 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
- __m128i p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
- __m128i p4 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
- __m128i p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride + 16));
- __m128i p6 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
- __m128i p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride + 16));
-
- // Clip diff value to [0, 255] range. Then, do addition or subtraction
- // according to its sign.
- if (diff >= 0) {
- p0 = _mm_adds_epu8(p0, d);
- p1 = _mm_adds_epu8(p1, d);
- p2 = _mm_adds_epu8(p2, d);
- p3 = _mm_adds_epu8(p3, d);
- p4 = _mm_adds_epu8(p4, d);
- p5 = _mm_adds_epu8(p5, d);
- p6 = _mm_adds_epu8(p6, d);
- p7 = _mm_adds_epu8(p7, d);
- } else {
- p0 = _mm_subs_epu8(p0, d);
- p1 = _mm_subs_epu8(p1, d);
- p2 = _mm_subs_epu8(p2, d);
- p3 = _mm_subs_epu8(p3, d);
- p4 = _mm_subs_epu8(p4, d);
- p5 = _mm_subs_epu8(p5, d);
- p6 = _mm_subs_epu8(p6, d);
- p7 = _mm_subs_epu8(p7, d);
- }
-
- // Store results
- _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
- _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
- _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
- _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
- _mm_store_si128((__m128i *)(dest + 2 * stride), p4);
- _mm_store_si128((__m128i *)(dest + 2 * stride + 16), p5);
- _mm_store_si128((__m128i *)(dest + 3 * stride), p6);
- _mm_store_si128((__m128i *)(dest + 3 * stride + 16), p7);
-
- dest += 4 * stride;
- } while (--i);
-}
diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c
index 957cfd2..87bd36c 100644
--- a/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/libvpx/vp9/encoder/vp9_bitstream.c
@@ -22,7 +22,6 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_entropy.h"
-#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_treecoder.h"
#include "vp9/common/vp9_systemdependent.h"
@@ -54,8 +53,7 @@ extern unsigned int active_section;
int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES];
int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1];
int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-int64_t switchable_interp_stats[SWITCHABLE_FILTERS+1]
- [SWITCHABLE_FILTERS];
+int64_t switchable_interp_stats[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
void init_tx_count_stats() {
vp9_zero(tx_count_32x32p_stats);
@@ -88,10 +86,9 @@ static void update_tx_count_stats(VP9_COMMON *cm) {
static void update_switchable_interp_stats(VP9_COMMON *cm) {
int i, j;
- for (i = 0; i < SWITCHABLE_FILTERS+1; ++i)
- for (j = 0; j < SWITCHABLE_FILTERS; ++j) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ for (j = 0; j < SWITCHABLE_FILTERS; ++j)
switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j];
- }
}
void write_tx_count_stats() {
@@ -141,9 +138,9 @@ void write_switchable_interp_stats() {
fclose(fp);
printf(
- "vp9_default_switchable_filter_count[SWITCHABLE_FILTERS+1]"
+ "vp9_default_switchable_filter_count[SWITCHABLE_FILTER_CONTEXTS]"
"[SWITCHABLE_FILTERS] = {\n");
- for (i = 0; i < SWITCHABLE_FILTERS+1; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
printf(" { ");
for (j = 0; j < SWITCHABLE_FILTERS; j++) {
printf("%"PRId64", ", switchable_interp_stats[i][j]);
@@ -166,34 +163,27 @@ void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb,
vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
}
-static void update_mode(
- vp9_writer *w,
- int n,
- vp9_tree tree,
- vp9_prob Pnew[/* n-1 */],
- vp9_prob Pcur[/* n-1 */],
- unsigned int bct[/* n-1 */] [2],
- const unsigned int num_events[/* n */]
-) {
+static void update_mode(vp9_writer *w, int n, vp9_tree tree,
+ vp9_prob Pcur[/* n-1 */],
+ unsigned int bct[/* n-1 */][2],
+ const unsigned int num_events[/* n */]) {
int i = 0;
- vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0);
+ vp9_tree_probs_from_distribution(tree, bct, num_events, 0);
n--;
- for (i = 0; i < n; ++i) {
- vp9_cond_prob_diff_update(w, &Pcur[i], MODE_UPDATE_PROB, bct[i]);
- }
+ for (i = 0; i < n; ++i)
+ vp9_cond_prob_diff_update(w, &Pcur[i], bct[i]);
}
static void update_mbintra_mode_probs(VP9_COMP* const cpi,
vp9_writer* const bc) {
VP9_COMMON *const cm = &cpi->common;
int j;
- vp9_prob pnew[INTRA_MODES - 1];
unsigned int bct[INTRA_MODES - 1][2];
for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
- update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, pnew,
+ update_mode(bc, INTRA_MODES, vp9_intra_mode_tree,
cm->fc.y_mode_prob[j], bct,
(unsigned int *)cpi->y_mode_count[j]);
}
@@ -228,53 +218,43 @@ void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) {
int k;
for (k = 0; k < MBSKIP_CONTEXTS; ++k)
- vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k],
- MODE_UPDATE_PROB, cm->counts.mbskip[k]);
+ vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], cm->counts.mbskip[k]);
}
static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m);
}
-static void update_switchable_interp_probs(VP9_COMP *const cpi,
- vp9_writer* const bc) {
+static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) {
VP9_COMMON *const cm = &cpi->common;
- unsigned int branch_ct[SWITCHABLE_FILTERS + 1]
- [SWITCHABLE_FILTERS - 1][2];
- vp9_prob new_prob[SWITCHABLE_FILTERS + 1][SWITCHABLE_FILTERS - 1];
+ unsigned int branch_ct[SWITCHABLE_FILTERS - 1][2];
int i, j;
- for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
- vp9_tree_probs_from_distribution(
- vp9_switchable_interp_tree,
- new_prob[j], branch_ct[j],
- cm->counts.switchable_interp[j], 0);
- }
- for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
- for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
- vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
- MODE_UPDATE_PROB, branch_ct[j][i]);
- }
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
+ vp9_tree_probs_from_distribution(vp9_switchable_interp_tree, branch_ct,
+ cm->counts.switchable_interp[j], 0);
+
+ for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
+ vp9_cond_prob_diff_update(w, &cm->fc.switchable_interp_prob[j][i],
+ branch_ct[i]);
}
+
#ifdef MODE_STATS
if (!cpi->dummy_packing)
update_switchable_interp_stats(cm);
#endif
}
-static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) {
+static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer *w) {
int i, j;
for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
unsigned int branch_ct[INTER_MODES - 1][2];
- vp9_prob new_prob[INTER_MODES - 1];
-
- vp9_tree_probs_from_distribution(vp9_inter_mode_tree,
- new_prob, branch_ct,
+ vp9_tree_probs_from_distribution(vp9_inter_mode_tree, branch_ct,
cm->counts.inter_mode[i], NEARESTMV);
for (j = 0; j < INTER_MODES - 1; ++j)
- vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j],
- MODE_UPDATE_PROB, branch_ct[j]);
+ vp9_cond_prob_diff_update(w, &cm->fc.inter_mode_probs[i][j],
+ branch_ct[j]);
}
}
@@ -283,7 +263,7 @@ static void pack_mb_tokens(vp9_writer* const bc,
const TOKENEXTRA *const stop) {
TOKENEXTRA *p = *tp;
- while (p < stop) {
+ while (p < stop && p->token != EOSB_TOKEN) {
const int t = p->token;
const struct vp9_token *const a = vp9_coef_encodings + t;
const vp9_extra_bit *const b = vp9_extra_bits + t;
@@ -293,10 +273,6 @@ static void pack_mb_tokens(vp9_writer* const bc,
int n = a->len;
vp9_prob probs[ENTROPY_NODES];
- if (t == EOSB_TOKEN) {
- ++p;
- break;
- }
if (t >= TWO_TOKEN) {
vp9_model_to_full_probs(p->context_tree, probs);
pp = probs;
@@ -338,14 +314,14 @@ static void pack_mb_tokens(vp9_writer* const bc,
++p;
}
- *tp = p;
+ *tp = p + (p->token == EOSB_TOKEN);
}
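
pack_mb_tokens() now treats EOSB_TOKEN purely as a loop sentinel: the while condition stops on it, and the closing *tp = p + (p->token == EOSB_TOKEN); steps over the sentinel only when one was actually hit, since the stop bound can also end the loop. A self-contained illustration of the same pattern (the guard at the end is added here for safety; the real code relies on the token buffer's sentinel layout):

    #include <stdio.h>

    int main(void) {
      int tokens[] = { 3, 1, 4, -1 /* sentinel, like EOSB_TOKEN */, 9, 9 };
      int *p = tokens, *stop = tokens + 6;
      while (p < stop && *p != -1)     /* consume until sentinel or hard end */
        printf("%d ", *p++);
      if (p < stop)                    /* guard the read at the hard end */
        p += (*p == -1);               /* skip the sentinel, if present */
      printf("\nresume at index %d\n", (int)(p - tokens));
      return 0;
    }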
static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode,
const vp9_prob *p) {
assert(is_inter_mode(mode));
write_token(w, vp9_inter_mode_tree, p,
- &vp9_inter_mode_encodings[mode - NEARESTMV]);
+ &vp9_inter_mode_encodings[inter_mode_offset(mode)]);
}
@@ -360,7 +336,7 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *mi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *mi = &xd->mi_8x8[0]->mbmi;
const int segment_id = mi->segment_id;
int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
SEG_LVL_REF_FRAME);
@@ -393,8 +369,8 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
mi->ref_frame[0]);
}
- // if using the prediction mdoel we have nothing further to do because
- // the reference frame is fully coded by the segment
+ // If using the prediction model we have nothing further to do because
+ // the reference frame is fully coded by the segment.
}
static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
@@ -409,9 +385,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
const int segment_id = mi->segment_id;
int skip_coeff;
const BLOCK_SIZE bsize = mi->sb_type;
- const int allow_hp = xd->allow_high_precision_mv;
-
- x->partition_info = x->pi + (m - cm->mi);
+ const int allow_hp = cm->allow_high_precision_mv;
#ifdef ENTROPY_STATS
active_section = 9;
@@ -488,17 +462,13 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
}
if (bsize < BLOCK_8X8) {
- int j;
- MB_PREDICTION_MODE blockmode;
- int_mv blockmv;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
int idx, idy;
for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
- j = idy * 2 + idx;
- blockmode = x->partition_info->bmi[j].mode;
- blockmv = m->bmi[j].as_mv[0];
+ const int j = idy * 2 + idx;
+ const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode;
write_sb_mv_ref(bc, blockmode, mv_ref_p);
++cm->counts.inter_mode[mi->mode_context[rf]]
[inter_mode_offset(blockmode)];
@@ -507,14 +477,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
#ifdef ENTROPY_STATS
active_section = 11;
#endif
- vp9_encode_mv(cpi, bc, &blockmv.as_mv, &mi->best_mv.as_mv,
- nmvc, allow_hp);
-
- if (mi->ref_frame[1] > INTRA_FRAME)
- vp9_encode_mv(cpi, bc,
- &m->bmi[j].as_mv[1].as_mv,
- &mi->best_second_mv.as_mv,
- nmvc, allow_hp);
+ vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[0].as_mv,
+ &mi->best_mv[0].as_mv, nmvc, allow_hp);
+
+ if (has_second_ref(mi))
+ vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv,
+ &mi->best_mv[1].as_mv, nmvc, allow_hp);
}
}
}
@@ -522,12 +490,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
#ifdef ENTROPY_STATS
active_section = 5;
#endif
- vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, &mi->best_mv.as_mv,
- nmvc, allow_hp);
+ vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv,
+ &mi->best_mv[0].as_mv, nmvc, allow_hp);
- if (mi->ref_frame[1] > INTRA_FRAME)
- vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, &mi->best_second_mv.as_mv,
- nmvc, allow_hp);
+ if (has_second_ref(mi))
+ vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv,
+ &mi->best_mv[1].as_mv, nmvc, allow_hp);
}
}
}
@@ -541,7 +509,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
const int ym = m->mbmi.mode;
const int segment_id = m->mbmi.segment_id;
MODE_INFO *above_mi = mi_8x8[-xd->mode_info_stride];
- MODE_INFO *left_mi = mi_8x8[-1];
+ MODE_INFO *left_mi = xd->left_available ? mi_8x8[-1] : NULL;
if (seg->update_map)
write_segment_id(bc, seg, m->mbmi.segment_id);
@@ -553,8 +521,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
if (m->mbmi.sb_type >= BLOCK_8X8) {
const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0);
- const MB_PREDICTION_MODE L = xd->left_available ?
- left_block_mode(m, left_mi, 0) : DC_PRED;
+ const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, 0);
write_intra_mode(bc, ym, vp9_kf_y_mode_prob[A][L]);
} else {
int idx, idy;
@@ -564,8 +531,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
int i = idy * 2 + idx;
const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, i);
- const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
- left_block_mode(m, left_mi, i) : DC_PRED;
+ const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, i);
const int bm = m->bmi[i].as_mode;
#ifdef ENTROPY_STATS
++intra_mode_stats[A][L][bm];
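
The left_mi change above moves the availability check out of the call sites: the first column now passes left_mi == NULL and lets the neighbor-mode helper supply the DC_PRED default itself, which is also why the sub-8x8 (xd->left_available || idx) special case disappears. A sketch of the helper's contract after this change (not its exact body):

    /* Sketch of the contract: a NULL left neighbor decodes as DC_PRED. */
    static MB_PREDICTION_MODE left_block_mode_sketch(const MODE_INFO *cur,
                                                     const MODE_INFO *left,
                                                     int b) {
      if (b & 1)                          /* right sub-blocks: neighbor is */
        return cur->bmi[b - 1].as_mode;   /* inside the current block */
      return left ? left->bmi[b + 1].as_mode : DC_PRED;
    }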
@@ -578,24 +544,25 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
write_intra_mode(bc, m->mbmi.uv_mode, vp9_kf_uv_mode_prob[ym]);
}
-static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
+static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8, vp9_writer *bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end,
- int mi_row, int mi_col) {
+ int mi_row, int mi_col, int index) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
MODE_INFO *m = mi_8x8[0];
if (m->mbmi.sb_type < BLOCK_8X8)
- if (xd->ab_index > 0)
+ if (index > 0)
return;
- xd->this_mi = mi_8x8[0];
xd->mi_8x8 = mi_8x8;
- set_mi_row_col(&cpi->common, xd,
+ set_mi_row_col(xd, tile,
mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
- mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type]);
- if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
+ mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type],
+ cm->mi_rows, cm->mi_cols);
+ if (frame_is_intra_only(cm)) {
write_mb_modes_kf(cpi, mi_8x8, bc);
#ifdef ENTROPY_STATS
active_section = 8;
@@ -611,11 +578,35 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
pack_mb_tokens(bc, tok, tok_end);
}
-static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
+static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col,
+ PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int ctx = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
+ const vp9_prob *const probs = get_partition_probs(cm, ctx);
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+
+ if (has_rows && has_cols) {
+ write_token(w, vp9_partition_tree, probs, &vp9_partition_encodings[p]);
+ } else if (!has_rows && has_cols) {
+ assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+ vp9_write(w, p == PARTITION_SPLIT, probs[1]);
+ } else if (has_rows && !has_cols) {
+ assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+ vp9_write(w, p == PARTITION_SPLIT, probs[2]);
+ } else {
+ assert(p == PARTITION_SPLIT);
+ }
+}
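
write_partition() centralizes the edge handling that write_modes_sb() used to do inline: for an interior block the full partition tree is coded, but when the block crosses the bottom or right frame edge only one non-split type remains legal, so a single bit (split vs. that type) is written with the matching interior-node probability, and when it crosses both edges PARTITION_SPLIT is implied and costs no bits. A hedged sketch of the mirror-image read side, using a hypothetical tree reader:

    /* Sketch, not the decoder's actual body; read_tree() is hypothetical,
     * vp9_read() stands for the boolean decoder's bit read. */
    static PARTITION_TYPE read_partition_sketch(vp9_reader *r,
                                                const vp9_prob *probs,
                                                int has_rows, int has_cols) {
      if (has_rows && has_cols)
        return read_tree(r, vp9_partition_tree, probs);  /* full tree */
      if (has_cols)  /* past the bottom edge: HORZ or SPLIT only */
        return vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
      if (has_rows)  /* past the right edge: VERT or SPLIT only */
        return vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
      return PARTITION_SPLIT;                            /* implied */
    }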
+
+static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8, vp9_writer *bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end,
- int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int index) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
const int mis = cm->mode_info_stride;
int bsl = b_width_log2(bsize);
int bs = (1 << bsl) / 4; // mode_info step for subsize
@@ -629,52 +620,37 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
partition = partition_lookup[bsl][m->mbmi.sb_type];
- if (bsize < BLOCK_8X8)
- if (xd->ab_index > 0)
+ if (bsize < BLOCK_8X8) {
+ if (index > 0)
return;
-
- if (bsize >= BLOCK_8X8) {
- int pl;
- const int idx = check_bsize_coverage(bs, cm->mi_rows, cm->mi_cols,
- mi_row, mi_col);
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- // encode the partition information
- if (idx == 0)
- write_token(bc, vp9_partition_tree,
- cm->fc.partition_prob[cm->frame_type][pl],
- vp9_partition_encodings + partition);
- else if (idx > 0)
- vp9_write(bc, partition == PARTITION_SPLIT,
- cm->fc.partition_prob[cm->frame_type][pl][idx]);
+ } else {
+ write_partition(cpi, bs, mi_row, mi_col, partition, bsize, bc);
}
subsize = get_subsize(bsize, partition);
- *(get_sb_index(xd, subsize)) = 0;
switch (partition) {
case PARTITION_NONE:
- write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
break;
case PARTITION_HORZ:
- write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
- *(get_sb_index(xd, subsize)) = 1;
+ write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
if ((mi_row + bs) < cm->mi_rows)
- write_modes_b(cpi, mi_8x8 + bs * mis, bc, tok, tok_end, mi_row + bs,
- mi_col);
+ write_modes_b(cpi, tile, mi_8x8 + bs * mis, bc, tok, tok_end,
+ mi_row + bs, mi_col, 1);
break;
case PARTITION_VERT:
- write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
- *(get_sb_index(xd, subsize)) = 1;
+ write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
if ((mi_col + bs) < cm->mi_cols)
- write_modes_b(cpi, mi_8x8 + bs, bc, tok, tok_end, mi_row, mi_col + bs);
+ write_modes_b(cpi, tile, mi_8x8 + bs, bc, tok, tok_end,
+ mi_row, mi_col + bs, 1);
break;
case PARTITION_SPLIT:
for (n = 0; n < 4; n++) {
- int j = n >> 1, i = n & 0x01;
- *(get_sb_index(xd, subsize)) = n;
- write_modes_sb(cpi, mi_8x8 + j * bs * mis + i * bs, bc, tok, tok_end,
- mi_row + j * bs, mi_col + i * bs, subsize);
+ const int j = n >> 1, i = n & 1;
+ write_modes_sb(cpi, tile, mi_8x8 + j * bs * mis + i * bs, bc,
+ tok, tok_end,
+ mi_row + j * bs, mi_col + i * bs, subsize, n);
}
break;
default:
@@ -683,13 +659,13 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
// update partition context
if (bsize >= BLOCK_8X8 &&
- (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- update_partition_context(xd, subsize, bsize);
- }
+ (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
+ update_partition_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, subsize, bsize);
}
-static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
+static void write_modes(VP9_COMP *cpi, const TileInfo *const tile,
+ vp9_writer* const bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
VP9_COMMON *const cm = &cpi->common;
const int mis = cm->mode_info_stride;
@@ -697,57 +673,27 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
MODE_INFO **mi_8x8 = cm->mi_grid_visible;
MODE_INFO **m_8x8;
- mi_8x8 += cm->cur_tile_mi_col_start + cm->cur_tile_mi_row_start * mis;
+ mi_8x8 += tile->mi_col_start + tile->mi_row_start * mis;
- for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end;
+ for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += 8, mi_8x8 += 8 * mis) {
m_8x8 = mi_8x8;
- vp9_zero(cm->left_seg_context);
- for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ vp9_zero(cpi->left_seg_context);
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) {
- write_modes_sb(cpi, m_8x8, bc, tok, tok_end, mi_row, mi_col,
- BLOCK_64X64);
+ write_modes_sb(cpi, tile, m_8x8, bc, tok, tok_end, mi_row, mi_col,
+ BLOCK_64X64, 0);
}
}
}
-/* This function is used for debugging probability trees. */
-static void print_prob_tree(vp9_coeff_probs *coef_probs, int block_types) {
- /* print coef probability tree */
- int i, j, k, l, m;
- FILE *f = fopen("enc_tree_probs.txt", "a");
- fprintf(f, "{\n");
- for (i = 0; i < block_types; i++) {
- fprintf(f, " {\n");
- for (j = 0; j < REF_TYPES; ++j) {
- fprintf(f, " {\n");
- for (k = 0; k < COEF_BANDS; k++) {
- fprintf(f, " {\n");
- for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
- fprintf(f, " {");
- for (m = 0; m < ENTROPY_NODES; m++) {
- fprintf(f, "%3u, ",
- (unsigned int)(coef_probs[i][j][k][l][m]));
- }
- }
- fprintf(f, " }\n");
- }
- fprintf(f, " }\n");
- }
- fprintf(f, " }\n");
- }
- fprintf(f, "}\n");
- fclose(f);
-}
-
static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) {
vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[tx_size];
vp9_coeff_count *coef_counts = cpi->coef_counts[tx_size];
unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
cpi->common.counts.eob_branch[tx_size];
vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[tx_size];
- vp9_prob full_probs[ENTROPY_NODES];
- int i, j, k, l;
+ int i, j, k, l, m;
for (i = 0; i < BLOCK_TYPES; ++i) {
for (j = 0; j < REF_TYPES; ++j) {
@@ -756,16 +702,14 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) {
if (l >= 3 && k == 0)
continue;
vp9_tree_probs_from_distribution(vp9_coef_tree,
- full_probs,
coef_branch_ct[i][j][k][l],
coef_counts[i][j][k][l], 0);
- vpx_memcpy(coef_probs[i][j][k][l], full_probs,
- sizeof(vp9_prob) * UNCONSTRAINED_NODES);
coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
coef_branch_ct[i][j][k][l][0][0];
- coef_probs[i][j][k][l][0] =
- get_binary_prob(coef_branch_ct[i][j][k][l][0][0],
- coef_branch_ct[i][j][k][l][0][1]);
+ for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ coef_probs[i][j][k][l][m] = get_binary_prob(
+ coef_branch_ct[i][j][k][l][m][0],
+ coef_branch_ct[i][j][k][l][m][1]);
#ifdef ENTROPY_STATS
if (!cpi->dummy_packing) {
int t;
@@ -794,7 +738,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
vp9_coeff_probs_model *old_frame_coef_probs =
cpi->common.fc.coef_probs[tx_size];
vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size];
- const vp9_prob upd = VP9_COEF_UPDATE_PROB;
+ const vp9_prob upd = DIFF_UPDATE_PROB;
const int entropy_nodes_update = UNCONSTRAINED_NODES;
int i, j, k, l, t;
switch (cpi->sf.use_fast_coef_updates) {
@@ -849,7 +793,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
for (t = 0; t < entropy_nodes_update; ++t) {
vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
- const vp9_prob upd = VP9_COEF_UPDATE_PROB;
+ const vp9_prob upd = DIFF_UPDATE_PROB;
int s;
int u = 0;
if (l >= 3 && k == 0)
@@ -1132,26 +1076,23 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i],
- ct_8x8p);
+ tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p);
for (j = 0; j < TX_SIZES - 3; j++)
- vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j],
- MODE_UPDATE_PROB, ct_8x8p[j]);
+ vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], ct_8x8p[j]);
}
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i],
- ct_16x16p);
+ tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p);
for (j = 0; j < TX_SIZES - 2; j++)
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j],
- MODE_UPDATE_PROB, ct_16x16p[j]);
+ ct_16x16p[j]);
}
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p);
for (j = 0; j < TX_SIZES - 1; j++)
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j],
- MODE_UPDATE_PROB, ct_32x32p[j]);
+ ct_32x32p[j]);
}
#ifdef MODE_STATS
if (!cpi->dummy_packing)
@@ -1160,9 +1101,9 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
}
}
-static void write_interp_filter_type(INTERPOLATIONFILTERTYPE type,
+static void write_interp_filter_type(INTERPOLATION_TYPE type,
struct vp9_write_bit_buffer *wb) {
- const int type_to_literal[] = { 1, 0, 2 };
+ const int type_to_literal[] = { 1, 0, 2, 3 };
vp9_wb_write_bit(wb, type == SWITCHABLE);
if (type != SWITCHABLE)
@@ -1178,7 +1119,7 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) {
int i, j, c = 0;
for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
count[i] = 0;
- for (j = 0; j <= SWITCHABLE_FILTERS; ++j)
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
count[i] += cm->counts.switchable_interp[j][i];
c += (count[i] > 0);
}
@@ -1258,7 +1199,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
- vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+ vpx_memset(cpi->above_seg_context, 0, sizeof(*cpi->above_seg_context) *
mi_cols_aligned_to_sb(cm->mi_cols));
tok[0][0] = cpi->tok;
@@ -1273,9 +1214,10 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
}
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(cm, tile_row);
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
- vp9_get_tile_col_offsets(cm, tile_col);
+ TileInfo tile;
+
+ vp9_tile_init(&tile, cm, 0, tile_col);
tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
@@ -1283,7 +1225,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
else
vp9_start_encode(&residual_bc, data_ptr + total_size);
- write_modes(cpi, &residual_bc, &tok[tile_row][tile_col], tok_end);
+ write_modes(cpi, &tile, &residual_bc, &tok[tile_row][tile_col], tok_end);
assert(tok[tile_row][tile_col] == tok_end);
vp9_stop_encode(&residual_bc);
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
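
encode_tiles() now builds a stack-local TileInfo via vp9_tile_init() instead of mutating the per-tile offsets that used to live in VP9_COMMON (the removed vp9_get_tile_row_offsets/vp9_get_tile_col_offsets calls), so the bounds travel down the call chain to write_modes(). The loop shape after the change, with the per-tile size bookkeeping elided:

    for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
      for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
        TileInfo tile;
        vp9_tile_init(&tile, cm, 0, tile_col);   /* bounds on the stack */
        /* ... pick the output position, then: */
        write_modes(cpi, &tile, &residual_bc, &tok[tile_row][tile_col],
                    tok_end);
        /* ... emit the 4-byte tile size for all but the last tile ... */
      }
    }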
@@ -1352,18 +1294,16 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
}
static void write_sync_code(struct vp9_write_bit_buffer *wb) {
- vp9_wb_write_literal(wb, SYNC_CODE_0, 8);
- vp9_wb_write_literal(wb, SYNC_CODE_1, 8);
- vp9_wb_write_literal(wb, SYNC_CODE_2, 8);
+ vp9_wb_write_literal(wb, VP9_SYNC_CODE_0, 8);
+ vp9_wb_write_literal(wb, VP9_SYNC_CODE_1, 8);
+ vp9_wb_write_literal(wb, VP9_SYNC_CODE_2, 8);
}
static void write_uncompressed_header(VP9_COMP *cpi,
struct vp9_write_bit_buffer *wb) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- // frame marker bits
- vp9_wb_write_literal(wb, 0x2, 2);
+ vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
// bitstream version.
// 00 - profile 0. 4:2:0 only
@@ -1377,18 +1317,10 @@ static void write_uncompressed_header(VP9_COMP *cpi,
vp9_wb_write_bit(wb, cm->error_resilient_mode);
if (cm->frame_type == KEY_FRAME) {
+ const COLOR_SPACE cs = UNKNOWN;
write_sync_code(wb);
- // colorspaces
- // 000 - Unknown
- // 001 - BT.601
- // 010 - BT.709
- // 011 - SMPTE-170
- // 100 - SMPTE-240
- // 101 - Reserved
- // 110 - Reserved
- // 111 - sRGB (RGB)
- vp9_wb_write_literal(wb, 0, 3);
- if (1 /* colorspace != sRGB */) {
+ vp9_wb_write_literal(wb, cs, 3);
+ if (cs != SRGB) {
vp9_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
if (cm->version == 1) {
vp9_wb_write_bit(wb, cm->subsampling_x);
@@ -1425,7 +1357,7 @@ static void write_uncompressed_header(VP9_COMP *cpi,
write_frame_size_with_refs(cpi, wb);
- vp9_wb_write_bit(wb, xd->allow_high_precision_mv);
+ vp9_wb_write_bit(wb, cm->allow_high_precision_mv);
fix_mcomp_filter_type(cpi);
write_interp_filter_type(cm->mcomp_filter_type, wb);
@@ -1467,7 +1399,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
vp9_update_skip_probs(cpi, &header_bc);
- if (cm->frame_type != KEY_FRAME) {
+ if (!frame_is_intra_only(cm)) {
int i;
#ifdef ENTROPY_STATS
active_section = 1;
@@ -1481,7 +1413,6 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
- MODE_UPDATE_PROB,
cpi->intra_inter_count[i]);
if (cm->allow_comp_inter_inter) {
@@ -1495,7 +1426,6 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
if (use_hybrid_pred)
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
- MODE_UPDATE_PROB,
cpi->comp_inter_count[i]);
}
}
@@ -1503,10 +1433,8 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
for (i = 0; i < REF_CONTEXTS; i++) {
vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
- MODE_UPDATE_PROB,
cpi->single_ref_count[i][0]);
vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
- MODE_UPDATE_PROB,
cpi->single_ref_count[i][1]);
}
}
@@ -1514,21 +1442,18 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
for (i = 0; i < REF_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
- MODE_UPDATE_PROB,
cpi->comp_ref_count[i]);
update_mbintra_mode_probs(cpi, &header_bc);
- for (i = 0; i < NUM_PARTITION_CONTEXTS; ++i) {
- vp9_prob pnew[PARTITION_TYPES - 1];
+ for (i = 0; i < PARTITION_CONTEXTS; ++i) {
unsigned int bct[PARTITION_TYPES - 1][2];
- update_mode(&header_bc, PARTITION_TYPES,
- vp9_partition_tree, pnew,
- fc->partition_prob[cm->frame_type][i], bct,
+ update_mode(&header_bc, PARTITION_TYPES, vp9_partition_tree,
+ fc->partition_prob[i], bct,
(unsigned int *)cpi->partition_count[i]);
}
- vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc);
+ vp9_write_nmv_probs(cpi, cm->allow_high_precision_mv, &header_bc);
}
vp9_stop_encode(&header_bc);
diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h
index 013047e..8033a4d 100644
--- a/libvpx/vp9/encoder/vp9_block.h
+++ b/libvpx/vp9/encoder/vp9_block.h
@@ -23,17 +23,11 @@ typedef struct {
int offset;
} search_site;
-typedef struct {
- struct {
- MB_PREDICTION_MODE mode;
- } bmi[4];
-} PARTITION_INFO;
-
// Structure to hold a snapshot of the coding context during the mode picking process
-// TODO Do we need all of these?
typedef struct {
MODE_INFO mic;
- PARTITION_INFO partition_info;
+ uint8_t *zcoeff_blk;
+ int num_4x4_blk;
int skip;
int_mv best_ref_mv;
int_mv second_best_ref_mv;
@@ -48,7 +42,7 @@ typedef struct {
int comp_pred_diff;
int single_pred_diff;
int64_t tx_rd_diff[TX_MODES];
- int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
// motion vector cache for adaptive motion search control in partition
// search loop
@@ -62,8 +56,8 @@ typedef struct {
} PICK_MODE_CONTEXT;
struct macroblock_plane {
- DECLARE_ALIGNED(16, int16_t, src_diff[64*64]);
- DECLARE_ALIGNED(16, int16_t, coeff[64*64]);
+ DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
+ DECLARE_ALIGNED(16, int16_t, coeff[64 * 64]);
struct buf_2d src;
// Quantizer settings
@@ -87,9 +81,6 @@ struct macroblock {
MACROBLOCKD e_mbd;
int skip_block;
- PARTITION_INFO *partition_info; /* work pointer */
- PARTITION_INFO *pi; /* Corresponds to upper left visible macroblock */
- PARTITION_INFO *pip; /* Base of allocated array */
search_site *ss;
int ss_count;
@@ -100,6 +91,7 @@ struct macroblock {
int sadperbit4;
int rddiv;
int rdmult;
+ unsigned int mb_energy;
unsigned int *mb_activity_ptr;
int *mb_norm_activity_ptr;
signed int act_zbin_adj;
@@ -123,11 +115,10 @@ struct macroblock {
int **mvsadcost;
int mbmode_cost[MB_MODE_COUNT];
- unsigned inter_mode_cost[INTER_MODE_CONTEXTS][MB_MODE_COUNT - NEARESTMV];
+ unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
int intra_uv_mode_cost[2][MB_MODE_COUNT];
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
- int switchable_interp_costs[SWITCHABLE_FILTERS + 1]
- [SWITCHABLE_FILTERS];
+ int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
// These define limits to motion vector components to prevent them
// from extending outside the UMV borders
@@ -136,6 +127,7 @@ struct macroblock {
int mv_row_min;
int mv_row_max;
+ uint8_t zcoeff_blk[TX_SIZES][256];
int skip;
int encode_breakout;
@@ -144,6 +136,7 @@ struct macroblock {
// note that token_costs is the cost when eob node is skipped
vp9_coeff_cost token_costs[TX_SIZES];
+ DECLARE_ALIGNED(16, uint8_t, token_cache[1024]);
int optimize;
@@ -172,19 +165,72 @@ struct macroblock {
PICK_MODE_CONTEXT sb32x64_context[2];
PICK_MODE_CONTEXT sb64x32_context[2];
PICK_MODE_CONTEXT sb64_context;
- int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
+ int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
BLOCK_SIZE b_partitioning[4][4][4];
BLOCK_SIZE mb_partitioning[4][4];
BLOCK_SIZE sb_partitioning[4];
BLOCK_SIZE sb64_partitioning;
- void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
- void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
- void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);
- void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);
- void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
- int y_blocks);
+ void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
+};
+
+// TODO(jingning): the variables used here are a little complicated. Further
+// refactoring is needed to organize the temporary buffers once recursive
+// partitioning down to 4x4 block size is enabled.
+static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ switch (bsize) {
+ case BLOCK_64X64:
+ return &x->sb64_context;
+ case BLOCK_64X32:
+ return &x->sb64x32_context[xd->sb_index];
+ case BLOCK_32X64:
+ return &x->sb32x64_context[xd->sb_index];
+ case BLOCK_32X32:
+ return &x->sb32_context[xd->sb_index];
+ case BLOCK_32X16:
+ return &x->sb32x16_context[xd->sb_index][xd->mb_index];
+ case BLOCK_16X32:
+ return &x->sb16x32_context[xd->sb_index][xd->mb_index];
+ case BLOCK_16X16:
+ return &x->mb_context[xd->sb_index][xd->mb_index];
+ case BLOCK_16X8:
+ return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+ case BLOCK_8X16:
+ return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
+ case BLOCK_8X8:
+ return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+ case BLOCK_8X4:
+ return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+ case BLOCK_4X8:
+ return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+ case BLOCK_4X4:
+ return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+ default:
+ assert(0);
+ return NULL;
+ }
+}
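
get_block_context() maps a block size to the PICK_MODE_CONTEXT slot cached for it, indexed by the sb/mb/b/ab cursors that the recursive partition search keeps in MACROBLOCKD. A hedged usage sketch; the field copy is illustrative, not the encoder's exact flow:

    /* During partition search: fetch the cache slot for this size and
     * snapshot the chosen mode so a parent node can restore it later. */
    PICK_MODE_CONTEXT *const ctx = get_block_context(x, BLOCK_16X16);
    ctx->mic = *xd->mi_8x8[0];   /* illustrative snapshot */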
+
+struct rdcost_block_args {
+ MACROBLOCK *x;
+ ENTROPY_CONTEXT t_above[16];
+ ENTROPY_CONTEXT t_left[16];
+ TX_SIZE tx_size;
+ int bw;
+ int bh;
+ int rate;
+ int64_t dist;
+ int64_t sse;
+ int this_rate;
+ int64_t this_dist;
+ int64_t this_sse;
+ int64_t this_rd;
+ int64_t best_rd;
+ int skip;
+ const int16_t *scan, *nb;
};
#endif // VP9_ENCODER_VP9_BLOCK_H_
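
The removal of PARTITION_INFO above is the header-side half of the bitstream change earlier in this patch: sub-8x8 modes now have a single source of truth in MODE_INFO, so the parallel partition_info/pi/pip pointers in struct macroblock go away as well.

    /* Before/after shape of the sub-8x8 mode lookup (see pack_inter_mode_mvs):
     *   old:  blockmode = x->partition_info->bmi[j].mode;
     *   new:  blockmode = m->bmi[j].as_mode;   // read MODE_INFO directly
     */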
diff --git a/libvpx/vp9/encoder/vp9_boolhuff.c b/libvpx/vp9/encoder/vp9_boolhuff.c
index 0f1aa59..32c136e 100644
--- a/libvpx/vp9/encoder/vp9_boolhuff.c
+++ b/libvpx/vp9/encoder/vp9_boolhuff.c
@@ -22,23 +22,28 @@ unsigned int active_section = 0;
#endif
const unsigned int vp9_prob_cost[256] = {
- 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
- 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
- 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
- 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
- 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
- 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
- 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
- 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
- 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
- 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
- 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
- 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
- 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
- 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
- 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
- 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
-};
+ 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161,
+ 1129, 1099, 1072, 1046, 1023, 1000, 979, 959, 940, 922, 905, 889,
+ 873, 858, 843, 829, 816, 803, 790, 778, 767, 755, 744, 733,
+ 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
+ 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541,
+ 534, 528, 522, 516, 511, 505, 499, 494, 488, 483, 477, 472,
+ 467, 462, 457, 452, 447, 442, 437, 433, 428, 424, 419, 415,
+ 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
+ 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321,
+ 317, 314, 311, 307, 304, 301, 297, 294, 291, 288, 285, 281,
+ 278, 275, 272, 269, 266, 263, 260, 257, 255, 252, 249, 246,
+ 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
+ 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184,
+ 181, 179, 177, 174, 172, 170, 168, 165, 163, 161, 159, 156,
+ 154, 152, 150, 148, 145, 143, 141, 139, 137, 135, 133, 131,
+ 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
+ 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84,
+ 82, 81, 79, 77, 75, 73, 72, 70, 68, 66, 65, 63,
+ 61, 60, 58, 56, 55, 53, 51, 50, 48, 46, 45, 43,
+ 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
+ 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6,
+ 4, 3, 1, 1};
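
The vp9_prob_cost reflow above (12 entries per row) changes no values. Each entry is the cost, in 1/256-bit units, of coding a zero bit whose probability is p/256, i.e. approximately -256 * log2(p/256), with entry 0 clamped to match entry 1. A self-contained check that regenerates the table to within rounding; the checked-in values can differ by about 1 from this float formula, so treat it as a sanity check, not the canonical generator:

    #include <math.h>
    #include <stdio.h>

    /* Build with -lm. Prints ~vp9_prob_cost[1..255]; expect +/-1 vs. the
     * table above because the exact rounding rule here is an assumption. */
    int main(void) {
      int p;
      for (p = 1; p < 256; ++p)
        printf("%4d%s", (int)(-256.0 * log2(p / 256.0)),
               (p % 12) ? ", " : ",\n");
      putchar('\n');
      return 0;
    }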
void vp9_start_encode(vp9_writer *br, uint8_t *source) {
br->lowvalue = 0;
diff --git a/libvpx/vp9/encoder/vp9_dct.c b/libvpx/vp9/encoder/vp9_dct.c
index 4f4ad04..065992a 100644
--- a/libvpx/vp9/encoder/vp9_dct.c
+++ b/libvpx/vp9/encoder/vp9_dct.c
@@ -8,16 +8,19 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
#include <assert.h>
#include <math.h>
+
#include "./vpx_config.h"
-#include "vp9/common/vp9_systemdependent.h"
+#include "./vp9_rtcd.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_systemdependent.h"
-static void fdct4_1d(int16_t *input, int16_t *output) {
+#include "vp9/encoder/vp9_dct.h"
+
+static void fdct4(const int16_t *input, int16_t *output) {
int16_t step[4];
int temp1, temp2;
@@ -36,18 +39,17 @@ static void fdct4_1d(int16_t *input, int16_t *output) {
output[3] = dct_const_round_shift(temp2);
}
-void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
- const int stride = pitch >> 1;
int pass;
// We need an intermediate buffer between passes.
int16_t intermediate[4 * 4];
- int16_t *in = input;
+ const int16_t *in = input;
int16_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
@@ -58,10 +60,10 @@ void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) {
for (i = 0; i < 4; ++i) {
// Load inputs.
if (0 == pass) {
- input[0] = in[0 * stride] << 4;
- input[1] = in[1 * stride] << 4;
- input[2] = in[2 * stride] << 4;
- input[3] = in[3 * stride] << 4;
+ input[0] = in[0 * stride] * 16;
+ input[1] = in[1 * stride] * 16;
+ input[2] = in[2 * stride] * 16;
+ input[3] = in[3 * stride] * 16;
if (i == 0 && input[0]) {
input[0] += 1;
}
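
A pattern worth calling out once, since it repeats through all the transforms in this file: the scaling of residuals changes from left shifts (<< 4, << 2) to multiplications (* 16, * 4). The inputs are pixel differences and can be negative, and left-shifting a negative signed integer is undefined behavior in C (C99 6.5.7); the multiply expresses the same scaling legally, and compilers emit the same instruction for it.

    int16_t d = -7;        /* residuals can be negative */
    int scaled = d * 16;   /* defined; `d << 4` would be UB for negative d */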
@@ -102,7 +104,7 @@ void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) {
}
}
-static void fadst4_1d(int16_t *input, int16_t *output) {
+static void fadst4(const int16_t *input, int16_t *output) {
int x0, x1, x2, x3;
int s0, s1, s2, s3, s4, s5, s6, s7;
@@ -143,14 +145,14 @@ static void fadst4_1d(int16_t *input, int16_t *output) {
}
static const transform_2d FHT_4[] = {
- { fdct4_1d, fdct4_1d }, // DCT_DCT = 0
- { fadst4_1d, fdct4_1d }, // ADST_DCT = 1
- { fdct4_1d, fadst4_1d }, // DCT_ADST = 2
- { fadst4_1d, fadst4_1d } // ADST_ADST = 3
+ { fdct4, fdct4 }, // DCT_DCT = 0
+ { fadst4, fdct4 }, // ADST_DCT = 1
+ { fdct4, fadst4 }, // DCT_ADST = 2
+ { fadst4, fadst4 } // ADST_ADST = 3
};
-void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
- int pitch, TX_TYPE tx_type) {
+void vp9_short_fht4x4_c(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
int16_t out[4 * 4];
int16_t *outptr = &out[0];
int i, j;
@@ -160,7 +162,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
// Columns
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
- temp_in[j] = input[j * pitch + i] << 4;
+ temp_in[j] = input[j * stride + i] * 16;
if (i == 0 && temp_in[0])
temp_in[0] += 1;
ht.cols(temp_in, temp_out);
@@ -178,12 +180,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
}
}
-void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) {
- vp9_short_fdct4x4_c(input, output, pitch);
- vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
-}
-
-static void fdct8_1d(int16_t *input, int16_t *output) {
+static void fdct8(const int16_t *input, int16_t *output) {
/*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
/*needs32*/ int t0, t1, t2, t3;
/*canbe16*/ int x0, x1, x2, x3;
@@ -198,7 +195,7 @@ static void fdct8_1d(int16_t *input, int16_t *output) {
s6 = input[1] - input[6];
s7 = input[0] - input[7];
- // fdct4_1d(step, step);
+ // fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
@@ -235,8 +232,7 @@ static void fdct8_1d(int16_t *input, int16_t *output) {
output[7] = dct_const_round_shift(t3);
}
-void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) {
- const int stride = pitch >> 1;
+void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
int i, j;
int16_t intermediate[64];
@@ -250,16 +246,16 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) {
int i;
for (i = 0; i < 8; i++) {
// stage 1
- s0 = (input[0 * stride] + input[7 * stride]) << 2;
- s1 = (input[1 * stride] + input[6 * stride]) << 2;
- s2 = (input[2 * stride] + input[5 * stride]) << 2;
- s3 = (input[3 * stride] + input[4 * stride]) << 2;
- s4 = (input[3 * stride] - input[4 * stride]) << 2;
- s5 = (input[2 * stride] - input[5 * stride]) << 2;
- s6 = (input[1 * stride] - input[6 * stride]) << 2;
- s7 = (input[0 * stride] - input[7 * stride]) << 2;
-
- // fdct4_1d(step, step);
+ s0 = (input[0 * stride] + input[7 * stride]) * 4;
+ s1 = (input[1 * stride] + input[6 * stride]) * 4;
+ s2 = (input[2 * stride] + input[5 * stride]) * 4;
+ s3 = (input[3 * stride] + input[4 * stride]) * 4;
+ s4 = (input[3 * stride] - input[4 * stride]) * 4;
+ s5 = (input[2 * stride] - input[5 * stride]) * 4;
+ s6 = (input[1 * stride] - input[6 * stride]) * 4;
+ s7 = (input[0 * stride] - input[7 * stride]) * 4;
+
+ // fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
@@ -301,24 +297,23 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) {
// Rows
for (i = 0; i < 8; ++i) {
- fdct8_1d(&intermediate[i * 8], &final_output[i * 8]);
+ fdct8(&intermediate[i * 8], &final_output[i * 8]);
for (j = 0; j < 8; ++j)
final_output[j + i * 8] /= 2;
}
}
-void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
- const int stride = pitch >> 1;
int pass;
// We need an intermediate buffer between passes.
int16_t intermediate[256];
- int16_t *in = input;
+ const int16_t *in = input;
int16_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
@@ -331,23 +326,23 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
for (i = 0; i < 16; i++) {
if (0 == pass) {
// Calculate input for the first 8 results.
- input[0] = (in[0 * stride] + in[15 * stride]) << 2;
- input[1] = (in[1 * stride] + in[14 * stride]) << 2;
- input[2] = (in[2 * stride] + in[13 * stride]) << 2;
- input[3] = (in[3 * stride] + in[12 * stride]) << 2;
- input[4] = (in[4 * stride] + in[11 * stride]) << 2;
- input[5] = (in[5 * stride] + in[10 * stride]) << 2;
- input[6] = (in[6 * stride] + in[ 9 * stride]) << 2;
- input[7] = (in[7 * stride] + in[ 8 * stride]) << 2;
+ input[0] = (in[0 * stride] + in[15 * stride]) * 4;
+ input[1] = (in[1 * stride] + in[14 * stride]) * 4;
+ input[2] = (in[2 * stride] + in[13 * stride]) * 4;
+ input[3] = (in[3 * stride] + in[12 * stride]) * 4;
+ input[4] = (in[4 * stride] + in[11 * stride]) * 4;
+ input[5] = (in[5 * stride] + in[10 * stride]) * 4;
+ input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
+ input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
// Calculate input for the next 8 results.
- step1[0] = (in[7 * stride] - in[ 8 * stride]) << 2;
- step1[1] = (in[6 * stride] - in[ 9 * stride]) << 2;
- step1[2] = (in[5 * stride] - in[10 * stride]) << 2;
- step1[3] = (in[4 * stride] - in[11 * stride]) << 2;
- step1[4] = (in[3 * stride] - in[12 * stride]) << 2;
- step1[5] = (in[2 * stride] - in[13 * stride]) << 2;
- step1[6] = (in[1 * stride] - in[14 * stride]) << 2;
- step1[7] = (in[0 * stride] - in[15 * stride]) << 2;
+ step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
+ step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
+ step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
+ step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
+ step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
+ step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
+ step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
+ step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
} else {
// Calculate input for the first 8 results.
input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
@@ -368,7 +363,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
}
- // Work on the first eight values; fdct8_1d(input, even_results);
+ // Work on the first eight values; fdct8(input, even_results);
{
/*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
/*needs32*/ int t0, t1, t2, t3;
@@ -384,7 +379,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
s6 = input[1] - input[6];
s7 = input[0] - input[7];
- // fdct4_1d(step, step);
+ // fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
@@ -486,7 +481,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
}
}
-static void fadst8_1d(int16_t *input, int16_t *output) {
+static void fadst8(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
int x0 = input[7];
@@ -558,14 +553,14 @@ static void fadst8_1d(int16_t *input, int16_t *output) {
}
static const transform_2d FHT_8[] = {
- { fdct8_1d, fdct8_1d }, // DCT_DCT = 0
- { fadst8_1d, fdct8_1d }, // ADST_DCT = 1
- { fdct8_1d, fadst8_1d }, // DCT_ADST = 2
- { fadst8_1d, fadst8_1d } // ADST_ADST = 3
+ { fdct8, fdct8 }, // DCT_DCT = 0
+ { fadst8, fdct8 }, // ADST_DCT = 1
+ { fdct8, fadst8 }, // DCT_ADST = 2
+ { fadst8, fadst8 } // ADST_ADST = 3
};
-void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
- int pitch, TX_TYPE tx_type) {
+void vp9_short_fht8x8_c(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
int16_t out[64];
int16_t *outptr = &out[0];
int i, j;
@@ -575,7 +570,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
// Columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
- temp_in[j] = input[j * pitch + i] << 2;
+ temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
outptr[j * 8 + i] = temp_out[j];
@@ -593,18 +588,17 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
pixel. */
-void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
+void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
int i;
int a1, b1, c1, d1, e1;
- short *ip = input;
- short *op = output;
- int pitch_short = pitch >> 1;
+ const int16_t *ip = input;
+ int16_t *op = output;
for (i = 0; i < 4; i++) {
- a1 = ip[0 * pitch_short];
- b1 = ip[1 * pitch_short];
- c1 = ip[2 * pitch_short];
- d1 = ip[3 * pitch_short];
+ a1 = ip[0 * stride];
+ b1 = ip[1 * stride];
+ c1 = ip[2 * stride];
+ d1 = ip[3 * stride];
a1 += b1;
d1 = d1 - c1;
@@ -637,24 +631,18 @@ void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
c1 = e1 - c1;
a1 -= c1;
d1 += b1;
- op[0] = a1 << WHT_UPSCALE_FACTOR;
- op[1] = c1 << WHT_UPSCALE_FACTOR;
- op[2] = d1 << WHT_UPSCALE_FACTOR;
- op[3] = b1 << WHT_UPSCALE_FACTOR;
+ op[0] = a1 * UNIT_QUANT_FACTOR;
+ op[1] = c1 * UNIT_QUANT_FACTOR;
+ op[2] = d1 * UNIT_QUANT_FACTOR;
+ op[3] = b1 * UNIT_QUANT_FACTOR;
ip += 4;
op += 4;
}
}
-void vp9_short_walsh8x4_c(short *input, short *output, int pitch) {
- vp9_short_walsh4x4_c(input, output, pitch);
- vp9_short_walsh4x4_c(input + 4, output + 16, pitch);
-}
-
-
// Rewritten to use the same algorithm as the others.
-static void fdct16_1d(int16_t in[16], int16_t out[16]) {
+static void fdct16(const int16_t in[16], int16_t out[16]) {
/*canbe16*/ int step1[8];
/*canbe16*/ int step2[8];
/*canbe16*/ int step3[8];
@@ -680,7 +668,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) {
step1[6] = in[1] - in[14];
step1[7] = in[0] - in[15];
- // fdct8_1d(step, step);
+ // fdct8(step, step);
{
/*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
/*needs32*/ int t0, t1, t2, t3;
@@ -696,7 +684,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) {
s6 = input[1] - input[6];
s7 = input[0] - input[7];
- // fdct4_1d(step, step);
+ // fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
@@ -795,7 +783,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) {
out[15] = dct_const_round_shift(temp2);
}
-void fadst16_1d(int16_t *input, int16_t *output) {
+static void fadst16(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
int x0 = input[15];
@@ -958,14 +946,14 @@ void fadst16_1d(int16_t *input, int16_t *output) {
}
static const transform_2d FHT_16[] = {
- { fdct16_1d, fdct16_1d }, // DCT_DCT = 0
- { fadst16_1d, fdct16_1d }, // ADST_DCT = 1
- { fdct16_1d, fadst16_1d }, // DCT_ADST = 2
- { fadst16_1d, fadst16_1d } // ADST_ADST = 3
+ { fdct16, fdct16 }, // DCT_DCT = 0
+ { fadst16, fdct16 }, // ADST_DCT = 1
+ { fdct16, fadst16 }, // DCT_ADST = 2
+ { fadst16, fadst16 } // ADST_ADST = 3
};
-void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
- int pitch, TX_TYPE tx_type) {
+void vp9_short_fht16x16_c(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
int16_t out[256];
int16_t *outptr = &out[0];
int i, j;
@@ -975,7 +963,7 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
// Columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
- temp_in[j] = input[j * pitch + i] << 2;
+ temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
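
The store above folds a divide-by-4 with rounding into the shift: assuming arithmetic right shift of negatives (which libvpx relies on throughout), (x + 1 + (x < 0)) >> 2 rounds x/4 half toward zero, while the 32x32 path below uses (x > 0) and rounds half away from zero. Worked values:

    /* (x + 1 + (x < 0)) >> 2  ==  round-half-toward-zero of x/4:
     *   x =  6: ( 6+1+0) >> 2 =  1    ( 1.5  ->  1)
     *   x = -6: (-6+1+1) >> 2 = -1    (-1.5  -> -1)
     *   x =  7: ( 7+1+0) >> 2 =  2    ( 1.75 ->  2)
     *   x = -7: (-7+1+1) >> 2 = -2    (-1.75 -> -2)
     */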
@@ -1003,7 +991,7 @@ static INLINE int half_round_shift(int input) {
return rv;
}
-static void dct32_1d(int *input, int *output, int round) {
+static void dct32_1d(const int *input, int *output, int round) {
int step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
@@ -1326,8 +1314,7 @@ static void dct32_1d(int *input, int *output, int round) {
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}
-void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
- int shortpitch = pitch >> 1;
+void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
int i, j;
int output[32 * 32];
@@ -1335,7 +1322,7 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
for (i = 0; i < 32; ++i) {
int temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
- temp_in[j] = input[j * shortpitch + i] << 2;
+ temp_in[j] = input[j * stride + i] * 4;
dct32_1d(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
@@ -1355,8 +1342,7 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
// Note that although we use dct_32_round in the dct32_1d computation flow,
// this 2D fdct32x32, used in the rate-distortion optimization loop, operates
// within 16-bit precision.
-void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
- int shortpitch = pitch >> 1;
+void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
int i, j;
int output[32 * 32];
@@ -1364,7 +1350,7 @@ void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
for (i = 0; i < 32; ++i) {
int temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
- temp_in[j] = input[j * shortpitch + i] << 2;
+ temp_in[j] = input[j * stride + i] * 4;
dct32_1d(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
// TODO(cd): see quality impact of only doing
@@ -1383,3 +1369,27 @@ void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
out[j + i * 32] = temp_out[j];
}
}
+
+void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride) {
+ if (tx_type == DCT_DCT)
+ vp9_fdct4x4(input, output, stride);
+ else
+ vp9_short_fht4x4(input, output, stride, tx_type);
+}
+
+void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride) {
+ if (tx_type == DCT_DCT)
+ vp9_fdct8x8(input, output, stride);
+ else
+ vp9_short_fht8x8(input, output, stride, tx_type);
+}
+
+void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride) {
+ if (tx_type == DCT_DCT)
+ vp9_fdct16x16(input, output, stride);
+ else
+ vp9_short_fht16x16(input, output, stride, tx_type);
+}
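
The new vp9_fht4x4/vp9_fht8x8/vp9_fht16x16 wrappers let callers select a transform by TX_TYPE alone: DCT_DCT takes the plain fDCT path and every other type the hybrid path. A hedged usage sketch; the buffers stand in for the encoder's real ones:

    int16_t diff[8 * 8];    /* residual block, row stride 8 */
    int16_t coeff[8 * 8];   /* transform output */
    /* ... fill diff ... */
    vp9_fht8x8(ADST_DCT, diff, coeff, 8);  /* non-DCT type -> hybrid path */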
diff --git a/libvpx/vp9/encoder/vp9_dct.h b/libvpx/vp9/encoder/vp9_dct.h
new file mode 100644
index 0000000..aaf976d
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_dct.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_DCT_H_
+#define VP9_ENCODER_VP9_DCT_H_
+
+void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride);
+
+void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride);
+
+void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride);
+
+#endif // VP9_ENCODER_VP9_DCT_H_
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c
index 44ab02d..a45299b 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -22,6 +22,7 @@
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_extend.h"
#include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_quant_common.h"
@@ -29,7 +30,6 @@
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_tile_common.h"
-
#include "vp9/encoder/vp9_encodeframe.h"
#include "vp9/encoder/vp9_encodeintra.h"
#include "vp9/encoder/vp9_encodemb.h"
@@ -37,24 +37,44 @@
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/common/vp9_systemdependent.h"
#include "vp9/encoder/vp9_tokenize.h"
+#include "vp9/encoder/vp9_vaq.h"
-#define DBG_PRNT_SEGMAP 0
+#define DBG_PRNT_SEGMAP 0
-static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
- TX_4X4, // ONLY_4X4
- TX_8X8, // ONLY_8X8
- TX_16X16, // ONLY_16X16
- TX_32X32, // ONLY_32X32
- TX_32X32, // TX_MODE_SELECT
-};
// #define ENC_DEBUG
#ifdef ENC_DEBUG
int enc_debug = 0;
#endif
+static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
+ switch (subsize) {
+ case BLOCK_64X64:
+ case BLOCK_64X32:
+ case BLOCK_32X64:
+ case BLOCK_32X32:
+ return &xd->sb_index;
+ case BLOCK_32X16:
+ case BLOCK_16X32:
+ case BLOCK_16X16:
+ return &xd->mb_index;
+ case BLOCK_16X8:
+ case BLOCK_8X16:
+ case BLOCK_8X8:
+ return &xd->b_index;
+ case BLOCK_8X4:
+ case BLOCK_4X8:
+ case BLOCK_4X4:
+ return &xd->ab_index;
+ default:
+ assert(0);
+ return NULL;
+ }
+}
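
get_sb_index() moves into the encoder, since only the partition search needs it, and returns a pointer so the recursion can set the child index before descending, exactly as the old write_modes_sb() did before this patch:

    /* Sketch of the caller pattern: tag each quadrant before recursing. */
    for (n = 0; n < 4; ++n) {
      *get_sb_index(xd, subsize) = n;
      /* ... recurse into quadrant n at block size `subsize` ... */
    }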
+
static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
int mi_row, int mi_col, BLOCK_SIZE bsize);
@@ -122,7 +142,6 @@ static unsigned int tt_activity_measure(MACROBLOCK *x) {
static unsigned int alt_activity_measure(MACROBLOCK *x, int use_dc_pred) {
return vp9_encode_intra(x, use_dc_pred);
}
-DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = {0};
// Measure the activity of the current macroblock.
// What we measure here is TBD, so it is abstracted into this function.
@@ -173,8 +192,9 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
tmp = sortlist[j - 1];
sortlist[j - 1] = sortlist[j];
sortlist[j] = tmp;
- } else
- break;
+ } else {
+ break;
+ }
}
}
@@ -246,13 +266,11 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
#if OUTPUT_NORM_ACT_STATS
fprintf(f, "\n");
#endif
-
}
#if OUTPUT_NORM_ACT_STATS
fclose(f);
#endif
-
}
#endif // USE_ACT_INDEX
@@ -264,7 +282,7 @@ static void build_activity_map(VP9_COMP *cpi) {
VP9_COMMON * const cm = &cpi->common;
#if ALT_ACT_MEASURE
- YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+ YV12_BUFFER_CONFIG *new_yv12 = get_frame_new_buffer(cm);
int recon_yoffset;
int recon_y_stride = new_yv12->y_stride;
#endif
@@ -317,7 +335,6 @@ static void build_activity_map(VP9_COMP *cpi) {
// Calculate an activity index number of each mb
calc_activity_index(cpi, x);
#endif
-
}
// Macroblock activity masking
@@ -351,8 +368,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
MODE_INFO *mi = &ctx->mic;
- MB_MODE_INFO * const mbmi = &xd->this_mi->mbmi;
- MODE_INFO *mi_addr = xd->this_mi;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+ MODE_INFO *mi_addr = xd->mi_8x8[0];
int mb_mode_index = ctx->best_mode_index;
const int mis = cm->mode_info_stride;
@@ -360,7 +377,6 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
const int mi_height = num_8x8_blocks_high_lookup[bsize];
assert(mi->mbmi.mode < MB_MODE_COUNT);
- assert(mb_mode_index < MAX_MODES);
assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES);
assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES);
assert(mi->mbmi.sb_type == bsize);
@@ -375,6 +391,10 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
&& (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y)
xd->mi_8x8[x_idx + y * mis] = mi_addr;
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_mb_init_quantizer(cpi, x);
+ }
+
// FIXME(rbultje) I'm pretty sure this should go to the end of this block
// (i.e. after the output_enabled)
if (bsize < BLOCK_32X32) {
@@ -384,12 +404,14 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
}
if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
- *x->partition_info = ctx->partition_info;
mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
}
x->skip = ctx->skip;
+ vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+
if (!output_enabled)
return;
@@ -398,15 +420,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i];
}
- if (cm->frame_type == KEY_FRAME) {
- // Restore the coding modes to that held in the coding context
- // if (mb_mode == I4X4_PRED)
- // for (i = 0; i < 16; i++)
- // {
- // xd->block[i].bmi.as_mode =
- // xd->mode_info_context->bmi[i].as_mode;
- // assert(xd->mode_info_context->bmi[i].as_mode < MB_MODE_COUNT);
- // }
+ if (frame_is_intra_only(cm)) {
#if CONFIG_INTERNAL_STATS
static const int kf_mode_index[] = {
THR_DC /*DC_PRED*/,
@@ -419,7 +433,6 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
THR_D207_PRED /*D207_PRED*/,
THR_D63_PRED /*D63_PRED*/,
THR_TM /*TM_PRED*/,
- THR_B_PRED /*I4X4_PRED*/,
};
cpi->mode_chosen_counts[kf_mode_index[mi->mbmi.mode]]++;
#endif
@@ -428,18 +441,19 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
cpi->mode_chosen_counts[mb_mode_index]++;
if (is_inter_block(mbmi)
&& (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) {
- int_mv best_mv, best_second_mv;
+ int_mv best_mv[2];
const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0];
const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
- best_mv.as_int = ctx->best_ref_mv.as_int;
- best_second_mv.as_int = ctx->second_best_ref_mv.as_int;
+ best_mv[0].as_int = ctx->best_ref_mv.as_int;
+ best_mv[1].as_int = ctx->second_best_ref_mv.as_int;
if (mbmi->mode == NEWMV) {
- best_mv.as_int = mbmi->ref_mvs[rf1][0].as_int;
- best_second_mv.as_int = mbmi->ref_mvs[rf2][0].as_int;
+ best_mv[0].as_int = mbmi->ref_mvs[rf1][0].as_int;
+ if (rf2 > 0)
+ best_mv[1].as_int = mbmi->ref_mvs[rf2][0].as_int;
}
- mbmi->best_mv.as_int = best_mv.as_int;
- mbmi->best_second_mv.as_int = best_second_mv.as_int;
- vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv);
+ mbmi->best_mv[0].as_int = best_mv[0].as_int;
+ mbmi->best_mv[1].as_int = best_mv[1].as_int;
+ vp9_update_mv_count(cpi, x, best_mv);
}
if (cm->mcomp_filter_type == SWITCHABLE && is_inter_mode(mbmi->mode)) {
@@ -451,28 +465,27 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
}
}
void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
- int mb_row, int mb_col) {
- uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, src
- ->alpha_buffer};
- int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, src
- ->alpha_stride};
+ int mi_row, int mi_col) {
+ uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+ src->alpha_buffer};
+ const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+ src->alpha_stride};
int i;
- for (i = 0; i < MAX_MB_PLANE; i++) {
- setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mb_row, mb_col,
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mi_row, mi_col,
NULL, x->e_mbd.plane[i].subsampling_x,
x->e_mbd.plane[i].subsampling_y);
- }
}
-static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
+static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
MACROBLOCK *const x = &cpi->mb;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -486,16 +499,12 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
const int idx_map = mb_row * cm->mb_cols + mb_col;
const struct segmentation *const seg = &cm->seg;
- set_skip_context(cm, xd, mi_row, mi_col);
- set_partition_seg_context(cm, xd, mi_row, mi_col);
+ set_skip_context(xd, cpi->above_context, cpi->left_context, mi_row, mi_col);
// Activity map pointer
x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
x->active_ptr = cpi->active_map + idx_map;
- /* pointers to mode info contexts */
- x->partition_info = x->pi + idx_str;
-
xd->mi_8x8 = cm->mi_grid_visible + idx_str;
xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
@@ -503,10 +512,9 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
// cannot be used.
xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL;
- xd->this_mi =
xd->mi_8x8[0] = cm->mi + idx_str;
- mbmi = &xd->this_mi->mbmi;
+ mbmi = &xd->mi_8x8[0]->mbmi;
// Set up destination pointers
setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col);
@@ -520,7 +528,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
// Set up distance of MB to edge of frame in 1/8th pel units
assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
- set_mi_row_col(cm, xd, mi_row, mi_height, mi_col, mi_width);
+ set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+ cm->mi_rows, cm->mi_cols);
/* set up source buffers */
vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
@@ -531,10 +540,11 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
/* segment ID */
if (seg->enabled) {
- uint8_t *map = seg->update_map ? cpi->segmentation_map
- : cm->last_frame_seg_map;
- mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
-
+ if (!cpi->sf.variance_adaptive_quantization) {
+ uint8_t *map = seg->update_map ? cpi->segmentation_map
+ : cm->last_frame_seg_map;
+ mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+ }
vp9_mb_init_quantizer(cpi, x);
if (seg->enabled && cpi->seg0_cnt > 0
@@ -546,9 +556,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
const int x = mb_col & ~3;
const int p16 = ((mb_row & 1) << 1) + (mb_col & 1);
const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
- const int tile_progress = cm->cur_tile_mi_col_start * cm->mb_rows >> 1;
- const int mb_cols = (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start)
- >> 1;
+ const int tile_progress = tile->mi_col_start * cm->mb_rows >> 1;
+ const int mb_cols = (tile->mi_col_end - tile->mi_col_start) >> 1;
cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress)
<< 16) / cm->MBs;
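seg0_progress is a Q16 fixed-point fraction (1 << 16 represents 100%), now derived from the tile's own column bounds rather than fields on VP9_COMMON. A toy version of the fixed-point step; the helper name is made up, and a nonzero denominator is assumed since cm->MBs is positive for any coded frame:

    /* Q16 fixed point: 1 << 16 represents 100% of the frame's MBs. */
    static int q16_progress(int mbs_done, int total_mbs) {
      return (mbs_done << 16) / total_mbs;  /* total_mbs > 0 for a coded frame */
    }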
@@ -561,13 +570,19 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
}
}
-static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
+static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
+ int mi_row, int mi_col,
int *totalrate, int64_t *totaldist,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
+ int orig_rdmult = x->rdmult;
+ double rdmult_ratio;
+
+ vp9_clear_system_state(); // __asm emms;
+ rdmult_ratio = 1.0; // avoid uninitialized warnings
// Use the lower precision, but faster, 32x32 fdct for mode selection.
x->use_lp32x32fdct = 1;
@@ -582,35 +597,66 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
}
}
- set_offsets(cpi, mi_row, mi_col, bsize);
- xd->this_mi->mbmi.sb_type = bsize;
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ xd->mi_8x8[0]->mbmi.sb_type = bsize;
// Set to zero to make sure we do not use the previous encoded frame stats
- xd->this_mi->mbmi.skip_coeff = 0;
+ xd->mi_8x8[0]->mbmi.skip_coeff = 0;
x->source_variance = get_sby_perpixel_variance(cpi, x, bsize);
+ if (cpi->sf.variance_adaptive_quantization) {
+ int energy;
+ if (bsize <= BLOCK_16X16) {
+ energy = x->mb_energy;
+ } else {
+ energy = vp9_block_energy(cpi, x, bsize);
+ }
+
+ xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy);
+ rdmult_ratio = vp9_vaq_rdmult_ratio(energy);
+ vp9_mb_init_quantizer(cpi, x);
+ }
+
if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
vp9_activity_masking(cpi, x);
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ x->rdmult = round(x->rdmult * rdmult_ratio);
+ }
+
// Find best coding mode & reconstruct the MB so it is available
// as a predictor for MBs that follow in the SB
- if (cm->frame_type == KEY_FRAME)
+ if (frame_is_intra_only(cm)) {
vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx,
best_rd);
- else
- vp9_rd_pick_inter_mode_sb(cpi, x, mi_row, mi_col, totalrate, totaldist,
- bsize, ctx, best_rd);
+ } else {
+ if (bsize >= BLOCK_8X8)
+ vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col,
+ totalrate, totaldist, bsize, ctx, best_rd);
+ else
+ vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, totalrate,
+ totaldist, bsize, ctx, best_rd);
+ }
+
+ if (cpi->sf.variance_adaptive_quantization) {
+ x->rdmult = orig_rdmult;
+ if (*totalrate != INT_MAX) {
+ vp9_clear_system_state(); // __asm emms;
+ *totalrate = round(*totalrate * rdmult_ratio);
+ }
+ }
}
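The variance-adaptive path scales the RD multiplier by a per-block ratio before the mode search and rescales the returned rate afterwards so differently scaled blocks stay comparable; vp9_clear_system_state() clears x87/MMX state before the floating-point math. A sketch of that scale/search/unscale pairing, with a hypothetical search callback:

    #include <limits.h>
    #include <math.h>

    /* Scale rdmult for the search, restore it afterwards, and rescale the
     * returned rate so blocks coded with different ratios remain
     * comparable. The search callback is hypothetical. */
    static void vaq_rd_search(int *rdmult, int *rate, double ratio,
                              void (*search)(int rdmult, int *rate)) {
      const int orig_rdmult = *rdmult;
      *rdmult = (int)round(*rdmult * ratio);
      search(*rdmult, rate);
      *rdmult = orig_rdmult;
      if (*rate != INT_MAX)  /* INT_MAX marks "no valid mode found" */
        *rate = (int)round(*rate * ratio);
    }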
static void update_stats(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *mi = xd->this_mi;
+ MODE_INFO *mi = xd->mi_8x8[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
- if (cm->frame_type != KEY_FRAME) {
+ if (!frame_is_intra_only(cm)) {
const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_REF_FRAME);
@@ -637,49 +683,6 @@ static void update_stats(VP9_COMP *cpi) {
[mbmi->ref_frame[0] != GOLDEN_FRAME]++;
}
}
-
- // Count of last ref frame 0,0 usage
- if (mbmi->mode == ZEROMV && mbmi->ref_frame[0] == LAST_FRAME)
- cpi->inter_zz_count++;
- }
-}
-
-// TODO(jingning): the variables used here are little complicated. need further
-// refactoring on organizing the temporary buffers, when recursive
-// partition down to 4x4 block size is enabled.
-static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
- MACROBLOCKD *const xd = &x->e_mbd;
-
- switch (bsize) {
- case BLOCK_64X64:
- return &x->sb64_context;
- case BLOCK_64X32:
- return &x->sb64x32_context[xd->sb_index];
- case BLOCK_32X64:
- return &x->sb32x64_context[xd->sb_index];
- case BLOCK_32X32:
- return &x->sb32_context[xd->sb_index];
- case BLOCK_32X16:
- return &x->sb32x16_context[xd->sb_index][xd->mb_index];
- case BLOCK_16X32:
- return &x->sb16x32_context[xd->sb_index][xd->mb_index];
- case BLOCK_16X16:
- return &x->mb_context[xd->sb_index][xd->mb_index];
- case BLOCK_16X8:
- return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
- case BLOCK_8X16:
- return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
- case BLOCK_8X8:
- return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
- case BLOCK_8X4:
- return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
- case BLOCK_4X8:
- return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
- case BLOCK_4X4:
- return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
- default:
- assert(0);
- return NULL ;
}
}
@@ -696,7 +699,7 @@ static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) {
return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index];
default:
assert(0);
- return NULL ;
+ return NULL;
}
}
@@ -705,7 +708,6 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
BLOCK_SIZE bsize) {
- VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
int p;
@@ -715,28 +717,27 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
int mi_height = num_8x8_blocks_high_lookup[bsize];
for (p = 0; p < MAX_MB_PLANE; p++) {
vpx_memcpy(
- cm->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
+ cpi->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
a + num_4x4_blocks_wide * p,
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
xd->plane[p].subsampling_x);
vpx_memcpy(
- cm->left_context[p]
+ cpi->left_context[p]
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
l + num_4x4_blocks_high * p,
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
xd->plane[p].subsampling_y);
}
- vpx_memcpy(cm->above_seg_context + mi_col, sa,
- sizeof(PARTITION_CONTEXT) * mi_width);
- vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
- sizeof(PARTITION_CONTEXT) * mi_height);
+ vpx_memcpy(cpi->above_seg_context + mi_col, sa,
+ sizeof(*cpi->above_seg_context) * mi_width);
+ vpx_memcpy(cpi->left_seg_context + (mi_row & MI_MASK), sl,
+ sizeof(cpi->left_seg_context[0]) * mi_height);
}
static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
BLOCK_SIZE bsize) {
- const VP9_COMMON *const cm = &cpi->common;
const MACROBLOCK *const x = &cpi->mb;
const MACROBLOCKD *const xd = &x->e_mbd;
int p;
@@ -749,23 +750,24 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
for (p = 0; p < MAX_MB_PLANE; ++p) {
vpx_memcpy(
a + num_4x4_blocks_wide * p,
- cm->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
+ cpi->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
xd->plane[p].subsampling_x);
vpx_memcpy(
l + num_4x4_blocks_high * p,
- cm->left_context[p]
+ cpi->left_context[p]
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
xd->plane[p].subsampling_y);
}
- vpx_memcpy(sa, cm->above_seg_context + mi_col,
- sizeof(PARTITION_CONTEXT) * mi_width);
- vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
- sizeof(PARTITION_CONTEXT) * mi_height);
+ vpx_memcpy(sa, cpi->above_seg_context + mi_col,
+ sizeof(*cpi->above_seg_context) * mi_width);
+ vpx_memcpy(sl, cpi->left_seg_context + (mi_row & MI_MASK),
+ sizeof(cpi->left_seg_context[0]) * mi_height);
}
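save_context()/restore_context() snapshot the above/left entropy and partition contexts so each RD trial starts from identical neighbor state; note the copies are now sized from the destination's element type rather than a named type. A sketch of that sizeof idiom, with PARTITION_CONTEXT as a stand-in typedef:

    #include <string.h>

    typedef char PARTITION_CONTEXT;  /* stand-in for the encoder's typedef */

    /* sizeof(*above) is derived from the element, so the byte count stays
     * right even if PARTITION_CONTEXT is later widened. */
    static void save_seg_ctx(PARTITION_CONTEXT *sa,
                             const PARTITION_CONTEXT *above,
                             int mi_col, int mi_width) {
      memcpy(sa, above + mi_col, sizeof(*above) * mi_width);
    }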
-static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
+static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize, int sub_index) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
@@ -783,7 +785,7 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
if (xd->ab_index > 0)
return;
}
- set_offsets(cpi, mi_row, mi_col, bsize);
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
update_state(cpi, get_block_context(x, bsize), bsize, output_enabled);
encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
@@ -795,7 +797,8 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
}
}
-static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
+static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
@@ -812,8 +815,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
c1 = BLOCK_4X4;
if (bsize >= BLOCK_8X8) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, bsize);
c1 = *(get_sb_partitioning(x, bsize));
}
partition = partition_lookup[bsl][c1];
@@ -822,19 +825,19 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
case PARTITION_NONE:
if (output_enabled && bsize >= BLOCK_8X8)
cpi->partition_count[pl][PARTITION_NONE]++;
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
+ encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, -1);
break;
case PARTITION_VERT:
if (output_enabled)
cpi->partition_count[pl][PARTITION_VERT]++;
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
- encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
+ encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0);
+ encode_b(cpi, tile, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
break;
case PARTITION_HORZ:
if (output_enabled)
cpi->partition_count[pl][PARTITION_HORZ]++;
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
- encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
+ encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0);
+ encode_b(cpi, tile, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
break;
case PARTITION_SPLIT:
subsize = get_subsize(bsize, PARTITION_SPLIT);
@@ -846,7 +849,7 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
const int x_idx = i & 1, y_idx = i >> 1;
*get_sb_index(xd, subsize) = i;
- encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
+ encode_sb(cpi, tile, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
output_enabled, subsize);
}
break;
@@ -855,10 +858,9 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
break;
}
- if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- update_partition_context(xd, c1, bsize);
- }
+ if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+ update_partition_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, c1, bsize);
}
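The partition context is recorded once per coded block unless the block was split, in which case the four recursive encode_sb() calls have already recorded the children; BLOCK_8X8 is the base case because its sub-8x8 partitions are not recursed. A toy model of that condition:

    enum { PARTITION_NONE, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT };

    /* Children of a split are recorded by the recursive calls, so only
     * non-split blocks (and the 8x8 base case) update the context here. */
    static int needs_context_update(int partition, int is_8x8_base_case) {
      return partition != PARTITION_SPLIT || is_8x8_base_case;
    }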
// Check to see if the given partition size is allowed for a specified number
@@ -886,13 +888,13 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
// However, at the bottom and right borders of the image the requested size
// may not be allowed in which case this code attempts to choose the largest
// allowable partition.
-static void set_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
- int mi_row, int mi_col) {
+static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8, int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
BLOCK_SIZE bsize = cpi->sf.always_this_block_size;
const int mis = cm->mode_info_stride;
- int row8x8_remaining = cm->cur_tile_mi_row_end - mi_row;
- int col8x8_remaining = cm->cur_tile_mi_col_end - mi_col;
+ int row8x8_remaining = tile->mi_row_end - mi_row;
+ int col8x8_remaining = tile->mi_col_end - mi_col;
int block_row, block_col;
MODE_INFO * mi_upper_left = cm->mi + mi_row * mis + mi_col;
int bh = num_8x8_blocks_high_lookup[bsize];
@@ -936,7 +938,7 @@ static void copy_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
for (block_col = 0; block_col < 8; ++block_col) {
MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col];
BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
- int offset;
+ ptrdiff_t offset;
if (prev_mi) {
offset = prev_mi - cm->prev_mi;
@@ -947,324 +949,29 @@ static void copy_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
}
}
-static void set_block_size(VP9_COMMON * const cm, MODE_INFO **mi_8x8,
- BLOCK_SIZE bsize, int mis, int mi_row,
- int mi_col) {
- int r, c;
- const int bs = MAX(num_8x8_blocks_wide_lookup[bsize],
- num_8x8_blocks_high_lookup[bsize]);
- const int idx_str = mis * mi_row + mi_col;
- MODE_INFO **const mi2 = &mi_8x8[idx_str];
-
- mi2[0] = cm->mi + idx_str;
- mi2[0]->mbmi.sb_type = bsize;
-
- for (r = 0; r < bs; r++)
- for (c = 0; c < bs; c++)
- if (mi_row + r < cm->mi_rows && mi_col + c < cm->mi_cols)
- mi2[r * mis + c] = mi2[0];
-}
-
-typedef struct {
- int64_t sum_square_error;
- int64_t sum_error;
- int count;
- int variance;
-} var;
-
-typedef struct {
- var none;
- var horz[2];
- var vert[2];
-} partition_variance;
-
-#define VT(TYPE, BLOCKSIZE) \
- typedef struct { \
- partition_variance vt; \
- BLOCKSIZE split[4]; } TYPE;
-
-VT(v8x8, var)
-VT(v16x16, v8x8)
-VT(v32x32, v16x16)
-VT(v64x64, v32x32)
-
-typedef struct {
- partition_variance *vt;
- var *split[4];
-} vt_node;
-
-typedef enum {
- V16X16,
- V32X32,
- V64X64,
-} TREE_LEVEL;
-
-static void tree_to_node(void *data, BLOCK_SIZE bsize, vt_node *node) {
- int i;
- switch (bsize) {
- case BLOCK_64X64: {
- v64x64 *vt = (v64x64 *) data;
- node->vt = &vt->vt;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].vt.none;
- break;
- }
- case BLOCK_32X32: {
- v32x32 *vt = (v32x32 *) data;
- node->vt = &vt->vt;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].vt.none;
- break;
- }
- case BLOCK_16X16: {
- v16x16 *vt = (v16x16 *) data;
- node->vt = &vt->vt;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].vt.none;
- break;
- }
- case BLOCK_8X8: {
- v8x8 *vt = (v8x8 *) data;
- node->vt = &vt->vt;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i];
- break;
- }
- default:
- node->vt = 0;
- for (i = 0; i < 4; i++)
- node->split[i] = 0;
- assert(-1);
- }
-}
-
-// Set variance values given sum square error, sum error, count.
-static void fill_variance(var *v, int64_t s2, int64_t s, int c) {
- v->sum_square_error = s2;
- v->sum_error = s;
- v->count = c;
- if (c > 0)
- v->variance = 256
- * (v->sum_square_error - v->sum_error * v->sum_error / v->count)
- / v->count;
- else
- v->variance = 0;
-}
-
-// Combine 2 variance structures by summing the sum_error, sum_square_error,
-// and counts and then calculating the new variance.
-void sum_2_variances(var *r, var *a, var*b) {
- fill_variance(r, a->sum_square_error + b->sum_square_error,
- a->sum_error + b->sum_error, a->count + b->count);
-}
-
-static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
- vt_node node;
- tree_to_node(data, bsize, &node);
- sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]);
- sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]);
- sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]);
- sum_2_variances(&node.vt->vert[1], node.split[1], node.split[3]);
- sum_2_variances(&node.vt->none, &node.vt->vert[0], &node.vt->vert[1]);
-}
-
-#if PERFORM_RANDOM_PARTITIONING
-static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
- BLOCK_SIZE block_size, int mi_row,
- int mi_col, int mi_size) {
- VP9_COMMON * const cm = &cpi->common;
- vt_node vt;
- const int mis = cm->mode_info_stride;
- int64_t threshold = 4 * cpi->common.base_qindex * cpi->common.base_qindex;
-
- tree_to_node(data, block_size, &vt);
-
- // split none is available only if we have more than half a block size
- // in width and height inside the visible image
- if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows &&
- (rand() & 3) < 1) {
- set_block_size(cm, m, block_size, mis, mi_row, mi_col);
- return 1;
- }
-
- // vertical split is available on all but the bottom border
- if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
- && (rand() & 3) < 1) {
- set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row,
- mi_col);
- return 1;
- }
-
- // horizontal split is available on all but the right border
- if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
- && (rand() & 3) < 1) {
- set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row,
- mi_col);
- return 1;
- }
-
- return 0;
-}
-
-#else // !PERFORM_RANDOM_PARTITIONING
-
-static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO **m,
- BLOCK_SIZE bsize, int mi_row,
- int mi_col, int mi_size) {
- VP9_COMMON * const cm = &cpi->common;
- vt_node vt;
- const int mis = cm->mode_info_stride;
- int64_t threshold = 50 * cpi->common.base_qindex;
-
- tree_to_node(data, bsize, &vt);
-
- // split none is available only if we have more than half a block size
- // in width and height inside the visible image
- if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows
- && vt.vt->none.variance < threshold) {
- set_block_size(cm, m, bsize, mis, mi_row, mi_col);
- return 1;
- }
-
- // vertical split is available on all but the bottom border
- if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
- && vt.vt->vert[1].variance < threshold) {
- set_block_size(cm, m, get_subsize(bsize, PARTITION_VERT), mis, mi_row,
- mi_col);
- return 1;
- }
-
- // horizontal split is available on all but the right border
- if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
- && vt.vt->horz[1].variance < threshold) {
- set_block_size(cm, m, get_subsize(bsize, PARTITION_HORZ), mis, mi_row,
- mi_col);
- return 1;
- }
-
- return 0;
-}
-#endif // PERFORM_RANDOM_PARTITIONING
-
-static void choose_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
- int mi_row, int mi_col) {
- VP9_COMMON * const cm = &cpi->common;
- MACROBLOCK *x = &cpi->mb;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
+static int sb_has_motion(VP9_COMP *cpi, MODE_INFO **prev_mi_8x8) {
+ VP9_COMMON *const cm = &cpi->common;
const int mis = cm->mode_info_stride;
- // TODO(JBB): More experimentation or testing of this threshold;
- int64_t threshold = 4;
- int i, j, k;
- v64x64 vt;
- unsigned char * s;
- int sp;
- const unsigned char * d;
- int dp;
- int pixels_wide = 64, pixels_high = 64;
-
- vp9_zero(vt);
- set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
-
- if (xd->mb_to_right_edge < 0)
- pixels_wide += (xd->mb_to_right_edge >> 3);
-
- if (xd->mb_to_bottom_edge < 0)
- pixels_high += (xd->mb_to_bottom_edge >> 3);
-
- s = x->plane[0].src.buf;
- sp = x->plane[0].src.stride;
-
- // TODO(JBB): Clearly the higher the quantizer the fewer partitions we want
- // but this needs more experimentation.
- threshold = threshold * cpi->common.base_qindex * cpi->common.base_qindex;
-
- d = vp9_64x64_zeros;
- dp = 64;
- if (cm->frame_type != KEY_FRAME) {
- int_mv nearest_mv, near_mv;
- const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, LAST_FRAME)];
- YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx];
- YV12_BUFFER_CONFIG *second_ref_fb = NULL;
-
- setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col,
- &xd->scale_factor[0]);
- setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col,
- &xd->scale_factor[1]);
+ int block_row, block_col;
- xd->this_mi->mbmi.ref_frame[0] = LAST_FRAME;
- xd->this_mi->mbmi.sb_type = BLOCK_64X64;
- vp9_find_best_ref_mvs(xd,
- mi_8x8[0]->mbmi.ref_mvs[mi_8x8[0]->mbmi.ref_frame[0]],
- &nearest_mv, &near_mv);
-
- xd->this_mi->mbmi.mv[0] = nearest_mv;
- vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64);
-
- d = xd->plane[0].dst.buf;
- dp = xd->plane[0].dst.stride;
- }
-
- // Fill in the entire tree of 8x8 variances for splits.
- for (i = 0; i < 4; i++) {
- const int x32_idx = ((i & 1) << 5);
- const int y32_idx = ((i >> 1) << 5);
- for (j = 0; j < 4; j++) {
- const int x16_idx = x32_idx + ((j & 1) << 4);
- const int y16_idx = y32_idx + ((j >> 1) << 4);
- v16x16 *vst = &vt.split[i].split[j];
- for (k = 0; k < 4; k++) {
- int x_idx = x16_idx + ((k & 1) << 3);
- int y_idx = y16_idx + ((k >> 1) << 3);
- unsigned int sse = 0;
- int sum = 0;
- if (x_idx < pixels_wide && y_idx < pixels_high)
- vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp,
- d + y_idx * dp + x_idx, dp, &sse, &sum);
- fill_variance(&vst->split[k].vt.none, sse, sum, 64);
- }
- }
- }
- // Fill the rest of the variance tree by summing the split partition
- // values.
- for (i = 0; i < 4; i++) {
- for (j = 0; j < 4; j++) {
- fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
- }
- fill_variance_tree(&vt.split[i], BLOCK_32X32);
- }
- fill_variance_tree(&vt, BLOCK_64X64);
- // Now go through the entire structure, splitting every block size until
- // we get to one that's got a variance lower than our threshold, or we
- // hit 8x8.
- if (!set_vt_partitioning(cpi, &vt, mi_8x8, BLOCK_64X64, mi_row, mi_col,
- 4)) {
- for (i = 0; i < 4; ++i) {
- const int x32_idx = ((i & 1) << 2);
- const int y32_idx = ((i >> 1) << 2);
- if (!set_vt_partitioning(cpi, &vt.split[i], mi_8x8, BLOCK_32X32,
- (mi_row + y32_idx), (mi_col + x32_idx), 2)) {
- for (j = 0; j < 4; ++j) {
- const int x16_idx = ((j & 1) << 1);
- const int y16_idx = ((j >> 1) << 1);
- if (!set_vt_partitioning(cpi, &vt.split[i].split[j], mi_8x8,
- BLOCK_16X16,
- (mi_row + y32_idx + y16_idx),
- (mi_col + x32_idx + x16_idx), 1)) {
- for (k = 0; k < 4; ++k) {
- const int x8_idx = (k & 1);
- const int y8_idx = (k >> 1);
- set_block_size(cm, mi_8x8, BLOCK_8X8, mis,
- (mi_row + y32_idx + y16_idx + y8_idx),
- (mi_col + x32_idx + x16_idx + x8_idx));
- }
- }
+ if (cm->prev_mi) {
+ for (block_row = 0; block_row < 8; ++block_row) {
+ for (block_col = 0; block_col < 8; ++block_col) {
+ MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col];
+ if (prev_mi) {
+ if (abs(prev_mi->mbmi.mv[0].as_mv.row) >= 8 ||
+ abs(prev_mi->mbmi.mv[0].as_mv.col) >= 8)
+ return 1;
}
}
}
}
+ return 0;
}
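sb_has_motion() gates the new LAST_FRAME_PARTITION_LOW_MOTION mode: the previous frame's partitioning is reused only when the co-located 64x64 block was close to static. Motion vectors are stored in 1/8-pel units, so the >= 8 comparison fires at one full pixel of motion. The test in isolation:

    #include <stdlib.h>

    /* MVs are in 1/8-pel units, so 8 equals one full pixel of motion. */
    static int mv_is_significant(int row_q3, int col_q3) {
      return abs(row_q3) >= 8 || abs(col_q3) >= 8;
    }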
-static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+static void rd_use_partition(VP9_COMP *cpi,
+ const TileInfo *const tile,
+ MODE_INFO **mi_8x8,
TOKENEXTRA **tp, int mi_row, int mi_col,
BLOCK_SIZE bsize, int *rate, int64_t *dist,
int do_recon) {
@@ -1315,6 +1022,11 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
}
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ if (bsize == BLOCK_16X16) {
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ x->mb_energy = vp9_block_energy(cpi, x, bsize);
+ }
+
x->fast_ms = 0;
x->subblock_ref = 0;
@@ -1338,11 +1050,12 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
mi_row + (ms >> 1) < cm->mi_rows &&
mi_col + (ms >> 1) < cm->mi_cols) {
*(get_sb_partitioning(x, bsize)) = bsize;
- pick_sb_modes(cpi, mi_row, mi_col, &none_rate, &none_dist, bsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
get_block_context(x, bsize), INT64_MAX);
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
none_rate += x->partition_cost[pl][PARTITION_NONE];
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -1353,12 +1066,12 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
switch (partition) {
case PARTITION_NONE:
- pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
bsize, get_block_context(x, bsize), INT64_MAX);
break;
case PARTITION_HORZ:
*get_sb_index(xd, subsize) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
subsize, get_block_context(x, subsize), INT64_MAX);
if (last_part_rate != INT_MAX &&
bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) {
@@ -1367,7 +1080,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
*get_sb_index(xd, subsize) = 1;
- pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
+ pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
get_block_context(x, subsize), INT64_MAX);
if (rt == INT_MAX || dt == INT_MAX) {
last_part_rate = INT_MAX;
@@ -1381,7 +1094,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
break;
case PARTITION_VERT:
*get_sb_index(xd, subsize) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
subsize, get_block_context(x, subsize), INT64_MAX);
if (last_part_rate != INT_MAX &&
bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
@@ -1390,7 +1103,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
*get_sb_index(xd, subsize) = 1;
- pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
get_block_context(x, subsize), INT64_MAX);
if (rt == INT_MAX || dt == INT_MAX) {
last_part_rate = INT_MAX;
@@ -1417,7 +1130,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
*get_sb_index(xd, subsize) = i;
- rd_use_partition(cpi, mi_8x8 + jj * bss * mis + ii * bss, tp,
+ rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt,
i != 3);
if (rt == INT_MAX || dt == INT_MAX) {
@@ -1432,8 +1145,9 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
default:
assert(0);
}
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+
+ pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, bsize);
if (last_part_rate < INT_MAX)
last_part_rate += x->partition_cost[pl][partition];
@@ -1465,7 +1179,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
+ pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
split_subsize, get_block_context(x, split_subsize),
INT64_MAX);
@@ -1478,17 +1192,18 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
}
if (i != 3)
- encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
+ encode_sb(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, 0,
split_subsize);
split_rate += rt;
split_dist += dt;
- set_partition_seg_context(cm, xd, mi_row + y_idx, mi_col + x_idx);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row + y_idx, mi_col + x_idx, bsize);
split_rate += x->partition_cost[pl][PARTITION_NONE];
}
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, bsize);
if (split_rate < INT_MAX) {
split_rate += x->partition_cost[pl][PARTITION_SPLIT];
@@ -1523,7 +1238,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX);
if (do_recon)
- encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+ encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
*rate = chosen_rate;
*dist = chosen_dist;
@@ -1571,66 +1286,68 @@ static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8,
// Look at neighboring blocks and set a min and max partition size based on
// what they chose.
-static void rd_auto_partition_range(VP9_COMP *cpi, int row, int col,
+static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
+ int row, int col,
BLOCK_SIZE *min_block_size,
BLOCK_SIZE *max_block_size) {
+ VP9_COMMON * const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
MODE_INFO ** mi_8x8 = xd->mi_8x8;
+ MODE_INFO ** prev_mi_8x8 = xd->prev_mi_8x8;
+
const int left_in_image = xd->left_available && mi_8x8[-1];
const int above_in_image = xd->up_available &&
mi_8x8[-xd->mode_info_stride];
MODE_INFO ** above_sb64_mi_8x8;
MODE_INFO ** left_sb64_mi_8x8;
- // Frequency check
- if (cpi->sf.auto_min_max_partition_count <= 0) {
- cpi->sf.auto_min_max_partition_count =
- cpi->sf.auto_min_max_partition_interval;
+ int row8x8_remaining = tile->mi_row_end - row;
+ int col8x8_remaining = tile->mi_col_end - col;
+ int bh, bw;
+
+ // Trap case where we do not have a prediction.
+ if (!left_in_image && !above_in_image &&
+ ((cm->frame_type == KEY_FRAME) || !cm->prev_mi)) {
*min_block_size = BLOCK_4X4;
*max_block_size = BLOCK_64X64;
} else {
- --cpi->sf.auto_min_max_partition_count;
-
- // Set default values if no left or above neighbour
- if (!left_in_image && !above_in_image) {
- *min_block_size = BLOCK_4X4;
- *max_block_size = BLOCK_64X64;
- } else {
- VP9_COMMON *const cm = &cpi->common;
- int row8x8_remaining = cm->cur_tile_mi_row_end - row;
- int col8x8_remaining = cm->cur_tile_mi_col_end - col;
- int bh, bw;
-
- // Default "min to max" and "max to min"
- *min_block_size = BLOCK_64X64;
- *max_block_size = BLOCK_4X4;
-
- // Find the min and max partition sizes used in the left SB64
- if (left_in_image) {
- left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE];
- get_sb_partition_size_range(cpi, left_sb64_mi_8x8,
- min_block_size, max_block_size);
- }
-
- // Find the min and max partition sizes used in the above SB64 taking
- // the values found for left as a starting point.
- if (above_in_image) {
- above_sb64_mi_8x8 = &mi_8x8[-xd->mode_info_stride * MI_BLOCK_SIZE];
- get_sb_partition_size_range(cpi, above_sb64_mi_8x8,
- min_block_size, max_block_size);
- }
+ // Default "min to max" and "max to min"
+ *min_block_size = BLOCK_64X64;
+ *max_block_size = BLOCK_4X4;
+
+  // NOTE: each call to get_sb_partition_size_range() uses the previously
+  // passed-in values for min and max as a starting point.
+ //
+ // Find the min and max partition used in previous frame at this location
+ if (cm->prev_mi && (cm->frame_type != KEY_FRAME)) {
+ get_sb_partition_size_range(cpi, prev_mi_8x8,
+ min_block_size, max_block_size);
+ }
- // Give a bit of leaway either side of the observed min and max
- *min_block_size = min_partition_size[*min_block_size];
- *max_block_size = max_partition_size[*max_block_size];
+ // Find the min and max partition sizes used in the left SB64
+ if (left_in_image) {
+ left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE];
+ get_sb_partition_size_range(cpi, left_sb64_mi_8x8,
+ min_block_size, max_block_size);
+ }
- // Check border cases where max and min from neighbours may not be legal.
- *max_block_size = find_partition_size(*max_block_size,
- row8x8_remaining, col8x8_remaining,
- &bh, &bw);
- *min_block_size = MIN(*min_block_size, *max_block_size);
+ // Find the min and max partition sizes used in the above SB64.
+ if (above_in_image) {
+ above_sb64_mi_8x8 = &mi_8x8[-xd->mode_info_stride * MI_BLOCK_SIZE];
+ get_sb_partition_size_range(cpi, above_sb64_mi_8x8,
+ min_block_size, max_block_size);
}
}
+
+  // Give a bit of leeway either side of the observed min and max
+ *min_block_size = min_partition_size[*min_block_size];
+ *max_block_size = max_partition_size[*max_block_size];
+
+ // Check border cases where max and min from neighbours may not be legal.
+ *max_block_size = find_partition_size(*max_block_size,
+ row8x8_remaining, col8x8_remaining,
+ &bh, &bw);
+ *min_block_size = MIN(*min_block_size, *max_block_size);
}
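The partition range is now seeded from the co-located previous-frame block plus the left and above SB64s, widened one step in each direction via the min_partition_size[]/max_partition_size[] lookups, and clamped so min never exceeds max after the border check. A sketch of the widen-and-clamp step with stand-in tables (the encoder's tables also cover rectangular sizes):

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* Square sizes only; stand-ins for the encoder's lookup tables. */
    enum { B4X4, B8X8, B16X16, B32X32, B64X64, NUM_SIZES };
    static const int step_down[NUM_SIZES] = {B4X4, B4X4, B8X8, B16X16, B32X32};
    static const int step_up[NUM_SIZES] = {B8X8, B16X16, B32X32, B64X64, B64X64};

    static void widen_range(int *min_bs, int *max_bs) {
      *min_bs = step_down[*min_bs];     /* one partition step smaller */
      *max_bs = step_up[*max_bs];       /* one partition step larger */
      *min_bs = MIN(*min_bs, *max_bs);  /* border clamping can cross them */
    }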
static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) {
@@ -1735,7 +1452,8 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
-static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
+static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
+ TOKENEXTRA **tp, int mi_row,
int mi_col, BLOCK_SIZE bsize, int *rate,
int64_t *dist, int do_recon, int64_t best_rd) {
VP9_COMMON * const cm = &cpi->common;
@@ -1772,7 +1490,13 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
return;
}
}
- assert(mi_height_log2(bsize) == mi_width_log2(bsize));
+ assert(num_8x8_blocks_wide_lookup[bsize] ==
+ num_8x8_blocks_high_lookup[bsize]);
+
+ if (bsize == BLOCK_16X16) {
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ x->mb_energy = vp9_block_energy(cpi, x, bsize);
+ }
// Determine partition types in search according to the speed features.
// The threshold set here has to be of square block size.
@@ -1807,12 +1531,13 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
// PARTITION_NONE
if (partition_none_allowed) {
- pick_sb_modes(cpi, mi_row, mi_col, &this_rate, &this_dist, bsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
get_block_context(x, bsize), best_rd);
if (this_rate != INT_MAX) {
if (bsize >= BLOCK_8X8) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
this_rate += x->partition_cost[pl][PARTITION_NONE];
}
sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
@@ -1860,7 +1585,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = i;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize,
&this_rate, &this_dist, i != 3, best_rd - sum_rd);
if (this_rate == INT_MAX) {
@@ -1872,8 +1597,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
}
}
if (sum_rd < best_rd && i == 4) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
sum_rate += x->partition_cost[pl][PARTITION_SPLIT];
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd) {
@@ -1881,12 +1607,12 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
best_dist = sum_dist;
best_rd = sum_rd;
*(get_sb_partitioning(x, bsize)) = subsize;
- } else {
- // skip rectangular partition test when larger block size
- // gives better rd cost
- if (cpi->sf.less_rectangular_check)
- do_rect &= !partition_none_allowed;
}
+ } else {
+ // skip rectangular partition test when larger block size
+ // gives better rd cost
+ if (cpi->sf.less_rectangular_check)
+ do_rect &= !partition_none_allowed;
}
partition_split_done = 1;
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -1906,7 +1632,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = 0;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
get_block_context(x, subsize), best_rd);
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
@@ -1917,7 +1643,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = 1;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- pick_sb_modes(cpi, mi_row + ms, mi_col, &this_rate,
+ pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate,
&this_dist, subsize, get_block_context(x, subsize),
best_rd - sum_rd);
if (this_rate == INT_MAX) {
@@ -1929,8 +1655,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
}
}
if (sum_rd < best_rd) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
sum_rate += x->partition_cost[pl][PARTITION_HORZ];
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd) {
@@ -1950,7 +1677,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = 0;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
get_block_context(x, subsize), best_rd);
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
@@ -1960,7 +1687,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = 1;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- pick_sb_modes(cpi, mi_row, mi_col + ms, &this_rate,
+ pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate,
&this_dist, subsize, get_block_context(x, subsize),
best_rd - sum_rd);
if (this_rate == INT_MAX) {
@@ -1972,8 +1699,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
}
}
if (sum_rd < best_rd) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
sum_rate += x->partition_cost[pl][PARTITION_VERT];
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd) {
@@ -1991,7 +1719,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*dist = best_dist;
if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon)
- encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+ encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
if (bsize == BLOCK_64X64) {
assert(tp_orig < *tp);
assert(best_rate < INT_MAX);
@@ -2002,10 +1730,10 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
}
// Examines 64x64 block and chooses a best reference frame
-static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) {
+static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile,
+ int mi_row, int mi_col) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
- MACROBLOCKD * const xd = &x->e_mbd;
int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl;
int ms = bs / 2;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
@@ -2023,10 +1751,10 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) {
if ((mi_row + (ms >> 1) < cm->mi_rows) &&
(mi_col + (ms >> 1) < cm->mi_cols)) {
cpi->set_ref_frame_mask = 1;
- pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_64X64,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64,
get_block_context(x, BLOCK_64X64), INT64_MAX);
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, BLOCK_64X64);
+ pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, BLOCK_64X64);
r += x->partition_cost[pl][PARTITION_NONE];
*(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64;
@@ -2036,27 +1764,27 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) {
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
}
-static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
- int *totalrate) {
+static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
+ int mi_row, TOKENEXTRA **tp, int *totalrate) {
VP9_COMMON * const cm = &cpi->common;
int mi_col;
// Initialize the left context for the new SB row
- vpx_memset(&cm->left_context, 0, sizeof(cm->left_context));
- vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context));
+ vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context));
+ vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context));
// Code each SB in the row
- for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
int dummy_rate;
int64_t dummy_dist;
- vpx_memset(cpi->mb.pred_mv, 0, sizeof(cpi->mb.pred_mv));
+ vp9_zero(cpi->mb.pred_mv);
if (cpi->sf.reference_masking)
- rd_pick_reference_frame(cpi, mi_row, mi_col);
+ rd_pick_reference_frame(cpi, tile, mi_row, mi_col);
- if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning ||
+ if (cpi->sf.use_lastframe_partitioning ||
cpi->sf.use_one_partition_size_always ) {
const int idx_str = cm->mode_info_stride * mi_row + mi_col;
MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
@@ -2064,13 +1792,9 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
cpi->mb.source_variance = UINT_MAX;
if (cpi->sf.use_one_partition_size_always) {
- set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
- set_partitioning(cpi, mi_8x8, mi_row, mi_col);
- rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1);
- } else if (cpi->sf.partition_by_variance) {
- choose_partitioning(cpi, cm->mi_grid_visible, mi_row, mi_col);
- rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
+ set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+ set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col);
+ rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1);
} else {
if ((cpi->common.current_video_frame
@@ -2078,31 +1802,34 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
|| cm->prev_mi == 0
|| cpi->common.show_frame == 0
|| cpi->common.frame_type == KEY_FRAME
- || cpi->is_src_frame_alt_ref) {
+ || cpi->is_src_frame_alt_ref
+ || ((cpi->sf.use_lastframe_partitioning ==
+ LAST_FRAME_PARTITION_LOW_MOTION) &&
+ sb_has_motion(cpi, prev_mi_8x8))) {
// If required set upper and lower partition size limits
if (cpi->sf.auto_min_max_partition_size) {
- set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
- rd_auto_partition_range(cpi, mi_row, mi_col,
+ set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+ rd_auto_partition_range(cpi, tile, mi_row, mi_col,
&cpi->sf.min_partition_size,
&cpi->sf.max_partition_size);
}
- rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64,
+ rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1, INT64_MAX);
} else {
copy_partitioning(cpi, mi_8x8, prev_mi_8x8);
- rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
+ rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1);
}
}
} else {
// If required set upper and lower partition size limits
if (cpi->sf.auto_min_max_partition_size) {
- set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
- rd_auto_partition_range(cpi, mi_row, mi_col,
+ set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+ rd_auto_partition_range(cpi, tile, mi_row, mi_col,
&cpi->sf.min_partition_size,
&cpi->sf.max_partition_size);
}
- rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64,
+ rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1, INT64_MAX);
}
}
@@ -2120,7 +1847,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
xd->mode_info_stride = cm->mode_info_stride;
// reset intra mode contexts
- if (cm->frame_type == KEY_FRAME)
+ if (frame_is_intra_only(cm))
vp9_init_mbmode_probs(cm);
// Copy data over into macro block data structures.
@@ -2129,16 +1856,16 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
// TODO(jkoleszar): are these initializations required?
setup_pre_planes(xd, 0, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]],
0, 0, NULL);
- setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
+ setup_dst_planes(xd, get_frame_new_buffer(cm), 0, 0);
setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
- xd->this_mi->mbmi.mode = DC_PRED;
- xd->this_mi->mbmi.uv_mode = DC_PRED;
+ xd->mi_8x8[0]->mbmi.mode = DC_PRED;
+ xd->mi_8x8[0]->mbmi.uv_mode = DC_PRED;
- vp9_zero(cpi->y_mode_count)
- vp9_zero(cpi->y_uv_mode_count)
- vp9_zero(cm->counts.inter_mode)
+ vp9_zero(cpi->y_mode_count);
+ vp9_zero(cpi->y_uv_mode_count);
+ vp9_zero(cm->counts.inter_mode);
vp9_zero(cpi->partition_count);
vp9_zero(cpi->intra_inter_count);
vp9_zero(cpi->comp_inter_count);
@@ -2149,29 +1876,26 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(cm->above_context[0], 0,
- sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols);
- vpx_memset(cm->above_seg_context, 0,
- sizeof(PARTITION_CONTEXT) * aligned_mi_cols);
+ vpx_memset(cpi->above_context[0], 0,
+ sizeof(*cpi->above_context[0]) *
+ 2 * aligned_mi_cols * MAX_MB_PLANE);
+ vpx_memset(cpi->above_seg_context, 0,
+ sizeof(*cpi->above_seg_context) * aligned_mi_cols);
}
static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
if (lossless) {
// printf("Switching to lossless\n");
- cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
- cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
+ cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
+ cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add;
cpi->mb.optimize = 0;
cpi->common.lf.filter_level = 0;
cpi->zbin_mode_boost_enabled = 0;
cpi->common.tx_mode = ONLY_4X4;
} else {
// printf("Not lossless\n");
- cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
- cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
+ cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
+ cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add;
}
}
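Lossless mode swaps the 4x4 DCT for the Walsh-Hadamard pair (vp9_fwht4x4/vp9_iwht4x4_add), zeroes the loop filter and trellis optimization, and pins the transform mode to ONLY_4X4; the two separate inverse hooks also collapse into a single itxm_add pointer. A sketch of that function-pointer toggle with stub transforms:

    typedef void (*fwd_txfm_fn)(const short *input, short *output, int stride);

    static void fwht4x4_stub(const short *in, short *out, int s) {
      (void)in; (void)out; (void)s;  /* placeholder body */
    }
    static void fdct4x4_stub(const short *in, short *out, int s) {
      (void)in; (void)out; (void)s;  /* placeholder body */
    }

    /* Select the forward 4x4 transform through a function pointer, the
     * same shape as the fwd_txm4x4 assignment above. */
    static void set_fwd_txfm(fwd_txfm_fn *hook, int lossless) {
      *hook = lossless ? fwht4x4_stub : fdct4x4_stub;
    }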
@@ -2204,21 +1928,15 @@ static void encode_frame_internal(VP9_COMP *cpi) {
totalrate = 0;
- // Reset frame count of inter 0,0 motion vector usage.
- cpi->inter_zz_count = 0;
-
vp9_zero(cm->counts.switchable_interp);
- vp9_zero(cpi->txfm_stepdown_count);
+ vp9_zero(cpi->tx_stepdown_count);
xd->mi_8x8 = cm->mi_grid_visible;
// required for vp9_frame_init_quantizer
- xd->this_mi =
xd->mi_8x8[0] = cm->mi;
- xd->mic_stream_ptr = cm->mi;
xd->last_mi = cm->prev_mi;
-
vp9_zero(cpi->NMVcount);
vp9_zero(cpi->coef_counts);
vp9_zero(cm->counts.eob_branch);
@@ -2229,7 +1947,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
vp9_frame_init_quantizer(cpi);
- vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q);
+ vp9_initialize_rd_consts(cpi);
vp9_initialize_me_consts(cpi, cm->base_qindex);
switch_tx_mode(cpi);
@@ -2263,16 +1981,15 @@ static void encode_frame_internal(VP9_COMP *cpi) {
const int tile_rows = 1 << cm->log2_tile_rows;
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(cm, tile_row);
-
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileInfo tile;
TOKENEXTRA *tp_old = tp;
// For each row of SBs in the frame
- vp9_get_tile_col_offsets(cm, tile_col);
- for (mi_row = cm->cur_tile_mi_row_start;
- mi_row < cm->cur_tile_mi_row_end; mi_row += 8)
- encode_sb_row(cpi, mi_row, &tp, &totalrate);
+ vp9_tile_init(&tile, cm, tile_row, tile_col);
+ for (mi_row = tile.mi_row_start;
+ mi_row < tile.mi_row_end; mi_row += 8)
+ encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate);
cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
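Tile bounds now live in a stack-local TileInfo filled in by vp9_tile_init(), replacing the mutable cur_tile_* fields on VP9_COMMON, which makes the per-tile loop safe to re-enter. A sketch of the loop shape; the even split below is a stand-in, since the real tile boundaries are superblock-aligned:

    typedef struct {
      int mi_row_start, mi_row_end, mi_col_start, mi_col_end;
    } TileInfo;  /* field names match the real struct */

    static void encode_tiles(int tile_rows, int tile_cols,
                             int mi_rows, int mi_cols) {
      int r, c, mi_row;
      for (r = 0; r < tile_rows; ++r) {
        for (c = 0; c < tile_cols; ++c) {
          TileInfo tile;  /* local per-tile state, not shared globals */
          tile.mi_row_start = r * mi_rows / tile_rows;  /* stand-in split */
          tile.mi_row_end = (r + 1) * mi_rows / tile_rows;
          tile.mi_col_start = c * mi_cols / tile_cols;
          tile.mi_col_end = (c + 1) * mi_cols / tile_cols;
          for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
               mi_row += 8) {
            /* encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate); */
          }
        }
      }
    }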
@@ -2306,7 +2023,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
// Keep record of the total distortion this time around for future use
cpi->last_frame_distortion = cpi->frame_distortion;
#endif
-
}
static int check_dual_ref_flags(VP9_COMP *cpi) {
@@ -2347,18 +2063,19 @@ static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO **mi_8x8,
int mis, TX_SIZE max_tx_size, int bw, int bh,
int mi_row, int mi_col, BLOCK_SIZE bsize) {
VP9_COMMON * const cm = &cpi->common;
- MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi;
- if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) {
return;
-
- if (mbmi->tx_size > max_tx_size) {
- const int ymbs = MIN(bh, cm->mi_rows - mi_row);
- const int xmbs = MIN(bw, cm->mi_cols - mi_col);
-
- assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
- get_skip_flag(mi_8x8, mis, ymbs, xmbs));
- set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size);
+ } else {
+ MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi;
+ if (mbmi->tx_size > max_tx_size) {
+ const int ymbs = MIN(bh, cm->mi_rows - mi_row);
+ const int xmbs = MIN(bw, cm->mi_cols - mi_col);
+
+ assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
+ get_skip_flag(mi_8x8, mis, ymbs, xmbs));
+ set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size);
+ }
}
}
@@ -2424,7 +2141,7 @@ static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
static int get_frame_type(VP9_COMP *cpi) {
int frame_type;
- if (cpi->common.frame_type == KEY_FRAME)
+ if (frame_is_intra_only(&cpi->common))
frame_type = 0;
else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)
frame_type = 3;
@@ -2453,9 +2170,9 @@ static void select_tx_mode(VP9_COMP *cpi) {
unsigned int total = 0;
int i;
for (i = 0; i < TX_SIZES; ++i)
- total += cpi->txfm_stepdown_count[i];
+ total += cpi->tx_stepdown_count[i];
if (total) {
- double fraction = (double)cpi->txfm_stepdown_count[0] / total;
+ double fraction = (double)cpi->tx_stepdown_count[0] / total;
cpi->common.tx_mode = fraction > 0.90 ? ALLOW_32X32 : TX_MODE_SELECT;
// printf("fraction = %f\n", fraction);
} // else keep unchanged
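select_tx_mode() looks at how often the RD loop stepped the transform size down from the largest allowed: if over 90% of counted blocks kept the largest size (bucket 0), the frame commits to ALLOW_32X32 and skips per-block signaling; otherwise it pays for TX_MODE_SELECT. The fraction test in isolation:

    /* counts[0] is "kept the largest transform size" (no stepdown). */
    static int mostly_largest_tx(const unsigned int *counts, int n) {
      unsigned int total = 0;
      int i;
      for (i = 0; i < n; ++i)
        total += counts[i];
      return total > 0 && (double)counts[0] / total > 0.90;
    }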
@@ -2472,21 +2189,23 @@ void vp9_encode_frame(VP9_COMP *cpi) {
// requires further work in the rd loop. For now the only supported encoder
// side behavior is where the ALT ref buffer has opposite sign bias to
// the other two.
- if ((cm->ref_frame_sign_bias[ALTREF_FRAME]
- == cm->ref_frame_sign_bias[GOLDEN_FRAME])
- || (cm->ref_frame_sign_bias[ALTREF_FRAME]
- == cm->ref_frame_sign_bias[LAST_FRAME])) {
- cm->allow_comp_inter_inter = 0;
- } else {
- cm->allow_comp_inter_inter = 1;
- cm->comp_fixed_ref = ALTREF_FRAME;
- cm->comp_var_ref[0] = LAST_FRAME;
- cm->comp_var_ref[1] = GOLDEN_FRAME;
+ if (!frame_is_intra_only(cm)) {
+ if ((cm->ref_frame_sign_bias[ALTREF_FRAME]
+ == cm->ref_frame_sign_bias[GOLDEN_FRAME])
+ || (cm->ref_frame_sign_bias[ALTREF_FRAME]
+ == cm->ref_frame_sign_bias[LAST_FRAME])) {
+ cm->allow_comp_inter_inter = 0;
+ } else {
+ cm->allow_comp_inter_inter = 1;
+ cm->comp_fixed_ref = ALTREF_FRAME;
+ cm->comp_var_ref[0] = LAST_FRAME;
+ cm->comp_var_ref[1] = GOLDEN_FRAME;
+ }
}
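Compound inter-inter prediction stays enabled only when ALTREF's sign bias differs from both LAST and GOLDEN, i.e. the fixed reference sits on the opposite temporal side of the two variable references, and the whole check is now skipped for intra-only frames. The predicate by itself:

    /* Compound prediction requires the fixed (ALTREF) reference to lie on
     * the opposite temporal side of both variable references. */
    static int allow_compound(int bias_last, int bias_golden, int bias_alt) {
      return bias_alt != bias_golden && bias_alt != bias_last;
    }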
if (cpi->sf.RD) {
int i, pred_type;
- INTERPOLATIONFILTERTYPE filter_type;
+ INTERPOLATION_TYPE filter_type;
/*
* This code does a single RD pass over the whole frame assuming
* either compound, single or hybrid prediction as per whatever has
@@ -2554,7 +2273,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
}
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs;
cpi->rd_filter_threshes[frame_type][i] =
(cpi->rd_filter_threshes[frame_type][i] + diff) / 2;
@@ -2627,7 +2346,6 @@ void vp9_encode_frame(VP9_COMP *cpi) {
} else {
encode_frame_internal(cpi);
}
-
}
static void sum_intra_stats(VP9_COMP *cpi, const MODE_INFO *mi) {
@@ -2732,7 +2450,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[0])];
YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx];
YV12_BUFFER_CONFIG *second_ref_fb = NULL;
- if (mbmi->ref_frame[1] > 0) {
+ if (has_second_ref(mbmi)) {
idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[1])];
second_ref_fb = &cm->yv12_fb[idx];
}
@@ -2744,7 +2462,6 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col,
&xd->scale_factor[1]);
-
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
}
@@ -2770,7 +2487,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
(mbmi->skip_coeff ||
vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) {
const uint8_t context = vp9_get_pred_context_tx_size(xd);
- update_tx_counts(bsize, context, mbmi->tx_size, &cm->counts.tx);
+ ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size];
} else {
int x, y;
TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode];
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.h b/libvpx/vp9/encoder/vp9_encodeframe.h
index 3991969..3e9f538 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.h
+++ b/libvpx/vp9/encoder/vp9_encodeframe.h
@@ -17,6 +17,6 @@ struct yv12_buffer_config;
void vp9_setup_src_planes(struct macroblock *x,
const struct yv12_buffer_config *src,
- int mb_row, int mb_col);
+ int mi_row, int mi_col);
#endif // VP9_ENCODER_VP9_ENCODEFRAME_H_
diff --git a/libvpx/vp9/encoder/vp9_encodeintra.c b/libvpx/vp9/encoder/vp9_encodeintra.c
index c5e5dff..32b4593 100644
--- a/libvpx/vp9/encoder/vp9_encodeintra.c
+++ b/libvpx/vp9/encoder/vp9_encodeintra.c
@@ -9,7 +9,7 @@
*/
#include "./vpx_config.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/encoder/vp9_encodemb.h"
diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c
index 8dd80a5..75ed8ea 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/libvpx/vp9/encoder/vp9_encodemb.c
@@ -8,19 +8,22 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+
+#include "./vp9_rtcd.h"
#include "./vpx_config.h"
-#include "vp9/encoder/vp9_encodemb.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_reconinter.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/common/vp9_reconintra.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/common/vp9_systemdependent.h"
-#include "vp9_rtcd.h"
-DECLARE_ALIGNED(16, extern const uint8_t,
- vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+#include "vp9/encoder/vp9_dct.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_tokenize.h"
void vp9_subtract_block_c(int rows, int cols,
int16_t *diff_ptr, ptrdiff_t diff_stride,
@@ -38,37 +41,6 @@ void vp9_subtract_block_c(int rows, int cols,
}
}
-static void inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob,
- int16_t *dqcoeff, uint8_t *dest,
- int stride) {
- if (eob <= 1)
- xd->inv_txm4x4_1_add(dqcoeff, dest, stride);
- else
- xd->inv_txm4x4_add(dqcoeff, dest, stride);
-}
-
-static void inverse_transform_b_8x8_add(int eob,
- int16_t *dqcoeff, uint8_t *dest,
- int stride) {
- if (eob <= 1)
- vp9_short_idct8x8_1_add(dqcoeff, dest, stride);
- else if (eob <= 10)
- vp9_short_idct10_8x8_add(dqcoeff, dest, stride);
- else
- vp9_short_idct8x8_add(dqcoeff, dest, stride);
-}
-
-static void inverse_transform_b_16x16_add(int eob,
- int16_t *dqcoeff, uint8_t *dest,
- int stride) {
- if (eob <= 1)
- vp9_short_idct16x16_1_add(dqcoeff, dest, stride);
- else if (eob <= 10)
- vp9_short_idct10_16x16_add(dqcoeff, dest, stride);
- else
- vp9_short_idct16x16_add(dqcoeff, dest, stride);
-}
-
static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
struct macroblock_plane *const p = &x->plane[plane];
const MACROBLOCKD *const xd = &x->e_mbd;
@@ -97,8 +69,7 @@ void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
vp9_subtract_sbuv(x, bsize);
}
-
-#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
+#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
typedef struct vp9_token_state vp9_token_state;
struct vp9_token_state {
@@ -109,7 +80,7 @@ struct vp9_token_state {
short qc;
};
-// TODO: experiments to find optimal multiple numbers
+// TODO(jimbankoski): experiment to find optimal RD numbers.
#define Y1_RD_MULT 4
#define UV_RD_MULT 2
@@ -147,7 +118,7 @@ static void optimize_b(MACROBLOCK *mb,
TX_SIZE tx_size) {
MACROBLOCKD *const xd = &mb->e_mbd;
struct macroblockd_plane *pd = &xd->plane[plane];
- const int ref = is_inter_block(&xd->this_mi->mbmi);
+ const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
vp9_token_state tokens[1025][2];
unsigned best_index[1025][2];
const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block);
@@ -161,40 +132,18 @@ static void optimize_b(MACROBLOCK *mb,
int best, band, pt;
PLANE_TYPE type = pd->plane_type;
int err_mult = plane_rd_mult[type];
- int default_eob;
+ const int default_eob = 16 << (tx_size << 1);
const int16_t *scan, *nb;
const int mul = 1 + (tx_size == TX_32X32);
uint8_t token_cache[1024];
const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block);
const int16_t *dequant_ptr = pd->dequant;
- const uint8_t * band_translate;
+ const uint8_t *const band_translate = get_band_translate(tx_size);
assert((!type && !plane) || (type && plane));
dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
- switch (tx_size) {
- default:
- case TX_4X4:
- default_eob = 16;
- scan = get_scan_4x4(get_tx_type_4x4(type, xd, ib));
- band_translate = vp9_coefband_trans_4x4;
- break;
- case TX_8X8:
- scan = get_scan_8x8(get_tx_type_8x8(type, xd));
- default_eob = 64;
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- case TX_16X16:
- scan = get_scan_16x16(get_tx_type_16x16(type, xd));
- default_eob = 256;
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- case TX_32X32:
- scan = vp9_default_scan_32x32;
- default_eob = 1024;
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- }
+ get_scan(xd, tx_size, type, ib, &scan, &nb);
assert(eob <= default_eob);
/* Now set up a Viterbi trellis to evaluate alternative roundings. */
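Note: the switch removed above set default_eob to 16/64/256/1024 for TX_4X4..TX_32X32; an NxN transform holds 16 * 4^tx_size coefficients, which is exactly what 16 << (tx_size << 1) computes. A quick self-check (illustrative only):

#include <assert.h>
static void check_default_eob(void) {
  static const int eobs[4] = { 16, 64, 256, 1024 };  /* TX_4X4..TX_32X32 */
  int tx_size;
  for (tx_size = 0; tx_size < 4; ++tx_size)
    assert((16 << (tx_size << 1)) == eobs[tx_size]);
}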
@@ -213,7 +162,6 @@ static void optimize_b(MACROBLOCK *mb,
for (i = 0; i < eob; i++)
token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
qcoeff_ptr[scan[i]]].token];
- nb = vp9_get_coef_neighbors_handle(scan);
for (i = eob; i-- > i0;) {
int base_bits, d2, dx;
@@ -312,11 +260,10 @@ static void optimize_b(MACROBLOCK *mb,
best_index[i][1] = best;
/* Finally, make this the new head of the trellis. */
next = i;
- }
- /* There's no choice to make for a zero coefficient, so we don't
- * add a new trellis node, but we do need to update the costs.
- */
- else {
+ } else {
+ /* There's no choice to make for a zero coefficient, so we don't
+ * add a new trellis node, but we do need to update the costs.
+ */
band = get_coef_band(band_translate, i + 1);
t0 = tokens[next][0].token;
t1 = tokens[next][1].token;
@@ -385,38 +332,12 @@ static void optimize_init_b(int plane, BLOCK_SIZE bsize,
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
- const MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ const MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
- int i;
- switch (tx_size) {
- case TX_4X4:
- vpx_memcpy(args->ctx->ta[plane], pd->above_context,
- sizeof(ENTROPY_CONTEXT) * num_4x4_w);
- vpx_memcpy(args->ctx->tl[plane], pd->left_context,
- sizeof(ENTROPY_CONTEXT) * num_4x4_h);
- break;
- case TX_8X8:
- for (i = 0; i < num_4x4_w; i += 2)
- args->ctx->ta[plane][i] = !!*(uint16_t *)&pd->above_context[i];
- for (i = 0; i < num_4x4_h; i += 2)
- args->ctx->tl[plane][i] = !!*(uint16_t *)&pd->left_context[i];
- break;
- case TX_16X16:
- for (i = 0; i < num_4x4_w; i += 4)
- args->ctx->ta[plane][i] = !!*(uint32_t *)&pd->above_context[i];
- for (i = 0; i < num_4x4_h; i += 4)
- args->ctx->tl[plane][i] = !!*(uint32_t *)&pd->left_context[i];
- break;
- case TX_32X32:
- for (i = 0; i < num_4x4_w; i += 8)
- args->ctx->ta[plane][i] = !!*(uint64_t *)&pd->above_context[i];
- for (i = 0; i < num_4x4_h; i += 8)
- args->ctx->tl[plane][i] = !!*(uint64_t *)&pd->left_context[i];
- break;
- default:
- assert(0);
- }
+ vp9_get_entropy_contexts(tx_size, args->ctx->ta[plane], args->ctx->tl[plane],
+ pd->above_context, pd->left_context,
+ num_4x4_w, num_4x4_h);
}
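Note: the four unrolled cases deleted above all mark a transform block's above/left context as set when any of its 4x4 sub-contexts is set (the uint16/32/64 loads were a fast any-byte-nonzero test over 2/4/8 bytes). A plain-C sketch of what the consolidated vp9_get_entropy_contexts is assumed to compute; the 4x4 case stays a straight copy:

#include <string.h>
static void get_entropy_contexts_sketch(TX_SIZE tx_size,
                                        ENTROPY_CONTEXT t_above[16],
                                        ENTROPY_CONTEXT t_left[16],
                                        const ENTROPY_CONTEXT *above,
                                        const ENTROPY_CONTEXT *left,
                                        int num_4x4_w, int num_4x4_h) {
  const int step = 1 << tx_size;  /* 1, 2, 4 or 8 4x4 units per tx block */
  int i, j;
  if (tx_size == TX_4X4) {
    memcpy(t_above, above, sizeof(*above) * num_4x4_w);
    memcpy(t_left, left, sizeof(*left) * num_4x4_h);
    return;
  }
  for (i = 0; i < num_4x4_w; i += step) {
    ENTROPY_CONTEXT any = 0;
    for (j = 0; j < step; ++j)
      any |= above[i + j];
    t_above[i] = any ? 1 : 0;
  }
  for (i = 0; i < num_4x4_h; i += step) {
    ENTROPY_CONTEXT any = 0;
    for (j = 0; j < step; ++j)
      any |= left[i + j];
    t_left[i] = any ? 1 : 0;
  }
}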
void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -445,9 +366,9 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
yoff = 32 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
if (x->use_lp32x32fdct)
- vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
+ vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
else
- vp9_short_fdct32x32(src_diff, coeff, bw * 8);
+ vp9_fdct32x32(src_diff, coeff, bw * 4);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -459,7 +380,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 16 * (block & twmask);
yoff = 16 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
- x->fwd_txm16x16(src_diff, coeff, bw * 8);
+ vp9_fdct16x16(src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -471,7 +392,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 8 * (block & twmask);
yoff = 8 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
- x->fwd_txm8x8(src_diff, coeff, bw * 8);
+ vp9_fdct8x8(src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -482,7 +403,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 4 * (block & twmask);
yoff = 4 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
- x->fwd_txm4x4(src_diff, coeff, bw * 8);
+ x->fwd_txm4x4(src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -497,6 +418,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
struct encode_b_args *const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
+ struct optimize_ctx *const ctx = args->ctx;
struct macroblockd_plane *const pd = &xd->plane[plane];
const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
block);
@@ -504,38 +426,68 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
pd->dst.buf, pd->dst.stride);
+
+ // TODO(jingning): per transformed block zero forcing only enabled for
+ // luma component. will integrate chroma components as well.
+ if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+ int i, j;
+ pd->eobs[block] = 0;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+ ctx->ta[plane][i] = 0;
+ ctx->tl[plane][j] = 0;
+ return;
+ }
+
vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
if (x->optimize)
- vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
+ vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
if (x->skip_encode || pd->eobs[block] == 0)
return;
switch (tx_size) {
case TX_32X32:
- vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
+ vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
break;
case TX_16X16:
- inverse_transform_b_16x16_add(pd->eobs[block], dqcoeff, dst,
- pd->dst.stride);
+ vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
break;
case TX_8X8:
- inverse_transform_b_8x8_add(pd->eobs[block], dqcoeff, dst,
- pd->dst.stride);
+ vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
break;
case TX_4X4:
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
- inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff,
- dst, pd->dst.stride);
+ xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
break;
default:
assert(!"Invalid transform size");
}
}
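Note: the vp9_idct*_add entry points now take the eob and are assumed to fold the dispatch that the deleted inverse_transform_b_* helpers did by hand. Reconstructed from the 16x16 helper removed earlier in this file:

/* eob <= 1: DC-only; eob <= 10: low-frequency coeffs only; else full idct. */
static void idct16x16_add_dispatch(int16_t *dqcoeff, uint8_t *dest,
                                   int stride, int eob) {
  if (eob <= 1)
    vp9_short_idct16x16_1_add(dqcoeff, dest, stride);
  else if (eob <= 10)
    vp9_short_idct10_16x16_add(dqcoeff, dest, stride);
  else
    vp9_short_idct16x16_add(dqcoeff, dest, stride);
}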
+static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct encode_b_args *const args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
+ block);
+
+ int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
+ pd->dst.buf, pd->dst.stride);
+
+ vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
+
+ if (pd->eobs[block] == 0)
+ return;
+
+ xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+}
+
void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
@@ -545,7 +497,7 @@ void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
if (x->optimize)
optimize_init_b(0, bsize, &arg);
- foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
+ foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1, &arg);
}
void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
@@ -569,7 +521,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
struct encode_b_args* const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
@@ -607,14 +559,14 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_subtract_block(32, 32, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
if (x->use_lp32x32fdct)
- vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
+ vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
else
- vp9_short_fdct32x32(src_diff, coeff, bw * 8);
+ vp9_fdct32x32(src_diff, coeff, bw * 4);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
if (!x->skip_encode && *eob)
- vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
+ vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob);
break;
case TX_16X16:
tx_type = get_tx_type_16x16(pd->plane_type, xd);
@@ -631,19 +583,12 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
dst, pd->dst.stride, dst, pd->dst.stride);
vp9_subtract_block(16, 16, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
- if (tx_type != DCT_DCT)
- vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type);
- else
- x->fwd_txm16x16(src_diff, coeff, bw * 8);
+ vp9_fht16x16(tx_type, src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
- if (!x->skip_encode && *eob) {
- if (tx_type == DCT_DCT)
- inverse_transform_b_16x16_add(*eob, dqcoeff, dst, pd->dst.stride);
- else
- vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type);
- }
+ if (!x->skip_encode && *eob)
+ vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
break;
case TX_8X8:
tx_type = get_tx_type_8x8(pd->plane_type, xd);
@@ -660,26 +605,19 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
dst, pd->dst.stride, dst, pd->dst.stride);
vp9_subtract_block(8, 8, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
- if (tx_type != DCT_DCT)
- vp9_short_fht8x8(src_diff, coeff, bw * 4, tx_type);
- else
- x->fwd_txm8x8(src_diff, coeff, bw * 8);
+ vp9_fht8x8(tx_type, src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
- if (!x->skip_encode && *eob) {
- if (tx_type == DCT_DCT)
- inverse_transform_b_8x8_add(*eob, dqcoeff, dst, pd->dst.stride);
- else
- vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
- }
+ if (!x->skip_encode && *eob)
+ vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
break;
case TX_4X4:
tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
scan = get_scan_4x4(tx_type);
iscan = get_iscan_4x4(tx_type);
if (mbmi->sb_type < BLOCK_8X8 && plane == 0)
- mode = xd->this_mi->bmi[block].as_mode;
+ mode = xd->mi_8x8[0]->bmi[block].as_mode;
else
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
@@ -695,7 +633,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
if (tx_type != DCT_DCT)
vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type);
else
- x->fwd_txm4x4(src_diff, coeff, bw * 8);
+ x->fwd_txm4x4(src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -704,9 +642,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
- inverse_transform_b_4x4_add(xd, *eob, dqcoeff, dst, pd->dst.stride);
+ xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob);
else
- vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
+ vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type);
}
break;
default:
diff --git a/libvpx/vp9/encoder/vp9_encodemb.h b/libvpx/vp9/encoder/vp9_encodemb.h
index 54e69fd..61dd735 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.h
+++ b/libvpx/vp9/encoder/vp9_encodemb.h
@@ -16,32 +16,17 @@
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/common/vp9_onyxc_int.h"
-typedef enum {
- RD_DC_PRED = DC_PRED,
- RD_V_PRED = V_PRED,
- RD_H_PRED = H_PRED,
- RD_D45_PRED = D45_PRED,
- RD_D135_PRED = D135_PRED,
- RD_D117_PRED = D117_PRED,
- RD_D153_PRED = D153_PRED,
- RD_D207_PRED = D207_PRED,
- RD_D63_PRED = D63_PRED,
- RD_TM_PRED = TM_PRED,
- RD_NEARESTMV = NEARESTMV,
- RD_NEARMV = NEARMV,
- RD_ZEROMV = ZEROMV,
- RD_NEWMV = NEWMV,
- RD_I4X4_PRED,
- RD_SPLITMV,
- RD_MODE_COUNT
-} RD_PREDICTION_MODE;
-
typedef struct {
- RD_PREDICTION_MODE mode;
+ MB_PREDICTION_MODE mode;
MV_REFERENCE_FRAME ref_frame;
MV_REFERENCE_FRAME second_ref_frame;
} MODE_DEFINITION;
+typedef struct {
+ MV_REFERENCE_FRAME ref_frame;
+ MV_REFERENCE_FRAME second_ref_frame;
+} REF_DEFINITION;
+
struct optimize_ctx {
ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c
index ed3a2bb..e2c6c4c 100644
--- a/libvpx/vp9/encoder/vp9_encodemv.c
+++ b/libvpx/vp9/encoder/vp9_encodemv.c
@@ -8,13 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <math.h>
#include "vp9/common/vp9_common.h"
-#include "vp9/encoder/vp9_encodemv.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/encoder/vp9_encodemv.h"
-#include <math.h>
#ifdef ENTROPY_STATS
extern unsigned int active_section;
@@ -124,8 +124,9 @@ static void build_nmv_component_cost_table(int *mvcost,
}
}
-static int update_mv(vp9_writer *w, const unsigned int ct[2],
- vp9_prob *cur_p, vp9_prob new_p, vp9_prob upd_p) {
+static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p,
+ vp9_prob upd_p) {
+ const vp9_prob new_p = get_binary_prob(ct[0], ct[1]);
vp9_prob mod_p = new_p | 1;
const int cur_b = cost_branch256(ct, *cur_p);
const int mod_b = cost_branch256(ct, mod_p);
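Note: update_mv now derives new_p from the branch counts itself. get_binary_prob is, roughly, the rounded maximum-likelihood estimate of branch 0 in Q8, clamped to [1, 255]; a sketch of the assumed helper (not a verbatim copy):

static vp9_prob binary_prob_sketch(unsigned int ct0, unsigned int ct1) {
  const unsigned int den = ct0 + ct1;
  unsigned int p;
  if (den == 0)
    return 128;  /* no observations: stay at the midpoint */
  p = (ct0 * 256 + (den >> 1)) / den;  /* rounded ct0/den in Q8 */
  return (vp9_prob)(p < 1 ? 1 : (p > 255 ? 255 : p));
}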
@@ -143,7 +144,6 @@ static int update_mv(vp9_writer *w, const unsigned int ct[2],
static void counts_to_nmv_context(
nmv_context_counts *nmv_count,
- nmv_context *prob,
int usehp,
unsigned int (*branch_ct_joint)[2],
unsigned int (*branch_ct_sign)[2],
@@ -156,29 +156,24 @@ static void counts_to_nmv_context(
unsigned int (*branch_ct_hp)[2]) {
int i, j, k;
vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
- prob->joints,
branch_ct_joint,
nmv_count->joints, 0);
for (i = 0; i < 2; ++i) {
const uint32_t s0 = nmv_count->comps[i].sign[0];
const uint32_t s1 = nmv_count->comps[i].sign[1];
- prob->comps[i].sign = get_binary_prob(s0, s1);
branch_ct_sign[i][0] = s0;
branch_ct_sign[i][1] = s1;
vp9_tree_probs_from_distribution(vp9_mv_class_tree,
- prob->comps[i].classes,
- branch_ct_classes[i],
- nmv_count->comps[i].classes, 0);
+ branch_ct_classes[i],
+ nmv_count->comps[i].classes, 0);
vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
- prob->comps[i].class0,
branch_ct_class0[i],
nmv_count->comps[i].class0, 0);
for (j = 0; j < MV_OFFSET_BITS; ++j) {
const uint32_t b0 = nmv_count->comps[i].bits[j][0];
const uint32_t b1 = nmv_count->comps[i].bits[j][1];
- prob->comps[i].bits[j] = get_binary_prob(b0, b1);
branch_ct_bits[i][j][0] = b0;
branch_ct_bits[i][j][1] = b1;
}
@@ -186,12 +181,10 @@ static void counts_to_nmv_context(
for (i = 0; i < 2; ++i) {
for (k = 0; k < CLASS0_SIZE; ++k) {
vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- prob->comps[i].class0_fp[k],
branch_ct_class0_fp[i][k],
nmv_count->comps[i].class0_fp[k], 0);
}
vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- prob->comps[i].fp,
branch_ct_fp[i],
nmv_count->comps[i].fp, 0);
}
@@ -202,11 +195,9 @@ static void counts_to_nmv_context(
const uint32_t hp0 = nmv_count->comps[i].hp[0];
const uint32_t hp1 = nmv_count->comps[i].hp[1];
- prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
branch_ct_class0_hp[i][0] = c0_hp0;
branch_ct_class0_hp[i][1] = c0_hp1;
- prob->comps[i].hp = get_binary_prob(hp0, hp1);
branch_ct_hp[i][0] = hp0;
branch_ct_hp[i][1] = hp1;
}
@@ -215,7 +206,6 @@ static void counts_to_nmv_context(
void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
int i, j;
- nmv_context prob;
unsigned int branch_ct_joint[MV_JOINTS - 1][2];
unsigned int branch_ct_sign[2][2];
unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
@@ -227,30 +217,28 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
unsigned int branch_ct_hp[2][2];
nmv_context *mvc = &cpi->common.fc.nmvc;
- counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
+ counts_to_nmv_context(&cpi->NMVcount, usehp,
branch_ct_joint, branch_ct_sign, branch_ct_classes,
branch_ct_class0, branch_ct_bits,
branch_ct_class0_fp, branch_ct_fp,
branch_ct_class0_hp, branch_ct_hp);
for (j = 0; j < MV_JOINTS - 1; ++j)
- update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j],
- NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_joint[j], &mvc->joints[j], NMV_UPDATE_PROB);
for (i = 0; i < 2; ++i) {
- update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign,
- prob.comps[i].sign, NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, NMV_UPDATE_PROB);
for (j = 0; j < MV_CLASSES - 1; ++j)
update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j],
- prob.comps[i].classes[j], NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
for (j = 0; j < CLASS0_SIZE - 1; ++j)
update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j],
- prob.comps[i].class0[j], NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
for (j = 0; j < MV_OFFSET_BITS; ++j)
update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j],
- prob.comps[i].bits[j], NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
}
for (i = 0; i < 2; ++i) {
@@ -258,21 +246,19 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
int k;
for (k = 0; k < 3; ++k)
update_mv(bc, branch_ct_class0_fp[i][j][k],
- &mvc->comps[i].class0_fp[j][k],
- prob.comps[i].class0_fp[j][k], NMV_UPDATE_PROB);
+ &mvc->comps[i].class0_fp[j][k], NMV_UPDATE_PROB);
}
for (j = 0; j < 3; ++j)
- update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j],
- prob.comps[i].fp[j], NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], NMV_UPDATE_PROB);
}
if (usehp) {
for (i = 0; i < 2; ++i) {
update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp,
- prob.comps[i].class0_hp, NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp,
- prob.comps[i].hp, NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
}
}
}
@@ -314,44 +300,34 @@ void vp9_build_nmv_cost_table(int *mvjoint,
build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp);
}
-void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x,
- int_mv *best_ref_mv, int_mv *second_best_ref_mv) {
+static void inc_mvs(int_mv mv[2], int_mv ref[2], int is_compound,
+ nmv_context_counts *counts) {
+ int i;
+ for (i = 0; i < 1 + is_compound; ++i) {
+ const MV diff = { mv[i].as_mv.row - ref[i].as_mv.row,
+ mv[i].as_mv.col - ref[i].as_mv.col };
+ vp9_inc_mv(&diff, counts);
+ }
+}
+
+void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) {
MODE_INFO *mi = x->e_mbd.mi_8x8[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
- MV diff;
- const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
- const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
- int idx, idy;
+ const int is_compound = has_second_ref(mbmi);
if (mbmi->sb_type < BLOCK_8X8) {
- PARTITION_INFO *pi = x->partition_info;
- for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
- for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi->sb_type];
+ int idx, idy;
+
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
const int i = idy * 2 + idx;
- if (pi->bmi[i].mode == NEWMV) {
- diff.row = mi->bmi[i].as_mv[0].as_mv.row - best_ref_mv->as_mv.row;
- diff.col = mi->bmi[i].as_mv[0].as_mv.col - best_ref_mv->as_mv.col;
- vp9_inc_mv(&diff, &cpi->NMVcount);
-
- if (mi->mbmi.ref_frame[1] > INTRA_FRAME) {
- diff.row = mi->bmi[i].as_mv[1].as_mv.row -
- second_best_ref_mv->as_mv.row;
- diff.col = mi->bmi[i].as_mv[1].as_mv.col -
- second_best_ref_mv->as_mv.col;
- vp9_inc_mv(&diff, &cpi->NMVcount);
- }
- }
+ if (mi->bmi[i].as_mode == NEWMV)
+ inc_mvs(mi->bmi[i].as_mv, best_ref_mv, is_compound, &cpi->NMVcount);
}
}
} else if (mbmi->mode == NEWMV) {
- diff.row = mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row;
- diff.col = mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col;
- vp9_inc_mv(&diff, &cpi->NMVcount);
-
- if (mbmi->ref_frame[1] > INTRA_FRAME) {
- diff.row = mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row;
- diff.col = mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col;
- vp9_inc_mv(&diff, &cpi->NMVcount);
- }
+ inc_mvs(mbmi->mv, best_ref_mv, is_compound, &cpi->NMVcount);
}
}
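Worked example for the new helper (illustrative values only): a compound block with motion vectors (12,-5) and (3,7), coded against predictors (8,-4) and (0,0), feeds the residuals (4,-1) and (3,7) into the NMV counts:

int_mv mv[2], ref[2];
mv[0].as_mv.row = 12;  mv[0].as_mv.col = -5;
mv[1].as_mv.row = 3;   mv[1].as_mv.col = 7;
ref[0].as_mv.row = 8;  ref[0].as_mv.col = -4;
ref[1].as_mv.row = 0;  ref[1].as_mv.col = 0;
inc_mvs(mv, ref, 1 /* is_compound */, &cpi->NMVcount);  /* counts (4,-1), (3,7) */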
diff --git a/libvpx/vp9/encoder/vp9_encodemv.h b/libvpx/vp9/encoder/vp9_encodemv.h
index 2789ce1..6331778 100644
--- a/libvpx/vp9/encoder/vp9_encodemv.h
+++ b/libvpx/vp9/encoder/vp9_encodemv.h
@@ -25,7 +25,7 @@ void vp9_build_nmv_cost_table(int *mvjoint,
int usehp,
int mvc_flag_v,
int mvc_flag_h);
-void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x,
- int_mv *best_ref_mv, int_mv *second_best_ref_mv);
+
+void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]);
#endif // VP9_ENCODER_VP9_ENCODEMV_H_
diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c
index 9cf7b83..6a3555d 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/libvpx/vp9/encoder/vp9_firstpass.c
@@ -8,8 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "math.h"
-#include "limits.h"
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
#include "vp9/encoder/vp9_block.h"
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/encoder/vp9_variance.h"
@@ -23,13 +24,13 @@
#include "vp9/common/vp9_systemdependent.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/yv12config.h"
-#include <stdio.h>
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_entropymv.h"
#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_vaq.h"
#include "./vpx_scale_rtcd.h"
// TODO(jkoleszar): for setup_dst_planes
#include "vp9/common/vp9_reconinter.h"
@@ -77,7 +78,8 @@ static int select_cq_level(int qindex) {
}
-// Resets the first pass file to the given position using a relative seek from the current position
+// Resets the first pass file to the given position using a relative seek from
+// the current position.
static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *position) {
cpi->twopass.stats_in = position;
}
@@ -250,8 +252,10 @@ static void avg_stats(FIRSTPASS_STATS *section) {
section->duration /= section->count;
}
-// Calculate a modified Error used in distributing bits between easier and harder frames
-static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+// Calculate a modified Error used in distributing bits between easier and
+// harder frames.
+static double calculate_modified_err(VP9_COMP *cpi,
+ FIRSTPASS_STATS *this_frame) {
const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats;
const double av_err = stats->ssim_weighted_pred_err / stats->count;
const double this_err = this_frame->ssim_weighted_pred_err;
@@ -260,38 +264,43 @@ static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame)
}
static const double weight_table[256] = {
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
- 0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
- 0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
- 0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.031250, 0.062500,
+ 0.093750, 0.125000, 0.156250, 0.187500, 0.218750, 0.250000, 0.281250,
+ 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750, 0.500000,
+ 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
+ 0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500,
+ 0.968750, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000
};
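Note: the reformatted table above tabulates a simple piecewise ramp over the 8-bit luma level: 0.02 up to level 32, a linear ramp through level 63, then 1.0. Equivalent closed form (a sketch, for reference only):

static double pixel_weight(int level) {  /* level in [0, 255] */
  if (level <= 32)
    return 0.020000;             /* near-black pixels get minimal weight */
  if (level < 64)
    return (level - 32) / 32.0;  /* 0.031250 at 33 ... 0.968750 at 63 */
  return 1.000000;
}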
static double simple_weight(YV12_BUFFER_CONFIG *source) {
@@ -300,7 +309,8 @@ static double simple_weight(YV12_BUFFER_CONFIG *source) {
uint8_t *src = source->y_buffer;
double sum_weights = 0.0;
- // Loop throught the Y plane raw examining levels and creating a weight for the image
+ // Loop through the Y plane examining levels and creating a weight for
+ // the image.
i = source->y_height;
do {
j = source->y_width;
@@ -340,13 +350,15 @@ void vp9_end_first_pass(VP9_COMP *cpi) {
output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
}
-static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) {
+static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+ YV12_BUFFER_CONFIG *recon_buffer,
+ int *best_motion_err, int recon_yoffset) {
MACROBLOCKD *const xd = &x->e_mbd;
// Set up pointers for this macro block recon buffer
xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset;
- switch (xd->this_mi->mbmi.sb_type) {
+ switch (xd->mi_8x8[0]->mbmi.sb_type) {
case BLOCK_8X8:
vp9_mse8x8(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
@@ -385,7 +397,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
int n;
vp9_variance_fn_ptr_t v_fn_ptr =
- cpi->fn_ptr[xd->this_mi->mbmi.sb_type];
+ cpi->fn_ptr[xd->mi_8x8[0]->mbmi.sb_type];
int new_mv_mode_penalty = 256;
int sr = 0;
@@ -402,7 +414,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
further_steps -= sr;
// override the default variance function to use MSE
- switch (xd->this_mi->mbmi.sb_type) {
+ switch (xd->mi_8x8[0]->mbmi.sb_type) {
case BLOCK_8X8:
v_fn_ptr.vf = vp9_mse8x8;
break;
@@ -444,9 +456,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
while (n < further_steps) {
n++;
- if (num00)
+ if (num00) {
num00--;
- else {
+ } else {
tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
step_param + n, x->sadperbit16,
&num00, &v_fn_ptr,
@@ -469,13 +481,14 @@ void vp9_first_pass(VP9_COMP *cpi) {
MACROBLOCK *const x = &cpi->mb;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
+ TileInfo tile;
int recon_yoffset, recon_uvoffset;
const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx];
const int gld_yv12_idx = cm->ref_frame_map[cpi->gld_fb_idx];
YV12_BUFFER_CONFIG *const lst_yv12 = &cm->yv12_fb[lst_yv12_idx];
- YV12_BUFFER_CONFIG *const new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
YV12_BUFFER_CONFIG *const gld_yv12 = &cm->yv12_fb[gld_yv12_idx];
+ YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
const int recon_y_stride = lst_yv12->y_stride;
const int recon_uv_stride = lst_yv12->uv_stride;
int64_t intra_error = 0;
@@ -504,12 +517,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL);
setup_dst_planes(xd, new_yv12, 0, 0);
- x->partition_info = x->pi;
xd->mi_8x8 = cm->mi_grid_visible;
// required for vp9_frame_init_quantizer
- xd->this_mi =
xd->mi_8x8[0] = cm->mi;
- xd->mic_stream_ptr = cm->mi;
setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
@@ -520,9 +530,12 @@ void vp9_first_pass(VP9_COMP *cpi) {
// if ( 0 )
{
vp9_init_mv_probs(cm);
- vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q);
+ vp9_initialize_rd_consts(cpi);
}
+ // tiling is ignored in the first pass
+ vp9_tile_init(&tile, cm, 0, 0);
+
// for each macroblock row in image
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
int_mv best_ref_mv;
@@ -534,16 +547,21 @@ void vp9_first_pass(VP9_COMP *cpi) {
recon_yoffset = (mb_row * recon_y_stride * 16);
recon_uvoffset = (mb_row * recon_uv_stride * 8);
- // Set up limit values for motion vectors to prevent them extending outside the UMV borders
- x->mv_row_min = -((mb_row * 16) + (VP9BORDERINPIXELS - 8));
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders
+ x->mv_row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
- + (VP9BORDERINPIXELS - 8);
+ + BORDER_MV_PIXELS_B16;
// for each macroblock col in image
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
int this_error;
int gf_motion_error = INT_MAX;
int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+ double error_weight;
+
+ vp9_clear_system_state(); // __asm emms;
+ error_weight = 1.0; // avoid uninitialized warnings
xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
@@ -552,40 +570,54 @@ void vp9_first_pass(VP9_COMP *cpi) {
if (mb_col * 2 + 1 < cm->mi_cols) {
if (mb_row * 2 + 1 < cm->mi_rows) {
- xd->this_mi->mbmi.sb_type = BLOCK_16X16;
+ xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X16;
} else {
- xd->this_mi->mbmi.sb_type = BLOCK_16X8;
+ xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X8;
}
} else {
if (mb_row * 2 + 1 < cm->mi_rows) {
- xd->this_mi->mbmi.sb_type = BLOCK_8X16;
+ xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X16;
} else {
- xd->this_mi->mbmi.sb_type = BLOCK_8X8;
+ xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X8;
}
}
- xd->this_mi->mbmi.ref_frame[0] = INTRA_FRAME;
- set_mi_row_col(cm, xd,
+ xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+ set_mi_row_col(xd, &tile,
mb_row << 1,
- 1 << mi_height_log2(xd->this_mi->mbmi.sb_type),
+ num_8x8_blocks_high_lookup[xd->mi_8x8[0]->mbmi.sb_type],
mb_col << 1,
- 1 << mi_height_log2(xd->this_mi->mbmi.sb_type));
+ num_8x8_blocks_wide_lookup[xd->mi_8x8[0]->mbmi.sb_type],
+ cm->mi_rows, cm->mi_cols);
+
+ if (cpi->sf.variance_adaptive_quantization) {
+ int energy = vp9_block_energy(cpi, x, xd->mi_8x8[0]->mbmi.sb_type);
+ error_weight = vp9_vaq_inv_q_ratio(energy);
+ }
// do intra 16x16 prediction
this_error = vp9_encode_intra(x, use_dc_pred);
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ this_error *= error_weight;
+ }
- // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
- // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
- // When the error score is very low this causes us to pick all or lots of INTRA modes and throw lots of key frames.
+ // intrapenalty below deals with situations where the intra and inter
+ // error scores are very low (eg a plain black frame).
+ // We do not have special cases in first pass for 0,0 and nearest etc so
+ // all inter modes carry an overhead cost estimate for the mv.
+ // When the error score is very low this causes us to pick all or lots of
+ // INTRA modes and throw lots of key frames.
// This penalty adds a cost matching that of a 0,0 mv to the intra case.
this_error += intrapenalty;
// Cumulative intra error total
intra_error += (int64_t)this_error;
- // Set up limit values for motion vectors to prevent them extending outside the UMV borders
- x->mv_col_min = -((mb_col * 16) + (VP9BORDERINPIXELS - 8));
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
- + (VP9BORDERINPIXELS - 8);
+ + BORDER_MV_PIXELS_B16;
// Other than for the first frame do a motion search
if (cm->current_video_frame > 0) {
@@ -602,12 +634,21 @@ void vp9_first_pass(VP9_COMP *cpi) {
first_pass_motion_search(cpi, x, &best_ref_mv,
&mv.as_mv, lst_yv12,
&motion_error, recon_yoffset);
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ motion_error *= error_weight;
+ }
- // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
+ // If the current best reference mv is not centered on 0,0 then do a 0,0
+ // based search as well.
if (best_ref_mv.as_int) {
tmp_err = INT_MAX;
first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv,
lst_yv12, &tmp_err, recon_yoffset);
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ tmp_err *= error_weight;
+ }
if (tmp_err < motion_error) {
motion_error = tmp_err;
@@ -624,6 +665,10 @@ void vp9_first_pass(VP9_COMP *cpi) {
first_pass_motion_search(cpi, x, &zero_ref_mv,
&tmp_mv.as_mv, gld_yv12,
&gf_motion_error, recon_yoffset);
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ gf_motion_error *= error_weight;
+ }
if ((gf_motion_error < motion_error) &&
(gf_motion_error < this_error)) {
@@ -643,9 +688,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
sr_coded_error += gf_motion_error;
else
sr_coded_error += this_error;
- } else
+ } else {
sr_coded_error += motion_error;
-
+ }
/* Intra assumed best */
best_ref_mv.as_int = 0;
@@ -660,17 +705,17 @@ void vp9_first_pass(VP9_COMP *cpi) {
neutral_count++;
}
- mv.as_mv.row <<= 3;
- mv.as_mv.col <<= 3;
+ mv.as_mv.row *= 8;
+ mv.as_mv.col *= 8;
this_error = motion_error;
vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
- xd->this_mi->mbmi.tx_size = TX_4X4;
- xd->this_mi->mbmi.ref_frame[0] = LAST_FRAME;
- xd->this_mi->mbmi.ref_frame[1] = NONE;
+ xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
+ xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME;
+ xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE;
vp9_build_inter_predictors_sby(xd, mb_row << 1,
mb_col << 1,
- xd->this_mi->mbmi.sb_type);
- vp9_encode_sby(x, xd->this_mi->mbmi.sb_type);
+ xd->mi_8x8[0]->mbmi.sb_type);
+ vp9_encode_sby(x, xd->mi_8x8[0]->mbmi.sb_type);
sum_mvr += mv.as_mv.row;
sum_mvr_abs += abs(mv.as_mv.row);
sum_mvc += mv.as_mv.col;
@@ -717,9 +762,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
}
}
}
- } else
+ } else {
sr_coded_error += (int64_t)this_error;
-
+ }
coded_error += (int64_t)this_error;
// adjust to the next column of macroblocks
@@ -778,16 +823,19 @@ void vp9_first_pass(VP9_COMP *cpi) {
fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
fps.MVc = (double)sum_mvc / (double)mvcount;
fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
- fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
- fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
+ fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) /
+ (double)mvcount;
+ fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) /
+ (double)mvcount;
fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
fps.new_mv_count = new_mv_count;
fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
}
- // TODO: handle the case when duration is set to 0, or something less
- // than the full time between subsequent values of cpi->source_time_stamp.
+ // TODO(paulwilkins): Handle the case when duration is set to 0, or
+ // something less than the full time between subsequent values of
+ // cpi->source_time_stamp.
fps.duration = (double)(cpi->source->ts_end
- cpi->source->ts_start);
@@ -807,15 +855,16 @@ void vp9_first_pass(VP9_COMP *cpi) {
2.0))) {
vp8_yv12_copy_frame(lst_yv12, gld_yv12);
cpi->twopass.sr_update_lag = 1;
- } else
+ } else {
cpi->twopass.sr_update_lag++;
-
+ }
// swap frame pointers so last frame refers to the frame we just compressed
swap_yv12(lst_yv12, new_yv12);
vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y);
- // Special case for the first frame. Copy into the GF buffer as a second reference.
+ // Special case for the first frame. Copy into the GF buffer as a second
+ // reference.
if (cm->current_video_frame == 0)
vp8_yv12_copy_frame(lst_yv12, gld_yv12);
@@ -823,7 +872,8 @@ void vp9_first_pass(VP9_COMP *cpi) {
if (0) {
char filename[512];
FILE *recon_file;
- sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+ snprintf(filename, sizeof(filename), "enc%04d.yuv",
+ (int)cm->current_video_frame);
if (cm->current_video_frame == 0)
recon_file = fopen(filename, "wb");
@@ -835,7 +885,6 @@ void vp9_first_pass(VP9_COMP *cpi) {
}
cm->current_video_frame++;
-
}
// Estimate a cost per mb attributable to overheads such as the coding of
@@ -878,7 +927,7 @@ static int64_t estimate_modemvcost(VP9_COMP *cpi,
(av_intra * intra_cost)) * cpi->common.MBs) << 9;
// return mv_cost + mode_cost;
- // TODO PGW Fix overhead costs for extended Q range
+ // TODO(paulwilkins): Fix overhead costs for extended Q range.
#endif
return 0;
}
@@ -1102,8 +1151,8 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
FIRSTPASS_STATS *start_pos;
double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
- double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
- * cpi->oxcf.two_pass_vbrmin_section / 100);
+ double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth *
+ cpi->oxcf.two_pass_vbrmin_section / 100);
if (two_pass_min_rate < lower_bounds_min_rate)
two_pass_min_rate = lower_bounds_min_rate;
@@ -1141,15 +1190,17 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
// This variable monitors how far behind the second ref update is lagging
cpi->twopass.sr_update_lag = 1;
- // Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence
+ // Scan the first pass file and calculate an average Intra / Inter error score
+ // ratio for the sequence.
{
double sum_iiratio = 0.0;
double IIRatio;
- start_pos = cpi->twopass.stats_in; // Note starting "file" position
+ start_pos = cpi->twopass.stats_in; // Note the starting "file" position.
while (input_stats(cpi, &this_frame) != EOF) {
- IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
+ IIRatio = this_frame.intra_error
+ / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
sum_iiratio += IIRatio;
}
@@ -1161,21 +1212,21 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
reset_fpf_position(cpi, start_pos);
}
- // Scan the first pass file and calculate a modified total error based upon the bias/power function
- // used to allocate bits
+ // Scan the first pass file and calculate a modified total error based upon
+ // the bias/power function used to allocate bits.
{
- start_pos = cpi->twopass.stats_in; // Note starting "file" position
+ start_pos = cpi->twopass.stats_in; // Note starting "file" position
cpi->twopass.modified_error_total = 0.0;
cpi->twopass.modified_error_used = 0.0;
while (input_stats(cpi, &this_frame) != EOF) {
- cpi->twopass.modified_error_total += calculate_modified_err(cpi, &this_frame);
+ cpi->twopass.modified_error_total +=
+ calculate_modified_err(cpi, &this_frame);
}
cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
- reset_fpf_position(cpi, start_pos); // Reset file position
-
+ reset_fpf_position(cpi, start_pos); // Reset file position
}
}
@@ -1321,7 +1372,6 @@ static void accumulate_frame_motion_stats(
(this_frame_mvc_ratio < this_frame->mvc_abs)
? (this_frame_mvc_ratio * motion_pct)
: this_frame->mvc_abs * motion_pct;
-
}
}
@@ -1380,7 +1430,8 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
// Update the motion related elements to the boost calculation
accumulate_frame_motion_stats(&this_frame,
&this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+ &abs_mv_in_out_accumulator,
+ &mv_ratio_accumulator);
// We want to discount the flash frame itself and the recovery
// frame that follows as both will have poor scores.
@@ -1416,7 +1467,8 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
// Update the motion related elements to the boost calculation
accumulate_frame_motion_stats(&this_frame,
&this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+ &abs_mv_in_out_accumulator,
+ &mv_ratio_accumulator);
// We want to discount the flash frame itself and the recovery
// frame that follows as both will have poor scores.
@@ -1432,7 +1484,6 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
boost_score += (decay_accumulator *
calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
-
}
*b_boost = (int)boost_score;
@@ -1666,7 +1717,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Update the motion related elements to the boost calculation
accumulate_frame_motion_stats(&next_frame,
&this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+ &abs_mv_in_out_accumulator,
+ &mv_ratio_accumulator);
// Cumulative effect of prediction quality decay
if (!flash_detected) {
@@ -1709,8 +1761,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
(abs_mv_in_out_accumulator > 3.0) ||
(mv_in_out_accumulator < -2.0) ||
- ((boost_score - old_boost_score) < IIFACTOR))
- )) {
+ ((boost_score - old_boost_score) < IIFACTOR)))) {
boost_score = old_boost_score;
break;
}
@@ -1764,7 +1815,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
(mv_in_out_accumulator > -2.0)) &&
(boost_score > 100)) {
// Alternative boost calculation for alt ref
- cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
+ cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
+ &b_boost);
cpi->source_alt_ref_pending = 1;
#if CONFIG_MULTIPLE_ARF
@@ -1841,9 +1893,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
cpi->twopass.gf_group_bits =
(int64_t)(cpi->twopass.kf_group_bits *
(gf_group_err / cpi->twopass.kf_group_error_left));
- } else
+ } else {
cpi->twopass.gf_group_bits = 0;
-
+ }
cpi->twopass.gf_group_bits =
(cpi->twopass.gf_group_bits < 0)
? 0
@@ -1907,11 +1959,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (gf_bits > alt_gf_bits)
gf_bits = alt_gf_bits;
- }
- // Else if it is harder than other frames in the group make sure it at
- // least receives an allocation in keeping with its relative error
- // score, otherwise it may be worse off than an "un-boosted" frame
- else {
+ } else {
+ // If it is harder than other frames in the group make sure it at
+ // least receives an allocation in keeping with its relative error
+ // score, otherwise it may be worse off than an "un-boosted" frame.
int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits *
mod_frame_err /
DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
@@ -2023,9 +2074,9 @@ static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Clip target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at
// the top end.
- if (target_frame_size < 0)
+ if (target_frame_size < 0) {
target_frame_size = 0;
- else {
+ } else {
if (target_frame_size > max_bits)
target_frame_size = max_bits;
@@ -2093,14 +2144,19 @@ void vp9_second_pass(VP9_COMP *cpi) {
cpi->twopass.est_max_qcorrection_factor = 1.0;
// Set a cq_level in constrained quality mode.
+ // Commenting this code out for now since it does not seem to be
+ // working well.
+ /*
if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
- section_target_bandwidth);
+ section_target_bandwidth);
- cpi->cq_target_quality = cpi->oxcf.cq_level;
if (est_cq > cpi->cq_target_quality)
cpi->cq_target_quality = est_cq;
+ else
+ cpi->cq_target_quality = cpi->oxcf.cq_level;
}
+ */
// guess at maxq needed in 2nd pass
cpi->twopass.maxq_max_limit = cpi->worst_quality;
@@ -2113,17 +2169,14 @@ void vp9_second_pass(VP9_COMP *cpi) {
cpi->ni_av_qi = tmp_q;
cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
-#ifndef ONE_SHOT_Q_ESTIMATE
// Limit the maxq value returned subsequently.
// This increases the risk of overspend or underspend if the initial
// estimate for the clip is bad, but helps prevent excessive
// variation in Q, especially near the end of a clip
// where for example a small overspend may cause Q to crash
adjust_maxq_qrange(cpi);
-#endif
}
-#ifndef ONE_SHOT_Q_ESTIMATE
// The last few frames of a clip almost always have too few or too many
// bits and for the sake of overly exact rate control we don't want to make
// radical adjustments to the allowed quantizer range just to use up a
@@ -2146,7 +2199,6 @@ void vp9_second_pass(VP9_COMP *cpi) {
cpi->active_worst_quality =
adjust_active_maxq(cpi->active_worst_quality, tmp_q);
}
-#endif
}
vp9_zero(this_frame);
if (EOF == input_stats(cpi, &this_frame))
@@ -2243,16 +2295,17 @@ static int test_candidate_kf(VP9_COMP *cpi,
if ((this_frame->pcnt_second_ref < 0.10) &&
(next_frame->pcnt_second_ref < 0.10) &&
((this_frame->pcnt_inter < 0.05) ||
- (
- ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
- ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
- ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
- (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
- ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
- )
- )
- )
- ) {
+ (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
+ ((this_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+ ((fabs(last_frame->coded_error - this_frame->coded_error) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+ .40) ||
+ (fabs(last_frame->intra_error - this_frame->intra_error) /
+ DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+ .40) ||
+ ((next_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) {
int i;
FIRSTPASS_STATS *start_pos;
@@ -2270,7 +2323,8 @@ static int test_candidate_kf(VP9_COMP *cpi,
// Examine how well the key frame predicts subsequent frames
for (i = 0; i < 16; i++) {
- next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+ next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error /
+ DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
if (next_iiratio > RMAX)
next_iiratio = RMAX;
@@ -2279,7 +2333,8 @@ static int test_candidate_kf(VP9_COMP *cpi,
if (local_next_frame.pcnt_inter > 0.85)
decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
else
- decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+ decay_accumulator =
+ decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
// decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
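Note, with illustrative numbers: a frame with pcnt_inter = 0.90 scales decay_accumulator by 0.90 directly, while one with pcnt_inter = 0.60 scales it by (0.85 + 0.60) / 2 = 0.725 rather than 0.60, so the averaging softens the decay for frames with little inter coverage.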
@@ -2307,9 +2362,9 @@ static int test_candidate_kf(VP9_COMP *cpi,
// If there is tolerable prediction for at least the next 3 frames then
// break out else discard this potential key frame and move on
- if (boost_score > 30.0 && (i > 3))
+ if (boost_score > 30.0 && (i > 3)) {
is_viable_kf = 1;
- else {
+ } else {
// Reset the file position
reset_fpf_position(cpi, start_pos);
@@ -2369,8 +2424,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Accumulate kf group error
kf_group_err += calculate_modified_err(cpi, this_frame);
- // These figures keep intra and coded error counts for all frames including key frames in the group.
- // The effect of the key frame itself can be subtracted out using the first_frame data collected above
+ // These figures keep intra and coded error counts for all frames including
+ // key frames in the group. The effect of the key frame itself can be
+ // subtracted out using the first_frame data collected above.
kf_group_intra_err += this_frame->intra_error;
kf_group_coded_err += this_frame->coded_error;
@@ -2410,9 +2466,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// forcekeyframeevery intervals then break out of the loop.
if (cpi->twopass.frames_to_key >= 2 * (int)cpi->key_frame_frequency)
break;
- } else
+ } else {
cpi->twopass.frames_to_key++;
-
+ }
i++;
}
@@ -2452,22 +2508,24 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
reset_fpf_position(cpi, current_pos);
cpi->next_key_frame_forced = 1;
- } else
+ } else {
cpi->next_key_frame_forced = 0;
-
+ }
// Special case for the last frame of the file
if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
// Accumulate kf group error
kf_group_err += calculate_modified_err(cpi, this_frame);
- // These figures keep intra and coded error counts for all frames including key frames in the group.
- // The effect of the key frame itself can be subtracted out using the first_frame data collected above
+ // These figures keep intra and coded error counts for all frames including
+ // key frames in the group. The effect of the key frame itself can be
+ // subtracted out using the first_frame data collected above.
kf_group_intra_err += this_frame->intra_error;
kf_group_coded_err += this_frame->coded_error;
}
// Calculate the number of bits that should be assigned to the kf group.
- if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0)) {
+ if ((cpi->twopass.bits_left > 0) &&
+ (cpi->twopass.modified_error_left > 0.0)) {
// Max for a single normal frame (not key frame)
int max_bits = frame_max_bits(cpi);
@@ -2484,13 +2542,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
if (cpi->twopass.kf_group_bits > max_grp_bits)
cpi->twopass.kf_group_bits = max_grp_bits;
- } else
+ } else {
cpi->twopass.kf_group_bits = 0;
-
+ }
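Read together, the allocation logic in this hunk amounts to: give the key-frame group a share of the remaining bits proportional to its share of the remaining modified error, capped at the per-frame maximum times the group length, and zero if either pool is exhausted. A hedged sketch; the proportional-share line is inferred from the surrounding context, since the diff elides it:

    static int64_t kf_group_bit_allocation(int64_t bits_left,
                                           double kf_group_err,
                                           double modified_error_left,
                                           int max_bits, int frames_to_key) {
      int64_t bits, max_grp_bits;
      if (bits_left <= 0 || modified_error_left <= 0.0)
        return 0;
      /* Proportional share of the remaining bit pool (inferred step). */
      bits = (int64_t)(bits_left * (kf_group_err / modified_error_left));
      /* Cap: no more than the per-frame maximum times the group length. */
      max_grp_bits = (int64_t)max_bits * (int64_t)frames_to_key;
      return bits > max_grp_bits ? max_grp_bits : bits;
    }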
// Reset the first pass file position
reset_fpf_position(cpi, start_position);
- // determine how big to make this keyframe based on how well the subsequent frames use inter blocks
+ // Determine how big to make this keyframe based on how well the subsequent
+ // frames use inter blocks.
decay_accumulator = 1.0;
boost_score = 0.0;
loop_decay_rate = 1.00; // Starting decay rate
@@ -2563,7 +2622,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (kf_boost < (cpi->twopass.frames_to_key * 3))
kf_boost = (cpi->twopass.frames_to_key * 3);
- if (kf_boost < 300) // Min KF boost
+ if (kf_boost < 300) // Min KF boost
kf_boost = 300;
// Make a note of baseline boost and the zero motion
@@ -2598,10 +2657,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
allocation_chunks /= divisor;
}
- cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
+ cpi->twopass.kf_group_bits =
+ (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
// Calculate the number of bits to be spent on the key frame
- cpi->twopass.kf_bits = (int)((double)kf_boost * ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
+ cpi->twopass.kf_bits =
+ (int)((double)kf_boost *
+ ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
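As a worked example of the formula just rewrapped (numbers assumed): with kf_boost = 300, allocation_chunks = 2400, and kf_group_bits = 1,200,000, the key frame receives 300 * (1200000 / 2400) = 150,000 bits; each boost unit converts directly into one chunk of the group's bit pool.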
// If the key frame is actually easier than the average for the
// kf group (which does sometimes happen... eg a blank intro frame)
@@ -2619,11 +2681,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (cpi->twopass.kf_bits > alt_kf_bits) {
cpi->twopass.kf_bits = alt_kf_bits;
}
- }
+ } else {
// Else if it is much harder than other frames in the group make sure
// it at least receives an allocation in keeping with its relative
// error score
- else {
alt_kf_bits =
(int)((double)cpi->twopass.bits_left *
(kf_mod_err /
@@ -2649,6 +2710,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
// Adjust the count of total modified error left.
- // The count of bits left is adjusted elsewhere based on real coded frame sizes
+ // The count of bits left is adjusted elsewhere based on real coded frame
+ // sizes.
cpi->twopass.modified_error_left -= kf_group_err;
}
diff --git a/libvpx/vp9/encoder/vp9_firstpass.h b/libvpx/vp9/encoder/vp9_firstpass.h
index 2296a66..c18d11e 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.h
+++ b/libvpx/vp9/encoder/vp9_firstpass.h
@@ -10,6 +10,7 @@
#ifndef VP9_ENCODER_VP9_FIRSTPASS_H_
#define VP9_ENCODER_VP9_FIRSTPASS_H_
+#include "vp9/encoder/vp9_onyx_int.h"
void vp9_init_first_pass(VP9_COMP *cpi);
void vp9_first_pass(VP9_COMP *cpi);
diff --git a/libvpx/vp9/encoder/vp9_lookahead.c b/libvpx/vp9/encoder/vp9_lookahead.c
index 81445a9..c28c868 100644
--- a/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/libvpx/vp9/encoder/vp9_lookahead.c
@@ -10,7 +10,7 @@
#include <assert.h>
#include <stdlib.h>
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_lookahead.h"
#include "vp9/common/vp9_extend.h"
@@ -77,7 +77,7 @@ struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
goto bail;
}
return ctx;
-bail:
+ bail:
vp9_lookahead_destroy(ctx);
return NULL;
}
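A side note on the include change above: the "./vpx_config.h" spelling anchors the include to the generated configuration header in the build output directory rather than whatever vpx_config.h happens to be first on the include path. That reading is inferred from libvpx convention rather than stated in the patch:

    #include "./vpx_config.h"           /* generated at configure time */
    #include "vp9/common/vp9_common.h"  /* ordinary source-tree include */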
diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c
index 5a671f2..7b605b2 100644
--- a/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -10,14 +10,17 @@
#include <limits.h>
-#include <vpx_mem/vpx_mem.h>
-#include <vp9/encoder/vp9_encodeintra.h>
-#include <vp9/encoder/vp9_rdopt.h>
-#include <vp9/common/vp9_blockd.h>
-#include <vp9/common/vp9_reconinter.h>
-#include <vp9/common/vp9_reconintra.h>
-#include <vp9/common/vp9_systemdependent.h>
-#include <vp9/encoder/vp9_segmentation.h>
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/encoder/vp9_encodeintra.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_systemdependent.h"
+
+
static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
int_mv *ref_mv,
@@ -46,9 +49,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
/*cpi->sf.search_method == HEX*/
- best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit,
+ best_err = vp9_hex_search(x, &ref_full.as_mv, step_param, x->errorperbit,
0, &v_fn_ptr,
- 0, ref_mv, dst_mv);
+ 0, &ref_mv->as_mv, &dst_mv->as_mv);
// Try sub-pixel MC
// if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -57,7 +60,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
unsigned int sse;
best_err = cpi->find_fractional_mv_step(
x,
- dst_mv, ref_mv,
+ &dst_mv->as_mv, &ref_mv->as_mv,
+ cpi->common.allow_high_precision_mv,
x->errorperbit, &v_fn_ptr,
0, cpi->sf.subpel_iters_per_step, NULL, NULL,
& distortion, &sse);
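Since the diff fragments the call, here it is assembled in one place; note that high-precision-MV permission is now passed explicitly (cpi->common.allow_high_precision_mv) rather than read from MACROBLOCKD inside the search. This restates the hunk, it is not new behavior:

    best_err = cpi->find_fractional_mv_step(
        x,
        &dst_mv->as_mv, &ref_mv->as_mv,
        cpi->common.allow_high_precision_mv,
        x->errorperbit, &v_fn_ptr,
        0, cpi->sf.subpel_iters_per_step,
        NULL, NULL,
        &distortion, &sse);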
@@ -100,7 +104,8 @@ static int do_16x16_motion_search(VP9_COMP *cpi, int_mv *ref_mv, int_mv *dst_mv,
dst_mv->as_int = tmp_mv.as_int;
}
- // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
+ // If the current best reference mv is not centered on 0,0 then do a 0,0
+ // based search as well.
if (ref_mv->as_int) {
unsigned int tmp_err;
int_mv zero_ref_mv, tmp_mv;
@@ -145,7 +150,7 @@ static int find_best_16x16_intra(VP9_COMP *cpi,
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
unsigned int err;
- xd->this_mi->mbmi.mode = mode;
+ xd->mi_8x8[0]->mbmi.mode = mode;
vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode,
x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].dst.buf, xd->plane[0].dst.stride);
@@ -189,8 +194,8 @@ static void update_mbgraph_mb_stats
x->plane[0].src.buf = buf->y_buffer + mb_y_offset;
x->plane[0].src.stride = buf->y_stride;
- xd->plane[0].dst.buf = cm->yv12_fb[cm->new_fb_idx].y_buffer + mb_y_offset;
- xd->plane[0].dst.stride = cm->yv12_fb[cm->new_fb_idx].y_stride;
+ xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset;
+ xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride;
// do intra 16x16 prediction
intra_error = find_best_16x16_intra(cpi, mb_y_offset,
@@ -214,7 +219,8 @@ static void update_mbgraph_mb_stats
stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
}
- // Alt-ref frame MV search, if it exists and is different than last/golden frame
+  // Do an Alt-ref frame MV search, if it exists and differs from the
+  // last/golden frame.
if (alt_ref) {
int a_motion_error;
xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset;
@@ -243,17 +249,17 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
int_mv arf_top_mv, gld_top_mv;
MODE_INFO mi_local = { { 0 } };
- // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+ // Set up limit values for motion vectors to prevent them extending outside
+ // the UMV borders.
arf_top_mv.as_int = 0;
gld_top_mv.as_int = 0;
- x->mv_row_min = -(VP9BORDERINPIXELS - 8 - VP9_INTERP_EXTEND);
- x->mv_row_max = (cm->mb_rows - 1) * 8 + VP9BORDERINPIXELS
- - 8 - VP9_INTERP_EXTEND;
+ x->mv_row_min = -BORDER_MV_PIXELS_B16;
+ x->mv_row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16;
xd->up_available = 0;
xd->plane[0].dst.stride = buf->y_stride;
xd->plane[0].pre[0].stride = buf->y_stride;
xd->plane[1].dst.stride = buf->uv_stride;
- xd->this_mi = &mi_local;
+ xd->mi_8x8[0] = &mi_local;
mi_local.mbmi.sb_type = BLOCK_16X16;
mi_local.mbmi.ref_frame[0] = LAST_FRAME;
mi_local.mbmi.ref_frame[1] = NONE;
@@ -264,12 +270,12 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
int arf_y_in_offset = arf_y_offset;
int gld_y_in_offset = gld_y_offset;
- // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+ // Set up limit values for motion vectors to prevent them extending outside
+ // the UMV borders.
arf_left_mv.as_int = arf_top_mv.as_int;
gld_left_mv.as_int = gld_top_mv.as_int;
- x->mv_col_min = -(VP9BORDERINPIXELS - 8 - VP9_INTERP_EXTEND);
- x->mv_col_max = (cm->mb_cols - 1) * 8 + VP9BORDERINPIXELS
- - 8 - VP9_INTERP_EXTEND;
+ x->mv_col_min = -BORDER_MV_PIXELS_B16;
+ x->mv_col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16;
xd->left_available = 0;
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
@@ -307,6 +313,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
static void separate_arf_mbs(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
int mb_col, mb_row, offset, i;
+ int mi_row, mi_col;
int ncnt[4] = { 0 };
int n_frames = cpi->mbgraph_n_frames;
@@ -343,22 +350,17 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
}
}
- for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
- offset += cm->mb_cols, mb_row++) {
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ // arf_not_zz is indexed by MB, but this loop is indexed by MI to avoid out
+ // of bound access in segmentation_map
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
// If any of the blocks in the sequence failed then the MB
// goes in segment 0
- if (arf_not_zz[offset + mb_col]) {
+ if (arf_not_zz[mi_row/2*cm->mb_cols + mi_col/2]) {
ncnt[0]++;
- cpi->segmentation_map[offset * 4 + 2 * mb_col] = 0;
- cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 0;
- cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 0;
- cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols + 1] = 0;
+ cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0;
} else {
- cpi->segmentation_map[offset * 4 + 2 * mb_col] = 1;
- cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 1;
- cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 1;
- cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols + 1] = 1;
+ cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 1;
ncnt[1]++;
}
}
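The index arithmetic in this hunk is worth spelling out: segmentation_map is stored per 8x8 MI unit while arf_not_zz is kept per 16x16 MB, so each MI coordinate maps to its enclosing MB by halving. A compact restatement of the loop body under that reading:

    /* One MB (16x16) covers a 2x2 block of MIs (8x8), hence the halving. */
    const int mb_index = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1);
    cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] =
        arf_not_zz[mb_index] ? 0 : 1;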
@@ -369,7 +371,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
if (1) {
// Note % of blocks that are marked as static
if (cm->MBs)
- cpi->static_mb_pct = (ncnt[1] * 100) / cm->MBs;
+ cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
// This error case should not be reachable as this function should
// never be called with the common data structure uninitialized.
@@ -406,7 +408,8 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
for (i = 0; i < n_frames; i++) {
MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
vpx_memset(frame_stats->mb_stats, 0,
- cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
+ cm->mb_rows * cm->mb_cols *
+ sizeof(*cpi->mbgraph_stats[i].mb_stats));
}
// do motion search to find contribution of each reference to data
diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c
index 1360088..a52f5b1 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/libvpx/vp9/encoder/vp9_mcomp.c
@@ -59,38 +59,39 @@ int vp9_init_search_range(VP9_COMP *cpi, int size) {
return sr;
}
-int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2],
- int weight) {
- MV v;
- v.row = mv->as_mv.row - ref->as_mv.row;
- v.col = mv->as_mv.col - ref->as_mv.col;
- return ROUND_POWER_OF_TWO((mvjcost[vp9_get_mv_joint(&v)] +
- mvcost[0][v.row] +
- mvcost[1][v.col]) * weight, 7);
+static INLINE int mv_cost(const MV *mv,
+ const int *joint_cost, int *comp_cost[2]) {
+ return joint_cost[vp9_get_mv_joint(mv)] +
+ comp_cost[0][mv->row] + comp_cost[1][mv->col];
}
-static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2],
+int vp9_mv_bit_cost(const MV *mv, const MV *ref,
+ const int *mvjcost, int *mvcost[2], int weight) {
+ const MV diff = { mv->row - ref->row,
+ mv->col - ref->col };
+ return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7);
+}
+
+static int mv_err_cost(const MV *mv, const MV *ref,
+ const int *mvjcost, int *mvcost[2],
int error_per_bit) {
if (mvcost) {
- MV v;
- v.row = mv->as_mv.row - ref->as_mv.row;
- v.col = mv->as_mv.col - ref->as_mv.col;
- return ROUND_POWER_OF_TWO((mvjcost[vp9_get_mv_joint(&v)] +
- mvcost[0][v.row] +
- mvcost[1][v.col]) * error_per_bit, 13);
+ const MV diff = { mv->row - ref->row,
+ mv->col - ref->col };
+ return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) *
+ error_per_bit, 13);
}
return 0;
}
-static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvjsadcost,
- int *mvsadcost[2], int error_per_bit) {
+static int mvsad_err_cost(const MV *mv, const MV *ref,
+ const int *mvjsadcost, int *mvsadcost[2],
+ int error_per_bit) {
if (mvsadcost) {
- MV v;
- v.row = mv->as_mv.row - ref->as_mv.row;
- v.col = mv->as_mv.col - ref->as_mv.col;
- return ROUND_POWER_OF_TWO((mvjsadcost[vp9_get_mv_joint(&v)] +
- mvsadcost[0][v.row] +
- mvsadcost[1][v.col]) * error_per_bit, 8);
+ const MV diff = { mv->row - ref->row,
+ mv->col - ref->col };
+ return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjsadcost, mvsadcost) *
+ error_per_bit, 8);
}
return 0;
}
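The refactor above routes all three cost paths through one primitive: the joint cost of the difference MV plus its per-component costs, followed by a scale-and-round. The equivalent call shape, assuming the cost tables from the surrounding code:

    const MV diff = { mv->row - ref->row, mv->col - ref->col };
    /* weight-scaled bit cost, rounded by 2^7 as in vp9_mv_bit_cost() */
    const int bits =
        ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7);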
@@ -136,66 +137,26 @@ void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) {
}
void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
- int len;
- int search_site_count = 0;
+ int len, ss_count = 1;
- // Generate offsets for 8 search sites per step.
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = 0;
- search_site_count++;
+ x->ss[0].mv.col = x->ss[0].mv.row = 0;
+ x->ss[0].offset = 0;
for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = -len;
- x->ss[search_site_count].offset = -len * stride;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = len;
- x->ss[search_site_count].offset = len * stride;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = -len;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = -len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = len;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = -len;
- x->ss[search_site_count].mv.row = -len;
- x->ss[search_site_count].offset = -len * stride - len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = len;
- x->ss[search_site_count].mv.row = -len;
- x->ss[search_site_count].offset = -len * stride + len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = -len;
- x->ss[search_site_count].mv.row = len;
- x->ss[search_site_count].offset = len * stride - len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = len;
- x->ss[search_site_count].mv.row = len;
- x->ss[search_site_count].offset = len * stride + len;
- search_site_count++;
+ // Generate offsets for 8 search sites per step.
+ const MV ss_mvs[8] = {
+ {-len, 0 }, {len, 0 }, { 0, -len}, {0, len},
+ {-len, -len}, {-len, len}, {len, -len}, {len, len}
+ };
+ int i;
+ for (i = 0; i < 8; ++i) {
+ search_site *const ss = &x->ss[ss_count++];
+ ss->mv = ss_mvs[i];
+ ss->offset = ss->mv.row * stride + ss->mv.col;
+ }
}
- x->ss_count = search_site_count;
+ x->ss_count = ss_count;
x->searches_per_step = 8;
}
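To see what the table-driven rewrite generates, take one ring with len = 2 and stride = 64; offset = row * stride + col reproduces exactly the eight offsets the unrolled version spelled out:

    /*   mv (row, col)   offset = row * 64 + col
     *   (-2,  0)        -128
     *   ( 2,  0)         128
     *   ( 0, -2)          -2
     *   ( 0,  2)           2
     *   (-2, -2)        -130
     *   (-2,  2)        -126
     *   ( 2, -2)         126
     *   ( 2,  2)         130
     */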
@@ -313,7 +274,8 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
}
int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
- int_mv *bestmv, int_mv *ref_mv,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
@@ -333,38 +295,34 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
unsigned int eighthiters = iters_per_step;
int thismse;
- uint8_t *y = xd->plane[0].pre[0].buf +
- (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
- bestmv->as_mv.col;
-
const int y_stride = xd->plane[0].pre[0].stride;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ uint8_t *y = xd->plane[0].pre[0].buf + offset;
- int rr = ref_mv->as_mv.row;
- int rc = ref_mv->as_mv.col;
- int br = bestmv->as_mv.row << 3;
- int bc = bestmv->as_mv.col << 3;
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
int hstep = 4;
- const int minc = MAX(x->mv_col_min << 3, ref_mv->as_mv.col - MV_MAX);
- const int maxc = MIN(x->mv_col_max << 3, ref_mv->as_mv.col + MV_MAX);
- const int minr = MAX(x->mv_row_min << 3, ref_mv->as_mv.row - MV_MAX);
- const int maxr = MIN(x->mv_row_max << 3, ref_mv->as_mv.row + MV_MAX);
+ const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+ const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+ const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+ const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
int tr = br;
int tc = bc;
- const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
-
// central mv
- bestmv->as_mv.row <<= 3;
- bestmv->as_mv.col <<= 3;
+ bestmv->row <<= 3;
+ bestmv->col <<= 3;
// calculate central point error
besterr = vfp->vf(y, y_stride, z, src_stride, sse1);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
- // TODO: Each subsequent iteration checks at least one point in
- // common with the last iteration could be 2 ( if diag selected)
+ // TODO(jbb): Each subsequent iteration checks at least one point in
+ // common with the last iteration could be 2 if diagonal is selected.
while (halfiters--) {
// 1/2 pel
FIRST_LEVEL_CHECKS;
@@ -375,8 +333,8 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
tc = bc;
}
- // TODO: Each subsequent iteration checks at least one point in common with
- // the last iteration could be 2 ( if diag selected) 1/4 pel
+ // TODO(yaowu): Each subsequent iteration checks at least one point in common
+ // with the last iteration could be 2 if diagonal is selected.
// Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
if (forced_stop != 2) {
@@ -391,8 +349,7 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
}
}
- if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
- forced_stop == 0) {
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
hstep >>= 1;
while (eighthiters--) {
FIRST_LEVEL_CHECKS;
@@ -404,18 +361,19 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
}
}
- bestmv->as_mv.row = br;
- bestmv->as_mv.col = bc;
+ bestmv->row = br;
+ bestmv->col = bc;
- if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
- (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
return besterr;
}
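A runnable aside on the unit convention these sub-pixel routines rely on: motion vectors are held in 1/8-pel units, so the old "<< 3" and the new "* 8" are the same full-pel-to-eighth-pel scaling, and hstep = 4 is a half-pel probe distance.

    #include <stdio.h>

    int main(void) {
      const int full_pel_row = 3;
      const int eighth_pel_row = full_pel_row * 8;  /* was: row << 3 */
      const int hstep = 4;                          /* 4/8 pel == 1/2 pel */
      printf("%d full pel = %d eighth pel; first probes at +/-%d (1/2 pel)\n",
             full_pel_row, eighth_pel_row, hstep);
      return 0;
    }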
int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
- int_mv *bestmv, int_mv *ref_mv,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
@@ -424,49 +382,36 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
int *distortion,
unsigned int *sse1) {
uint8_t *z = x->plane[0].src.buf;
- int src_stride = x->plane[0].src.stride;
+ const int src_stride = x->plane[0].src.stride;
MACROBLOCKD *xd = &x->e_mbd;
- int rr, rc, br, bc, hstep;
- int tr, tc;
unsigned int besterr = INT_MAX;
unsigned int sse;
unsigned int whichdir;
int thismse;
- int maxc, minc, maxr, minr;
- int y_stride;
- int offset;
unsigned int halfiters = iters_per_step;
unsigned int quarteriters = iters_per_step;
unsigned int eighthiters = iters_per_step;
- uint8_t *y = xd->plane[0].pre[0].buf +
- (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
- bestmv->as_mv.col;
-
- y_stride = xd->plane[0].pre[0].stride;
-
- rr = ref_mv->as_mv.row;
- rc = ref_mv->as_mv.col;
- br = bestmv->as_mv.row << 3;
- bc = bestmv->as_mv.col << 3;
- hstep = 4;
- minc = MAX(x->mv_col_min << 3,
- (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1));
- maxc = MIN(x->mv_col_max << 3,
- (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1));
- minr = MAX(x->mv_row_min << 3,
- (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1));
- maxr = MIN(x->mv_row_max << 3,
- (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1));
+ const int y_stride = xd->plane[0].pre[0].stride;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ uint8_t *y = xd->plane[0].pre[0].buf + offset;
- tr = br;
- tc = bc;
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+ const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+ const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+ const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
- offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+ int tr = br;
+ int tc = bc;
// central mv
- bestmv->as_mv.row <<= 3;
- bestmv->as_mv.col <<= 3;
+ bestmv->row *= 8;
+ bestmv->col *= 8;
// calculate central point error
besterr = vfp->vf(y, y_stride, z, src_stride, sse1);
@@ -492,8 +437,7 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
tc = bc;
}
- if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
- forced_stop == 0) {
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (eighthiters > 1) {
@@ -503,11 +447,11 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
tc = bc;
}
- bestmv->as_mv.row = br;
- bestmv->as_mv.col = bc;
+ bestmv->row = br;
+ bestmv->col = bc;
- if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
- (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
return besterr;
@@ -520,7 +464,8 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
z, src_stride, &sse, second_pred)
int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
- int_mv *bestmv, int_mv *ref_mv,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
@@ -543,30 +488,26 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
int thismse;
DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
- uint8_t *const y = xd->plane[0].pre[0].buf +
- (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
- bestmv->as_mv.col;
-
const int y_stride = xd->plane[0].pre[0].stride;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ uint8_t *const y = xd->plane[0].pre[0].buf + offset;
- int rr = ref_mv->as_mv.row;
- int rc = ref_mv->as_mv.col;
- int br = bestmv->as_mv.row << 3;
- int bc = bestmv->as_mv.col << 3;
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
int hstep = 4;
- const int minc = MAX(x->mv_col_min << 3, ref_mv->as_mv.col - MV_MAX);
- const int maxc = MIN(x->mv_col_max << 3, ref_mv->as_mv.col + MV_MAX);
- const int minr = MAX(x->mv_row_min << 3, ref_mv->as_mv.row - MV_MAX);
- const int maxr = MIN(x->mv_row_max << 3, ref_mv->as_mv.row + MV_MAX);
+ const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+ const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+ const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+ const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
int tr = br;
int tc = bc;
- const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
-
// central mv
- bestmv->as_mv.row <<= 3;
- bestmv->as_mv.col <<= 3;
+ bestmv->row *= 8;
+ bestmv->col *= 8;
// calculate central point error
// TODO(yunqingwang): central pointer error was already calculated in full-
@@ -604,8 +545,7 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
}
}
- if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
- forced_stop == 0) {
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
hstep >>= 1;
while (eighthiters--) {
FIRST_LEVEL_CHECKS;
@@ -616,18 +556,19 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
tc = bc;
}
}
- bestmv->as_mv.row = br;
- bestmv->as_mv.col = bc;
+ bestmv->row = br;
+ bestmv->col = bc;
- if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
- (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
return besterr;
}
int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
- int_mv *bestmv, int_mv *ref_mv,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
@@ -638,51 +579,37 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
const uint8_t *second_pred,
int w, int h) {
uint8_t *z = x->plane[0].src.buf;
- int src_stride = x->plane[0].src.stride;
+ const int src_stride = x->plane[0].src.stride;
MACROBLOCKD *xd = &x->e_mbd;
- int rr, rc, br, bc, hstep;
- int tr, tc;
unsigned int besterr = INT_MAX;
unsigned int sse;
unsigned int whichdir;
int thismse;
- int maxc, minc, maxr, minr;
- int y_stride;
- int offset;
unsigned int halfiters = iters_per_step;
unsigned int quarteriters = iters_per_step;
unsigned int eighthiters = iters_per_step;
DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
- uint8_t *y = xd->plane[0].pre[0].buf +
- (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
- bestmv->as_mv.col;
-
- y_stride = xd->plane[0].pre[0].stride;
-
- rr = ref_mv->as_mv.row;
- rc = ref_mv->as_mv.col;
- br = bestmv->as_mv.row << 3;
- bc = bestmv->as_mv.col << 3;
- hstep = 4;
- minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
- ((1 << MV_MAX_BITS) - 1));
- maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
- ((1 << MV_MAX_BITS) - 1));
- minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
- ((1 << MV_MAX_BITS) - 1));
- maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
- ((1 << MV_MAX_BITS) - 1));
-
- tr = br;
- tc = bc;
+ const int y_stride = xd->plane[0].pre[0].stride;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ uint8_t *y = xd->plane[0].pre[0].buf + offset;
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+ const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+ const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+ const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
- offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+ int tr = br;
+ int tc = bc;
// central mv
- bestmv->as_mv.row <<= 3;
- bestmv->as_mv.col <<= 3;
+ bestmv->row *= 8;
+ bestmv->col *= 8;
// calculate central point error
// TODO(yunqingwang): central pointer error was already calculated in full-
@@ -716,8 +643,7 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
tc = bc;
}
- if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
- forced_stop == 0) {
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (eighthiters > 1) {
@@ -726,11 +652,11 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
tr = br;
tc = bc;
}
- bestmv->as_mv.row = br;
- bestmv->as_mv.col = bc;
+ bestmv->row = br;
+ bestmv->col = bc;
- if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
- (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
return besterr;
@@ -754,10 +680,10 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
#define CHECK_POINT \
{\
- if (this_mv.as_mv.col < x->mv_col_min) continue;\
- if (this_mv.as_mv.col > x->mv_col_max) continue;\
- if (this_mv.as_mv.row < x->mv_row_min) continue;\
- if (this_mv.as_mv.row > x->mv_row_max) continue;\
+ if (this_mv.col < x->mv_col_min) continue;\
+ if (this_mv.col > x->mv_col_max) continue;\
+ if (this_mv.row < x->mv_row_min) continue;\
+ if (this_mv.row > x->mv_row_max) continue;\
}
#define CHECK_BETTER \
@@ -765,7 +691,7 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
if (thissad < bestsad)\
{\
if (use_mvcost) \
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv, \
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv.as_mv, \
mvjsadcost, mvsadcost, \
sad_per_bit);\
if (thissad < bestsad)\
@@ -790,14 +716,14 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
// candidates as indicated in the num_candidates and candidates arrays
// passed into this function
static int vp9_pattern_search(MACROBLOCK *x,
- int_mv *ref_mv,
+ MV *ref_mv,
int search_param,
int sad_per_bit,
int do_init_search,
int do_refine,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
- int_mv *center_mv, int_mv *best_mv,
+ const MV *center_mv, MV *best_mv,
const int num_candidates[MAX_PATTERN_SCALES],
const MV candidates[MAX_PATTERN_SCALES]
[MAX_PATTERN_CANDIDATES]) {
@@ -810,7 +736,7 @@ static int vp9_pattern_search(MACROBLOCK *x,
int what_stride = x->plane[0].src.stride;
int in_what_stride = xd->plane[0].pre[0].stride;
int br, bc;
- int_mv this_mv;
+ MV this_mv;
int bestsad = INT_MAX;
int thissad;
uint8_t *base_offset;
@@ -823,24 +749,22 @@ static int vp9_pattern_search(MACROBLOCK *x,
int *mvjsadcost = x->nmvjointsadcost;
int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+ fcenter_mv.as_mv.row = center_mv->row >> 3;
+ fcenter_mv.as_mv.col = center_mv->col >> 3;
// adjust ref_mv to make sure it is within MV range
- clamp_mv(&ref_mv->as_mv,
- x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
- br = ref_mv->as_mv.row;
- bc = ref_mv->as_mv.col;
+ clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ br = ref_mv->row;
+ bc = ref_mv->col;
// Work out the start point for the search
base_offset = (uint8_t *)(xd->plane[0].pre[0].buf);
this_offset = base_offset + (br * in_what_stride) + bc;
- this_mv.as_mv.row = br;
- this_mv.as_mv.col = bc;
- bestsad = vfp->sdf(what, what_stride, this_offset,
- in_what_stride, 0x7fffffff)
- + mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost,
- sad_per_bit);
+ this_mv.row = br;
+ this_mv.col = bc;
+ bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 0x7fffffff)
+ + mvsad_err_cost(&this_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
  // Search all possible scales up to the search param around the center point
  // and pick the scale of the point that is best as the starting scale of
@@ -853,21 +777,21 @@ static int vp9_pattern_search(MACROBLOCK *x,
CHECK_BOUNDS((1 << t))
if (all_in) {
for (i = 0; i < num_candidates[t]; i++) {
- this_mv.as_mv.row = br + candidates[t][i].row;
- this_mv.as_mv.col = bc + candidates[t][i].col;
- this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
- this_mv.as_mv.col;
+ this_mv.row = br + candidates[t][i].row;
+ this_mv.col = bc + candidates[t][i].col;
+ this_offset = base_offset + (this_mv.row * in_what_stride) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
}
} else {
for (i = 0; i < num_candidates[t]; i++) {
- this_mv.as_mv.row = br + candidates[t][i].row;
- this_mv.as_mv.col = bc + candidates[t][i].col;
+ this_mv.row = br + candidates[t][i].row;
+ this_mv.col = bc + candidates[t][i].col;
CHECK_POINT
- this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
- this_mv.as_mv.col;
+ this_offset = base_offset + (this_mv.row * in_what_stride) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
@@ -897,21 +821,21 @@ static int vp9_pattern_search(MACROBLOCK *x,
CHECK_BOUNDS((1 << s))
if (all_in) {
for (i = 0; i < num_candidates[s]; i++) {
- this_mv.as_mv.row = br + candidates[s][i].row;
- this_mv.as_mv.col = bc + candidates[s][i].col;
- this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
- this_mv.as_mv.col;
+ this_mv.row = br + candidates[s][i].row;
+ this_mv.col = bc + candidates[s][i].col;
+ this_offset = base_offset + (this_mv.row * in_what_stride) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
}
} else {
for (i = 0; i < num_candidates[s]; i++) {
- this_mv.as_mv.row = br + candidates[s][i].row;
- this_mv.as_mv.col = bc + candidates[s][i].col;
+ this_mv.row = br + candidates[s][i].row;
+ this_mv.col = bc + candidates[s][i].col;
CHECK_POINT
- this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
- this_mv.as_mv.col;
+ this_offset = base_offset + (this_mv.row * in_what_stride) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
@@ -935,25 +859,21 @@ static int vp9_pattern_search(MACROBLOCK *x,
get_next_chkpts(next_chkpts_indices, k, num_candidates[s]);
if (all_in) {
for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
- this_mv.as_mv.row = br +
- candidates[s][next_chkpts_indices[i]].row;
- this_mv.as_mv.col = bc +
- candidates[s][next_chkpts_indices[i]].col;
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
- this_mv.as_mv.col;
+ this_mv.row = br + candidates[s][next_chkpts_indices[i]].row;
+ this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col;
+ this_offset = base_offset + (this_mv.row * (in_what_stride)) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
}
} else {
for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
- this_mv.as_mv.row = br +
- candidates[s][next_chkpts_indices[i]].row;
- this_mv.as_mv.col = bc +
- candidates[s][next_chkpts_indices[i]].col;
+ this_mv.row = br + candidates[s][next_chkpts_indices[i]].row;
+ this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col;
CHECK_POINT
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
- this_mv.as_mv.col;
+ this_offset = base_offset + (this_mv.row * (in_what_stride)) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
@@ -980,21 +900,21 @@ static int vp9_pattern_search(MACROBLOCK *x,
CHECK_BOUNDS(1)
if (all_in) {
for (i = 0; i < 4; i++) {
- this_mv.as_mv.row = br + neighbors[i].row;
- this_mv.as_mv.col = bc + neighbors[i].col;
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
- this_mv.as_mv.col;
+ this_mv.row = br + neighbors[i].row;
+ this_mv.col = bc + neighbors[i].col;
+ this_offset = base_offset + (this_mv.row * (in_what_stride)) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
}
} else {
for (i = 0; i < 4; i++) {
- this_mv.as_mv.row = br + neighbors[i].row;
- this_mv.as_mv.col = bc + neighbors[i].col;
+ this_mv.row = br + neighbors[i].row;
+ this_mv.col = bc + neighbors[i].col;
CHECK_POINT
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
- this_mv.as_mv.col;
+ this_offset = base_offset + (this_mv.row * (in_what_stride)) +
+ this_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
bestsad);
CHECK_BETTER
@@ -1010,31 +930,32 @@ static int vp9_pattern_search(MACROBLOCK *x,
}
}
- best_mv->as_mv.row = br;
- best_mv->as_mv.col = bc;
+ best_mv->row = br;
+ best_mv->col = bc;
- this_offset = base_offset + (best_mv->as_mv.row * (in_what_stride)) +
- best_mv->as_mv.col;
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
+ this_offset = base_offset + (best_mv->row * in_what_stride) +
+ best_mv->col;
+ this_mv.row = best_mv->row * 8;
+ this_mv.col = best_mv->col * 8;
if (bestsad == INT_MAX)
return INT_MAX;
- return
- vfp->vf(what, what_stride, this_offset, in_what_stride,
- (unsigned int *)(&bestsad)) +
- use_mvcost ? mv_err_cost(&this_mv, center_mv, x->nmvjointcost, x->mvcost,
- x->errorperbit) : 0;
+
+ return vfp->vf(what, what_stride, this_offset, in_what_stride,
+ (unsigned int *)&bestsad) +
+ use_mvcost ? mv_err_cost(&this_mv, center_mv,
+ x->nmvjointcost, x->mvcost, x->errorperbit)
+ : 0;
}
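One caution about the return expression just above: in C, '+' binds tighter than '?:', so it evaluates as (vf(...) + use_mvcost) ? mv_err_cost(...) : 0, making the variance the condition rather than an addend. If the intent is to add the MV cost only when use_mvcost is set, the conditional needs parentheses; a corrected sketch using the same names:

    return vfp->vf(what, what_stride, this_offset, in_what_stride,
                   (unsigned int *)&bestsad) +
           (use_mvcost ? mv_err_cost(&this_mv, center_mv, x->nmvjointcost,
                                     x->mvcost, x->errorperbit)
                       : 0);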
int vp9_hex_search(MACROBLOCK *x,
- int_mv *ref_mv,
+ MV *ref_mv,
int search_param,
int sad_per_bit,
int do_init_search,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
- int_mv *center_mv, int_mv *best_mv) {
+ const MV *center_mv, MV *best_mv) {
// First scale has 8-closest points, the rest have 6 points in hex shape
// at increasing scales
static const int hex_num_candidates[MAX_PATTERN_SCALES] = {
@@ -1063,14 +984,14 @@ int vp9_hex_search(MACROBLOCK *x,
}
int vp9_bigdia_search(MACROBLOCK *x,
- int_mv *ref_mv,
+ MV *ref_mv,
int search_param,
int sad_per_bit,
int do_init_search,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
- int_mv *center_mv,
- int_mv *best_mv) {
+ const MV *center_mv,
+ MV *best_mv) {
// First scale has 4-closest points, the rest have 8 points in diamond
// shape at increasing scales
static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
@@ -1097,22 +1018,21 @@ int vp9_bigdia_search(MACROBLOCK *x,
{{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024},
{-512, 512}, {-1024, 0}},
};
- return
- vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
- do_init_search, 0, vfp, use_mvcost,
- center_mv, best_mv,
- bigdia_num_candidates, bigdia_candidates);
+ return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
+ do_init_search, 0, vfp, use_mvcost,
+ center_mv, best_mv,
+ bigdia_num_candidates, bigdia_candidates);
}
int vp9_square_search(MACROBLOCK *x,
- int_mv *ref_mv,
+ MV *ref_mv,
int search_param,
int sad_per_bit,
int do_init_search,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
- int_mv *center_mv,
- int_mv *best_mv) {
+ const MV *center_mv,
+ MV *best_mv) {
// All scales have 8 closest points in square shape
static const int square_num_candidates[MAX_PATTERN_SCALES] = {
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
@@ -1139,11 +1059,10 @@ int vp9_square_search(MACROBLOCK *x,
{{-1024, -1024}, {0, -1024}, {1024, -1024}, {1024, 0}, {1024, 1024},
{0, 1024}, {-1024, 1024}, {-1024, 0}},
};
- return
- vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
- do_init_search, 0, vfp, use_mvcost,
- center_mv, best_mv,
- square_num_candidates, square_candidates);
+ return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
+ do_init_search, 0, vfp, use_mvcost,
+ center_mv, best_mv,
+ square_num_candidates, square_candidates);
};
#undef CHECK_BOUNDS
@@ -1199,13 +1118,14 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
best_address = in_what;
// Check the starting position
- bestsad = fn_ptr->sdf(what, what_stride, in_what,
- in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost,
- sad_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
+ + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
- // search_param determines the length of the initial step and hence the number of iterations
- // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+ // search_param determines the length of the initial step and hence the number
+  // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
+ // (MAX_FIRST_STEP/4) pel... etc.
ss = &x->ss[search_param * x->searches_per_step];
tot_steps = (x->ss_count / x->searches_per_step) - search_param;
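A small runnable illustration of how search_param shrinks the initial diamond step (the MAX_MVSEARCH_STEPS value is assumed; the macro shapes match the vp9_mcomp.h hunk at the end of this diff):

    #include <stdio.h>

    #define MAX_MVSEARCH_STEPS 11                       /* assumed value */
    #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))

    int main(void) {
      int search_param;
      for (search_param = 0; search_param <= 3; ++search_param)
        printf("search_param %d -> first step %d pel\n",
               search_param, MAX_FIRST_STEP >> search_param);
      return 0;
    }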
@@ -1228,7 +1148,7 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1260,7 +1180,7 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -1274,19 +1194,21 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
break;
};
#endif
- } else if (best_address == in_what)
+ } else if (best_address == in_what) {
(*num00)++;
+ }
}
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
+ this_mv.as_mv.row = best_mv->as_mv.row * 8;
+ this_mv.as_mv.col = best_mv->as_mv.col * 8;
if (bestsad == INT_MAX)
return INT_MAX;
return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
- (unsigned int *)(&thissad)) + mv_err_cost(&this_mv, center_mv, mvjcost,
- mvcost, x->errorperbit);
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
}
int vp9_diamond_search_sadx4(MACROBLOCK *x,
@@ -1340,13 +1262,15 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
best_address = in_what;
// Check the starting position
- bestsad = fn_ptr->sdf(what, what_stride,
- in_what, in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost,
- sad_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
+ + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
- // search_param determines the length of the initial step and hence the number of iterations
- // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+ // search_param determines the length of the initial step and hence the number
+ // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel
+ // 1 = (MAX_FIRST_STEP/2) pel,
+ // 2 = (MAX_FIRST_STEP/4) pel...
ss = &x->ss[search_param * x->searches_per_step];
tot_steps = (x->ss_count / x->searches_per_step) - search_param;
@@ -1355,13 +1279,16 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
for (step = 0; step < tot_steps; step++) {
int all_in = 1, t;
- // To know if all neighbor points are within the bounds, 4 bounds checking are enough instead of
- // checking 4 bounds for each points.
+    // All_in is true if every one of the points we are checking is within
+ // the bounds of the image.
all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min);
all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max);
all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min);
all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max);
+ // If all the pixels are within the bounds we don't check whether the
+ // search point is valid in this loop, otherwise we check each point
+    // for validity.
if (all_in) {
unsigned int sad_array[4];
@@ -1378,7 +1305,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
if (sad_array[t] < bestsad) {
this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row;
this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col;
- sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv,
+ sad_array[t] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (sad_array[t] < bestsad) {
@@ -1394,15 +1321,18 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
+ if ((this_col_offset > x->mv_col_min) &&
+ (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) &&
+ (this_row_offset < x->mv_row_max)) {
check_here = ss[i].offset + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1433,7 +1363,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -1447,19 +1377,21 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
break;
};
#endif
- } else if (best_address == in_what)
+ } else if (best_address == in_what) {
(*num00)++;
+ }
}
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
+ this_mv.as_mv.row = best_mv->as_mv.row * 8;
+ this_mv.as_mv.col = best_mv->as_mv.col * 8;
if (bestsad == INT_MAX)
return INT_MAX;
return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
- (unsigned int *)(&thissad)) + mv_err_cost(&this_mv,
- center_mv, mvjcost, mvcost, x->errorperbit);
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
}
/* do_refine: If last step (1-away) of n-step search doesn't pick the center
@@ -1482,16 +1414,17 @@ int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
n = num00;
num00 = 0;
- /* If there won't be more n-step search, check to see if refining search is needed. */
+ /* If there won't be more n-step search, check to see if refining search is
+ * needed. */
if (n > further_steps)
do_refine = 0;
while (n < further_steps) {
n++;
- if (num00)
+ if (num00) {
num00--;
- else {
+ } else {
thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
step_param + n, sadpb, &num00,
fn_ptr, x->nmvjointcost, x->mvcost,
@@ -1570,8 +1503,8 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
// Baseline value at the centre
bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost,
- sad_per_bit);
+ + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
  // Apply further limits to prevent us from searching using vectors that
  // stretch beyond the UMV border.
@@ -1585,11 +1518,12 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
check_here = r * mv_stride + in_what + col_min;
for (c = col_min; c < col_max; c++) {
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- mvjsadcost, mvsadcost, sad_per_bit);
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -1602,14 +1536,14 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
}
}
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
+ this_mv.as_mv.row = best_mv->as_mv.row * 8;
+ this_mv.as_mv.col = best_mv->as_mv.col * 8;
if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
}
@@ -1660,8 +1594,8 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
// Baseline value at the centre
bestsad = fn_ptr->sdf(what, what_stride,
bestaddress, in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost,
- sad_per_bit);
+ + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
  // Apply further limits to prevent us from searching using vectors that
  // stretch beyond the UMV border.
@@ -1685,8 +1619,8 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
if (thissad < bestsad) {
this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- mvjsadcost, mvsadcost, sad_per_bit);
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -1702,11 +1636,12 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
}
while (c < col_max) {
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1720,17 +1655,16 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
check_here++;
c++;
}
-
}
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
+ this_mv.as_mv.row = best_mv->as_mv.row * 8;
+ this_mv.as_mv.col = best_mv->as_mv.col * 8;
if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
}
@@ -1783,8 +1717,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
// Baseline value at the centre
bestsad = fn_ptr->sdf(what, what_stride,
bestaddress, in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost,
- sad_per_bit);
+ + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
  // Apply further limits to prevent us from searching using vectors that
  // stretch beyond the UMV border.
@@ -1808,8 +1742,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
if (thissad < bestsad) {
this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- mvjsadcost, mvsadcost, sad_per_bit);
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -1834,7 +1768,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
if (thissad < bestsad) {
this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1851,12 +1785,13 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
}
while (c < col_max) {
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- mvjsadcost, mvsadcost, sad_per_bit);
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -1871,14 +1806,14 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
}
}
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
+ this_mv.as_mv.row = best_mv->as_mv.row * 8;
+ this_mv.as_mv.col = best_mv->as_mv.col * 8;
if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
}
@@ -1909,8 +1844,10 @@ int vp9_refining_search_sad_c(MACROBLOCK *x,
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
- bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
- mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, best_address,
+ in_what_stride, 0x7fffffff) +
+ mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, error_per_bit);
for (i = 0; i < search_range; i++) {
int best_site = -1;
@@ -1919,16 +1856,20 @@ int vp9_refining_search_sad_c(MACROBLOCK *x,
this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
- check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ if ((this_col_offset > x->mv_col_min) &&
+ (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) &&
+ (this_row_offset < x->mv_row_max)) {
+ check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+ best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
- mvsadcost, error_per_bit);
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, error_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -1938,23 +1879,24 @@ int vp9_refining_search_sad_c(MACROBLOCK *x,
}
}
- if (best_site == -1)
+ if (best_site == -1) {
break;
- else {
+ } else {
ref_mv->as_mv.row += neighbors[best_site].row;
ref_mv->as_mv.col += neighbors[best_site].col;
- best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
+ best_address += (neighbors[best_site].row) * in_what_stride +
+ neighbors[best_site].col;
}
}
- this_mv.as_mv.row = ref_mv->as_mv.row << 3;
- this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+ this_mv.as_mv.row = ref_mv->as_mv.row * 8;
+ this_mv.as_mv.col = ref_mv->as_mv.col * 8;
if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, best_address, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
}
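The structure of the refining search, a greedy 4-neighbor descent that recenters until no neighbor improves, can be exercised standalone on a toy cost surface (the real code scores candidates with fn_ptr->sdf plus the MV SAD cost):

    #include <stdio.h>

    typedef struct { int row, col; } MV;

    /* Toy convex cost standing in for SAD; minimum is at (2, -1). */
    static int cost(MV m) {
      return (m.row - 2) * (m.row - 2) + (m.col + 1) * (m.col + 1);
    }

    int main(void) {
      static const MV nb[4] = { {-1, 0}, {0, -1}, {0, 1}, {1, 0} };
      MV best = { 0, 0 };
      int i, search_range = 8;
      for (i = 0; i < search_range; i++) {
        int j, best_site = -1, bestc = cost(best);
        for (j = 0; j < 4; j++) {
          MV t = { best.row + nb[j].row, best.col + nb[j].col };
          if (cost(t) < bestc) { bestc = cost(t); best_site = j; }
        }
        if (best_site == -1) break;       /* no neighbor improves: done */
        best.row += nb[best_site].row;    /* recenter on the best site */
        best.col += nb[best_site].col;
      }
      printf("converged at (%d, %d)\n", best.row, best.col);  /* (2, -1) */
      return 0;
    }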
@@ -1986,8 +1928,10 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
- bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
- mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, best_address,
+ in_what_stride, 0x7fffffff) +
+ mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, error_per_bit);
for (i = 0; i < search_range; i++) {
int best_site = -1;
@@ -2004,14 +1948,15 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
block_offset[2] = best_address + 1;
block_offset[3] = best_address + in_what_stride;
- fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+ sad_array);
for (j = 0; j < 4; j++) {
if (sad_array[j] < bestsad) {
this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row;
this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col;
- sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
- mvsadcost, error_per_bit);
+ sad_array[j] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, error_per_bit);
if (sad_array[j] < bestsad) {
bestsad = sad_array[j];
@@ -2024,16 +1969,20 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
- check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ if ((this_col_offset > x->mv_col_min) &&
+ (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) &&
+ (this_row_offset < x->mv_row_max)) {
+ check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+ best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
- mvsadcost, error_per_bit);
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, error_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
@@ -2044,23 +1993,24 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
}
}
- if (best_site == -1)
+ if (best_site == -1) {
break;
- else {
+ } else {
ref_mv->as_mv.row += neighbors[best_site].row;
ref_mv->as_mv.col += neighbors[best_site].col;
- best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
+ best_address += (neighbors[best_site].row) * in_what_stride +
+ neighbors[best_site].col;
}
}
- this_mv.as_mv.row = ref_mv->as_mv.row << 3;
- this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+ this_mv.as_mv.row = ref_mv->as_mv.row * 8;
+ this_mv.as_mv.col = ref_mv->as_mv.col * 8;
if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, best_address, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
}
@@ -2100,7 +2050,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
/* Get compound pred by averaging two pred blocks. */
bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride,
second_pred, 0x7fffffff) +
- mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+ mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, error_per_bit);
for (i = 0; i < search_range; i++) {
int best_site = -1;
@@ -2123,9 +2074,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
- mvsadcost, error_per_bit);
-
+ thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ mvjsadcost, mvsadcost, error_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
best_site = j;
@@ -2144,16 +2094,16 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
}
}
- this_mv.as_mv.row = ref_mv->as_mv.row << 3;
- this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+ this_mv.as_mv.row = ref_mv->as_mv.row * 8;
+ this_mv.as_mv.col = ref_mv->as_mv.col * 8;
if (bestsad < INT_MAX) {
// FIXME(rbultje, yunqing): add full-pixel averaging variance functions
// so we don't have to use the subpixel with xoff=0,yoff=0 here.
- return fn_ptr->svaf(best_address, in_what_stride, 0, 0,
- what, what_stride, (unsigned int *)(&thissad),
- second_pred) +
- mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
+ return fn_ptr->svaf(best_address, in_what_stride, 0, 0, what, what_stride,
+ (unsigned int *)(&thissad), second_pred) +
+ mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mvjcost, mvcost, x->errorperbit);
} else {
return INT_MAX;
}
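
Two changes repeat throughout the vp9_mcomp.c hunks above: the cost helpers now take plain MV pointers instead of the int_mv wrapper union, and the full-pel to 1/8-pel conversion is written as a multiply rather than a left shift (a signed left shift of a negative value is undefined in C, while * 8 is not). The sketch below shows the rough shape of the two types and the unit conversion; it is illustrative only, and the real definitions live in vp9/common/vp9_mv.h.

#include <stdint.h>
#include <stdio.h>

/* Rough shapes of the two motion-vector types involved; the real
 * definitions live in vp9/common/vp9_mv.h. */
typedef struct mv {
  int16_t row;  /* stored in 1/8-pel units at the codec interface */
  int16_t col;
} MV;

typedef union int_mv {
  uint32_t as_int;  /* whole-vector compare/assign in one operation */
  MV as_mv;         /* per-component access */
} int_mv;

int main(void) {
  int_mv full_pel;
  MV subpel;
  full_pel.as_mv.row = 3;   /* full-pel refining-search result */
  full_pel.as_mv.col = -2;
  /* The search runs in full-pel units; converting back to 1/8-pel is a
   * multiply by 8.  The patch swaps "<< 3" for "* 8", which is
   * well-defined for negative values where a signed left shift is not. */
  subpel.row = full_pel.as_mv.row * 8;
  subpel.col = full_pel.as_mv.col * 8;
  printf("(%d, %d) full-pel -> (%d, %d) 1/8-pel\n",
         full_pel.as_mv.row, full_pel.as_mv.col, subpel.row, subpel.col);
  return 0;
}
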
diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h
index 3598fa0..bcab679 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/libvpx/vp9/encoder/vp9_mcomp.h
@@ -22,10 +22,14 @@
#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)
// Maximum size of the first step in full pel units
#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
+// Allowed motion vector pixel distance outside image border
+// for Block_16x16
+#define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND)
+
void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv);
-int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost,
- int *mvcost[2], int weight);
+int vp9_mv_bit_cost(const MV *mv, const MV *ref,
+ const int *mvjcost, int *mvcost[2], int weight);
void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
void vp9_init3smotion_compensation(MACROBLOCK *x, int stride);
@@ -40,37 +44,37 @@ int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x,
int_mv *ref_mv, int_mv *dst_mv);
int vp9_hex_search(MACROBLOCK *x,
- int_mv *ref_mv,
+ MV *ref_mv,
int search_param,
int error_per_bit,
int do_init_search,
const vp9_variance_fn_ptr_t *vf,
int use_mvcost,
- int_mv *center_mv,
- int_mv *best_mv);
+ const MV *center_mv,
+ MV *best_mv);
int vp9_bigdia_search(MACROBLOCK *x,
- int_mv *ref_mv,
+ MV *ref_mv,
int search_param,
int error_per_bit,
int do_init_search,
const vp9_variance_fn_ptr_t *vf,
int use_mvcost,
- int_mv *center_mv,
- int_mv *best_mv);
+ const MV *center_mv,
+ MV *best_mv);
int vp9_square_search(MACROBLOCK *x,
- int_mv *ref_mv,
+ MV *ref_mv,
int search_param,
int error_per_bit,
int do_init_search,
const vp9_variance_fn_ptr_t *vf,
int use_mvcost,
- int_mv *center_mv,
- int_mv *best_mv);
+ const MV *center_mv,
+ MV *best_mv);
typedef int (fractional_mv_step_fp) (
MACROBLOCK *x,
- int_mv *bestmv,
- int_mv *ref_mv,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
@@ -84,7 +88,8 @@ extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
typedef int (fractional_mv_step_comp_fp) (
MACROBLOCK *x,
- int_mv *bestmv, int_mv *ref_mv,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
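
The prototypes above also grow an explicit allow_hp parameter, so the sub-pel step functions no longer need to read the high-precision-MV setting out of macroblockd state. A hypothetical sketch (helper name and logic invented here, not taken from vp9_mcomp.c) of the kind of decision that flag gates, using the header's own forced_stop encoding:

#include <stdio.h>

/* Hypothetical: how allow_hp could bound sub-pel refinement depth.
 * forced_stop uses the header's encoding: 0 - full, 1 - qtr only,
 * 2 - half only. */
static int subpel_levels(int allow_hp, int forced_stop) {
  int levels = 3 - forced_stop;  /* half, quarter, eighth */
  if (!allow_hp && levels == 3)
    levels = 2;                  /* high precision off: stop at 1/4-pel */
  return levels;
}

int main(void) {
  printf("hp on,  full search: %d levels\n", subpel_levels(1, 0));  /* 3 */
  printf("hp off, full search: %d levels\n", subpel_levels(0, 0));  /* 2 */
  return 0;
}
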
diff --git a/libvpx/vp9/encoder/vp9_modecosts.c b/libvpx/vp9/encoder/vp9_modecosts.c
index a5dfaed..7eb6592 100644
--- a/libvpx/vp9/encoder/vp9_modecosts.c
+++ b/libvpx/vp9/encoder/vp9_modecosts.c
@@ -17,7 +17,7 @@
void vp9_init_mode_costs(VP9_COMP *c) {
VP9_COMMON *const cm = &c->common;
- const vp9_tree_p KT = vp9_intra_mode_tree;
+ const vp9_tree_index *KT = vp9_intra_mode_tree;
int i, j;
for (i = 0; i < INTRA_MODES; i++) {
@@ -36,7 +36,7 @@ void vp9_init_mode_costs(VP9_COMP *c) {
vp9_kf_uv_mode_prob[INTRA_MODES - 1],
vp9_intra_mode_tree);
- for (i = 0; i <= SWITCHABLE_FILTERS; ++i)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
cm->fc.switchable_interp_prob[i],
vp9_switchable_interp_tree);
diff --git a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_onyx_if.c
index 883b31e..f922f90 100644
--- a/libvpx/vp9/encoder/vp9_onyx_if.c
+++ b/libvpx/vp9/encoder/vp9_onyx_if.c
@@ -8,44 +8,35 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <math.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
-#include "vpx_config.h"
+#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_filter.h"
-#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_idct.h"
+#if CONFIG_VP9_POSTPROC
+#include "vp9/common/vp9_postproc.h"
+#endif
#include "vp9/common/vp9_reconinter.h"
-#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_alloccommon.h"
-#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/common/vp9_tile_common.h"
#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mbgraph.h"
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_picklpf.h"
#include "vp9/encoder/vp9_psnr.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_extend.h"
#include "vp9/encoder/vp9_ratectrl.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vp9/common/vp9_tile_common.h"
-#include "vp9/encoder/vp9_segmentation.h"
-#include "./vp9_rtcd.h"
-#include "./vpx_scale_rtcd.h"
-#if CONFIG_VP9_POSTPROC
-#include "vp9/common/vp9_postproc.h"
-#endif
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/vpx_timer.h"
-
-#include "vp9/common/vp9_seg_common.h"
-#include "vp9/encoder/vp9_mbgraph.h"
-#include "vp9/common/vp9_pred_common.h"
#include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_bitstream.h"
-#include "vp9/encoder/vp9_picklpf.h"
-#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/encoder/vp9_segmentation.h"
#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_vaq.h"
+
+#include "vpx_ports/vpx_timer.h"
-#include <math.h>
-#include <stdio.h>
-#include <limits.h>
extern void print_tree_update_probs();
@@ -55,16 +46,20 @@ static void set_default_lf_deltas(struct loopfilter *lf);
#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */
-#define ALTREF_HIGH_PRECISION_MV 1 /* whether to use high precision mv
- for altref computation */
-#define HIGH_PRECISION_MV_QTHRESH 200 /* Q threshold for use of high precision
- mv. Choose a very high value for
- now so that HIGH_PRECISION is always
- chosen */
+#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv
+ // for altref computation.
+#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision
+ // mv. Choose a very high value for
+ // now so that HIGH_PRECISION is always
+ // chosen.
-#if CONFIG_INTERNAL_STATS
-#include "math.h"
+// Masks for partially or completely disabling split mode
+#define DISABLE_ALL_SPLIT 0x3F
+#define DISABLE_ALL_INTER_SPLIT 0x1F
+#define DISABLE_COMPOUND_SPLIT 0x18
+#define LAST_AND_INTRA_SPLIT_ONLY 0x1E
+#if CONFIG_INTERNAL_STATS
extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *dest, int lumamask,
double *weight);
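
The DISABLE_*_SPLIT values above are bitmasks over the sub-8x8 reference modes, one bit per entry of the MAX_REFS list introduced later in this patch. The ordering assumed below (THR_LAST, THR_GOLD, THR_ALTR, THR_COMP_LA, THR_COMP_GA, THR_INTRA) is an inference from the mask names, not quoted from the headers. A standalone check of that reading:

#include <stdio.h>

/* Assumed bit order for disable_split_mask; written out only to
 * illustrate the masks.  The real enum lives in the encoder headers. */
enum { THR_LAST, THR_GOLD, THR_ALTR, THR_COMP_LA, THR_COMP_GA, THR_INTRA,
       MAX_REFS };

#define LAST_AND_INTRA_SPLIT_ONLY 0x1E  /* bits 1-4 set: keep LAST and intra */

int main(void) {
  const char *name[MAX_REFS] =
      { "LAST", "GOLD", "ALTR", "COMP_LA", "COMP_GA", "INTRA" };
  int i;
  for (i = 0; i < MAX_REFS; i++)  /* same per-bit test the patch applies */
    printf("%-8s split %s\n", name[i],
           (LAST_AND_INTRA_SPLIT_ONLY & (1 << i)) ? "disabled" : "kept");
  return 0;
}
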
@@ -107,7 +102,8 @@ extern void write_switchable_interp_stats();
#endif
#ifdef SPEEDSTATS
-unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0};
#endif
#if defined(SECTIONBITS_OUTPUT)
@@ -122,6 +118,8 @@ static int kf_high_motion_minq[QINDEX_RANGE];
static int gf_low_motion_minq[QINDEX_RANGE];
static int gf_high_motion_minq[QINDEX_RANGE];
static int inter_minq[QINDEX_RANGE];
+static int afq_low_motion_minq[QINDEX_RANGE];
+static int afq_high_motion_minq[QINDEX_RANGE];
static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
switch (mode) {
@@ -193,24 +191,55 @@ static void init_minq_luts(void) {
gf_low_motion_minq[i] = calculate_minq_index(maxq,
0.0000015,
-0.0009,
- 0.33,
+ 0.32,
0.0);
gf_high_motion_minq[i] = calculate_minq_index(maxq,
0.0000021,
-0.00125,
- 0.45,
+ 0.50,
0.0);
inter_minq[i] = calculate_minq_index(maxq,
0.00000271,
-0.00113,
- 0.697,
+ 0.75,
0.0);
+ afq_low_motion_minq[i] = calculate_minq_index(maxq,
+ 0.0000015,
+ -0.0009,
+ 0.33,
+ 0.0);
+ afq_high_motion_minq[i] = calculate_minq_index(maxq,
+ 0.0000021,
+ -0.00125,
+ 0.55,
+ 0.0);
+ }
+}
+static int get_active_quality(int q,
+ int gfu_boost,
+ int low,
+ int high,
+ int *low_motion_minq,
+ int *high_motion_minq) {
+ int active_best_quality;
+ if (gfu_boost > high) {
+ active_best_quality = low_motion_minq[q];
+ } else if (gfu_boost < low) {
+ active_best_quality = high_motion_minq[q];
+ } else {
+ const int gap = high - low;
+ const int offset = high - gfu_boost;
+ const int qdiff = high_motion_minq[q] - low_motion_minq[q];
+ const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+ active_best_quality = low_motion_minq[q] + adjustment;
}
+ return active_best_quality;
}
-static void set_mvcost(MACROBLOCK *mb) {
- if (mb->e_mbd.allow_high_precision_mv) {
+static void set_mvcost(VP9_COMP *cpi) {
+ MACROBLOCK *const mb = &cpi->mb;
+ if (cpi->common.allow_high_precision_mv) {
mb->mvcost = mb->nmvcost_hp;
mb->mvsadcost = mb->nmvsadcost_hp;
} else {
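
get_active_quality() above consolidates several copies of the same interpolation that later hunks delete from the quality-selection code. Between the low and high boost thresholds it blends the two minq tables linearly, and adding (gap >> 1) before the divide rounds to nearest. A standalone copy with invented one-entry tables:

#include <stdio.h>

/* Standalone copy of the get_active_quality() interpolation, with toy
 * minq tables (values invented for the demo). */
static int get_active_quality(int q, int gfu_boost, int low, int high,
                              const int *low_motion_minq,
                              const int *high_motion_minq) {
  if (gfu_boost > high) {
    return low_motion_minq[q];
  } else if (gfu_boost < low) {
    return high_motion_minq[q];
  } else {
    const int gap = high - low;
    const int offset = high - gfu_boost;
    const int qdiff = high_motion_minq[q] - low_motion_minq[q];
    const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
    return low_motion_minq[q] + adjustment;
  }
}

int main(void) {
  const int low_minq[1] = { 20 };
  const int high_minq[1] = { 60 };
  /* A boost halfway between low=400 and high=2000 lands halfway between
   * the two tables: prints 40. */
  printf("%d\n", get_active_quality(0, 1200, 400, 2000, low_minq, high_minq));
  return 0;
}
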
@@ -284,14 +313,17 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
vpx_free(cpi->mb_norm_activity_map);
cpi->mb_norm_activity_map = 0;
- vpx_free(cpi->mb.pip);
- cpi->mb.pip = 0;
+ vpx_free(cpi->above_context[0]);
+ cpi->above_context[0] = NULL;
+
+ vpx_free(cpi->above_seg_context);
+ cpi->above_seg_context = NULL;
}
// Computes a q delta (in "q index" terms) to get from a starting q value
// to a target value
// target q value
-static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
+int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
int i;
int start_index = cpi->worst_quality;
int target_index = cpi->worst_quality;
@@ -355,7 +387,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
seg->update_map = 1;
seg->update_data = 1;
- qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875));
+ qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875));
vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta - 2));
vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
@@ -364,7 +396,6 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
// Where relevant assume segment data is delta data
seg->abs_delta = SEGMENT_DELTADATA;
-
}
} else if (seg->enabled) {
// All other frames if segmentation has been enabled
@@ -377,8 +408,8 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
seg->update_data = 1;
seg->abs_delta = SEGMENT_DELTADATA;
- qi_delta = compute_qdelta(cpi, cpi->avg_q,
- (cpi->avg_q * 1.125));
+ qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q,
+ (cpi->avg_q * 1.125));
vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta + 2));
vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
@@ -421,8 +452,8 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
// Skip all MBs if high Q (0,0 mv and skip coeffs)
if (high_q) {
- vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP);
- vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+ vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP);
}
// Enable data update
seg->update_data = 1;
@@ -565,16 +596,16 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
sf->thresh_mult[THR_NEARESTG] = 0;
sf->thresh_mult[THR_NEARESTA] = 0;
- sf->thresh_mult[THR_NEWMV] += 1000;
- sf->thresh_mult[THR_COMP_NEARESTLA] += 1000;
- sf->thresh_mult[THR_NEARMV] += 1000;
- sf->thresh_mult[THR_COMP_NEARESTGA] += 1000;
-
sf->thresh_mult[THR_DC] += 1000;
- sf->thresh_mult[THR_NEWG] += 1000;
+ sf->thresh_mult[THR_NEWMV] += 1000;
sf->thresh_mult[THR_NEWA] += 1000;
+ sf->thresh_mult[THR_NEWG] += 1000;
+
+ sf->thresh_mult[THR_NEARMV] += 1000;
sf->thresh_mult[THR_NEARA] += 1000;
+ sf->thresh_mult[THR_COMP_NEARESTLA] += 1000;
+ sf->thresh_mult[THR_COMP_NEARESTGA] += 1000;
sf->thresh_mult[THR_TM] += 1000;
@@ -584,19 +615,12 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
sf->thresh_mult[THR_COMP_NEARGA] += 1500;
sf->thresh_mult[THR_COMP_NEWGA] += 2000;
- sf->thresh_mult[THR_SPLITMV] += 2500;
- sf->thresh_mult[THR_SPLITG] += 2500;
- sf->thresh_mult[THR_SPLITA] += 2500;
- sf->thresh_mult[THR_COMP_SPLITLA] += 4500;
- sf->thresh_mult[THR_COMP_SPLITGA] += 4500;
-
sf->thresh_mult[THR_ZEROMV] += 2000;
sf->thresh_mult[THR_ZEROG] += 2000;
sf->thresh_mult[THR_ZEROA] += 2000;
sf->thresh_mult[THR_COMP_ZEROLA] += 2500;
sf->thresh_mult[THR_COMP_ZEROGA] += 2500;
- sf->thresh_mult[THR_B_PRED] += 2500;
sf->thresh_mult[THR_H_PRED] += 2000;
sf->thresh_mult[THR_V_PRED] += 2000;
sf->thresh_mult[THR_D45_PRED ] += 2500;
@@ -606,49 +630,24 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
sf->thresh_mult[THR_D207_PRED] += 2500;
sf->thresh_mult[THR_D63_PRED] += 2500;
- if (cpi->sf.skip_lots_of_modes) {
- for (i = 0; i < MAX_MODES; ++i)
- sf->thresh_mult[i] = INT_MAX;
-
- sf->thresh_mult[THR_DC] = 2000;
- sf->thresh_mult[THR_TM] = 2000;
- sf->thresh_mult[THR_NEWMV] = 4000;
- sf->thresh_mult[THR_NEWG] = 4000;
- sf->thresh_mult[THR_NEWA] = 4000;
- sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_NEARESTG] = 0;
- sf->thresh_mult[THR_NEARESTA] = 0;
- sf->thresh_mult[THR_NEARMV] = 2000;
- sf->thresh_mult[THR_NEARG] = 2000;
- sf->thresh_mult[THR_NEARA] = 2000;
- sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
- sf->thresh_mult[THR_SPLITMV] = 2500;
- sf->thresh_mult[THR_SPLITG] = 2500;
- sf->thresh_mult[THR_SPLITA] = 2500;
- sf->recode_loop = 0;
- }
-
/* disable frame modes if flags not set */
if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
sf->thresh_mult[THR_NEWMV ] = INT_MAX;
sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
sf->thresh_mult[THR_ZEROMV ] = INT_MAX;
sf->thresh_mult[THR_NEARMV ] = INT_MAX;
- sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
}
if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
sf->thresh_mult[THR_ZEROG ] = INT_MAX;
sf->thresh_mult[THR_NEARG ] = INT_MAX;
sf->thresh_mult[THR_NEWG ] = INT_MAX;
- sf->thresh_mult[THR_SPLITG ] = INT_MAX;
}
if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
sf->thresh_mult[THR_ZEROA ] = INT_MAX;
sf->thresh_mult[THR_NEARA ] = INT_MAX;
sf->thresh_mult[THR_NEWA ] = INT_MAX;
- sf->thresh_mult[THR_SPLITA ] = INT_MAX;
}
if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
@@ -657,7 +656,6 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
sf->thresh_mult[THR_COMP_NEARLA ] = INT_MAX;
sf->thresh_mult[THR_COMP_NEWLA ] = INT_MAX;
- sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX;
}
if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
(VP9_GOLD_FLAG | VP9_ALT_FLAG)) {
@@ -665,17 +663,42 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
sf->thresh_mult[THR_COMP_NEARGA ] = INT_MAX;
sf->thresh_mult[THR_COMP_NEWGA ] = INT_MAX;
- sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX;
}
+}
+
+static void set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi, int mode) {
+ SPEED_FEATURES *sf = &cpi->sf;
+ int i;
+
+ for (i = 0; i < MAX_REFS; ++i)
+ sf->thresh_mult_sub8x8[i] = mode == 0 ? -500 : 0;
- if (sf->disable_splitmv == 1) {
- sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
- sf->thresh_mult[THR_SPLITG ] = INT_MAX;
- sf->thresh_mult[THR_SPLITA ] = INT_MAX;
+ sf->thresh_mult_sub8x8[THR_LAST] += 2500;
+ sf->thresh_mult_sub8x8[THR_GOLD] += 2500;
+ sf->thresh_mult_sub8x8[THR_ALTR] += 2500;
+ sf->thresh_mult_sub8x8[THR_INTRA] += 2500;
+ sf->thresh_mult_sub8x8[THR_COMP_LA] += 4500;
+ sf->thresh_mult_sub8x8[THR_COMP_GA] += 4500;
- sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX;
- sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX;
+ // Check for masked out split cases.
+ for (i = 0; i < MAX_REFS; i++) {
+ if (sf->disable_split_mask & (1 << i))
+ sf->thresh_mult_sub8x8[i] = INT_MAX;
}
+
+  // Disable the mode test if the corresponding reference frame flag is not set.
+ if (!(cpi->ref_frame_flags & VP9_LAST_FLAG))
+ sf->thresh_mult_sub8x8[THR_LAST] = INT_MAX;
+ if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG))
+ sf->thresh_mult_sub8x8[THR_GOLD] = INT_MAX;
+ if (!(cpi->ref_frame_flags & VP9_ALT_FLAG))
+ sf->thresh_mult_sub8x8[THR_ALTR] = INT_MAX;
+ if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
+ (VP9_LAST_FLAG | VP9_ALT_FLAG))
+ sf->thresh_mult_sub8x8[THR_COMP_LA] = INT_MAX;
+ if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
+ (VP9_GOLD_FLAG | VP9_ALT_FLAG))
+ sf->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX;
}
void vp9_set_speed_features(VP9_COMP *cpi) {
@@ -688,12 +711,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
if (mode > 1)
mode = 1;
- // Initialise default mode frequency sampling variables
- for (i = 0; i < MAX_MODES; i ++) {
- cpi->mode_check_freq[i] = 0;
- cpi->mode_test_hit_counts[i] = 0;
+ for (i = 0; i < MAX_MODES; ++i)
cpi->mode_chosen_counts[i] = 0;
- }
// best quality defaults
sf->RD = 1;
@@ -708,30 +727,28 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
sf->comp_inter_joint_search_thresh = BLOCK_4X4;
sf->adaptive_rd_thresh = 0;
- sf->use_lastframe_partitioning = 0;
+ sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF;
sf->tx_size_search_method = USE_FULL_RD;
sf->use_lp32x32fdct = 0;
sf->adaptive_motion_search = 0;
sf->use_avoid_tested_higherror = 0;
sf->reference_masking = 0;
- sf->skip_lots_of_modes = 0;
- sf->partition_by_variance = 0;
sf->use_one_partition_size_always = 0;
sf->less_rectangular_check = 0;
sf->use_square_partition_only = 0;
sf->auto_min_max_partition_size = 0;
- sf->auto_min_max_partition_interval = 0;
- sf->auto_min_max_partition_count = 0;
sf->max_partition_size = BLOCK_64X64;
sf->min_partition_size = BLOCK_4X4;
sf->adjust_partitioning_from_last_frame = 0;
sf->last_partitioning_redo_frequency = 4;
- sf->disable_splitmv = 0;
+ sf->disable_split_mask = 0;
sf->mode_search_skip_flags = 0;
sf->disable_split_var_thresh = 0;
sf->disable_filter_search_var_thresh = 0;
- sf->intra_y_mode_mask = ALL_INTRA_MODES;
- sf->intra_uv_mode_mask = ALL_INTRA_MODES;
+ for (i = 0; i < TX_SIZES; i++) {
+ sf->intra_y_mode_mask[i] = ALL_INTRA_MODES;
+ sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES;
+ }
sf->use_rd_breakout = 0;
sf->skip_encode_sb = 0;
sf->use_uv_intra_rd_estimate = 0;
@@ -747,8 +764,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->static_segmentation = 0;
#endif
+ sf->variance_adaptive_quantization = 0;
+
switch (mode) {
- case 0: // best quality mode
+ case 0: // This is the best quality mode.
break;
case 1:
@@ -759,121 +778,148 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->static_segmentation = 0;
#endif
sf->use_avoid_tested_higherror = 1;
- sf->adaptive_rd_thresh = MIN((speed + 1), 4);
+ sf->adaptive_rd_thresh = 1;
+ sf->recode_loop = (speed < 1);
if (speed == 1) {
- sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+ sf->use_square_partition_only = !frame_is_intra_only(&cpi->common);
+ sf->less_rectangular_check = 1;
+ sf->tx_size_search_method = frame_is_intra_only(&cpi->common)
+ ? USE_FULL_RD : USE_LARGESTALL;
+
+ if (MIN(cpi->common.width, cpi->common.height) >= 720)
+ sf->disable_split_mask = cpi->common.show_frame ?
+ DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ else
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+
+ sf->use_rd_breakout = 1;
+ sf->adaptive_motion_search = 1;
+ sf->auto_mv_step_size = 1;
+ sf->adaptive_rd_thresh = 2;
+ sf->recode_loop = 2;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ }
+ if (speed == 2) {
+ sf->use_square_partition_only = !frame_is_intra_only(&cpi->common);
sf->less_rectangular_check = 1;
- sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
- cpi->common.intra_only ||
- cpi->common.show_frame == 0) ?
- USE_FULL_RD :
- USE_LARGESTALL);
- sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME ||
- cpi->common.intra_only ||
- cpi->common.show_frame == 0);
- sf->disable_splitmv =
- (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
+ sf->tx_size_search_method = frame_is_intra_only(&cpi->common)
+ ? USE_FULL_RD : USE_LARGESTALL;
+
+ if (MIN(cpi->common.width, cpi->common.height) >= 720)
+ sf->disable_split_mask = cpi->common.show_frame ?
+ DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ else
+ sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+
sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
FLAG_SKIP_INTRA_LOWVAR;
- sf->use_uv_intra_rd_estimate = 1;
+
sf->use_rd_breakout = 1;
- sf->skip_encode_sb = 1;
- sf->use_lp32x32fdct = 1;
sf->adaptive_motion_search = 1;
sf->auto_mv_step_size = 1;
- sf->auto_min_max_partition_size = 1;
- sf->auto_min_max_partition_interval = 1;
- // FIXME(jingning): temporarily turn off disable_split_var_thresh
- // during refactoring process. will get this back after finishing
- // the main framework of partition search type.
- sf->disable_split_var_thresh = 0;
sf->disable_filter_search_var_thresh = 16;
-
- sf->intra_y_mode_mask = INTRA_DC_TM_H_V;
- sf->intra_uv_mode_mask = INTRA_DC_TM_H_V;
- sf->use_fast_coef_updates = 1;
- sf->mode_skip_start = 9;
- }
- if (speed == 2) {
- sf->less_rectangular_check = 1;
- sf->use_square_partition_only = 1;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
- sf->use_lastframe_partitioning = 1;
+
+ sf->auto_min_max_partition_size = 1;
+ sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
sf->adjust_partitioning_from_last_frame = 1;
sf->last_partitioning_redo_frequency = 3;
- sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
- cpi->common.intra_only ||
- cpi->common.show_frame == 0) ?
- USE_FULL_RD :
- USE_LARGESTALL);
+
+ sf->adaptive_rd_thresh = 2;
+ sf->recode_loop = 2;
+ sf->mode_skip_start = 11;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ }
+ if (speed == 3) {
+ sf->use_square_partition_only = 1;
+ sf->tx_size_search_method = USE_LARGESTALL;
+
+ if (MIN(cpi->common.width, cpi->common.height) >= 720)
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ else
+ sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+
sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
- FLAG_SKIP_COMP_REFMISMATCH |
- FLAG_SKIP_INTRA_LOWVAR |
- FLAG_EARLY_TERMINATE;
- sf->intra_y_mode_mask = INTRA_DC_TM;
- sf->intra_uv_mode_mask = INTRA_DC_TM;
- sf->use_uv_intra_rd_estimate = 1;
+ FLAG_SKIP_INTRA_LOWVAR;
+
sf->use_rd_breakout = 1;
- sf->skip_encode_sb = 1;
- sf->use_lp32x32fdct = 1;
sf->adaptive_motion_search = 1;
- sf->using_small_partition_info = 0;
- sf->disable_splitmv =
- (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
sf->auto_mv_step_size = 1;
- sf->search_method = SQUARE;
- sf->subpel_iters_per_step = 1;
- sf->use_fast_lpf_pick = 1;
+
+ sf->disable_filter_search_var_thresh = 16;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+
sf->auto_min_max_partition_size = 1;
- sf->auto_min_max_partition_interval = 2;
- sf->disable_split_var_thresh = 32;
- sf->disable_filter_search_var_thresh = 32;
+ sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
+ sf->adjust_partitioning_from_last_frame = 1;
+ sf->last_partitioning_redo_frequency = 3;
+
+ sf->use_uv_intra_rd_estimate = 1;
+ sf->skip_encode_sb = 1;
+ sf->use_lp32x32fdct = 1;
+ sf->subpel_iters_per_step = 1;
sf->use_fast_coef_updates = 2;
- sf->mode_skip_start = 9;
+
+ sf->adaptive_rd_thresh = 4;
+ sf->mode_skip_start = 6;
}
- if (speed == 3) {
- sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
- sf->partition_by_variance = 1;
- sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
- cpi->common.intra_only ||
- cpi->common.show_frame == 0) ?
- USE_FULL_RD :
- USE_LARGESTALL);
+ if (speed == 4) {
+ sf->use_square_partition_only = 1;
+ sf->tx_size_search_method = USE_LARGESTALL;
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+
sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
FLAG_SKIP_COMP_REFMISMATCH |
FLAG_SKIP_INTRA_LOWVAR |
FLAG_EARLY_TERMINATE;
+
sf->use_rd_breakout = 1;
+ sf->adaptive_motion_search = 1;
+ sf->auto_mv_step_size = 1;
+
+ sf->disable_filter_search_var_thresh = 16;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+
+ sf->auto_min_max_partition_size = 1;
+ sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
+ sf->adjust_partitioning_from_last_frame = 1;
+ sf->last_partitioning_redo_frequency = 3;
+
+ sf->use_uv_intra_rd_estimate = 1;
sf->skip_encode_sb = 1;
sf->use_lp32x32fdct = 1;
- sf->disable_splitmv = 1;
- sf->auto_mv_step_size = 1;
- sf->search_method = BIGDIA;
sf->subpel_iters_per_step = 1;
- sf->disable_split_var_thresh = 64;
- sf->disable_filter_search_var_thresh = 64;
- sf->intra_y_mode_mask = INTRA_DC_ONLY;
- sf->intra_uv_mode_mask = INTRA_DC_ONLY;
sf->use_fast_coef_updates = 2;
- sf->mode_skip_start = 9;
+
+ sf->adaptive_rd_thresh = 4;
+ sf->mode_skip_start = 6;
+
+ /* sf->intra_y_mode_mask = INTRA_DC_ONLY;
+ sf->intra_uv_mode_mask = INTRA_DC_ONLY;
+ sf->search_method = BIGDIA;
+ sf->disable_split_var_thresh = 64;
+ sf->disable_filter_search_var_thresh = 64; */
}
- if (speed == 4) {
+ if (speed == 5) {
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->use_one_partition_size_always = 1;
sf->always_this_block_size = BLOCK_16X16;
- sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
- cpi->common.intra_only ||
- cpi->common.show_frame == 0) ?
- USE_FULL_RD :
- USE_LARGESTALL);
+ sf->tx_size_search_method = frame_is_intra_only(&cpi->common) ?
+ USE_FULL_RD : USE_LARGESTALL;
sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
@@ -887,22 +933,25 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
// sf->reduce_first_step_size = 1;
// sf->reference_masking = 1;
- sf->disable_splitmv = 1;
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
sf->search_method = HEX;
sf->subpel_iters_per_step = 1;
sf->disable_split_var_thresh = 64;
sf->disable_filter_search_var_thresh = 96;
- sf->intra_y_mode_mask = INTRA_DC_ONLY;
- sf->intra_uv_mode_mask = INTRA_DC_ONLY;
+ for (i = 0; i < TX_SIZES; i++) {
+ sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
+ sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+ }
sf->use_fast_coef_updates = 2;
- sf->mode_skip_start = 9;
+ sf->adaptive_rd_thresh = 4;
+ sf->mode_skip_start = 6;
}
break;
-
}; /* switch */
// Set rd thresholds based on mode and speed setting
set_rd_speed_thresholds(cpi, mode);
+ set_rd_speed_thresholds_sub8x8(cpi, mode);
// Slow quant, dct and trellis not worthwhile for first pass
// so make sure they are always turned off.
@@ -910,16 +959,16 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->optimize_coefficients = 0;
}
- cpi->mb.fwd_txm16x16 = vp9_short_fdct16x16;
- cpi->mb.fwd_txm8x8 = vp9_short_fdct8x8;
- cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
- cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
- if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
- cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
- cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
+ // No recode for 1 pass.
+ if (cpi->pass == 0) {
+ sf->recode_loop = 0;
+ sf->optimize_coefficients = 0;
}
- cpi->mb.quantize_b_4x4 = vp9_regular_quantize_b_4x4;
+ cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
+ if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
+ cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
+ }
if (cpi->sf.subpel_search_method == SUBPEL_ITERATIVE) {
cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_iterative;
@@ -954,20 +1003,6 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
"Failed to allocate altref buffer");
}
-static int alloc_partition_data(VP9_COMP *cpi) {
- vpx_free(cpi->mb.pip);
-
- cpi->mb.pip = vpx_calloc(cpi->common.mode_info_stride *
- (cpi->common.mi_rows + MI_BLOCK_SIZE),
- sizeof(PARTITION_INFO));
- if (!cpi->mb.pip)
- return 1;
-
- cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1;
-
- return 0;
-}
-
void vp9_alloc_compressor_data(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
@@ -975,10 +1010,6 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffers");
- if (alloc_partition_data(cpi))
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate partition data");
-
if (vp9_alloc_frame_buffer(&cpi->last_frame_uf,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
@@ -1001,11 +1032,6 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
}
- // Data used for real time vc mode to see if gf needs refreshing
- cpi->inter_zz_count = 0;
- cpi->gf_bad_count = 0;
- cpi->gf_update_recommended = 0;
-
vpx_free(cpi->mb_activity_map);
CHECK_MEM_ERROR(cm, cpi->mb_activity_map,
vpx_calloc(sizeof(unsigned int),
@@ -1015,6 +1041,19 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
CHECK_MEM_ERROR(cm, cpi->mb_norm_activity_map,
vpx_calloc(sizeof(unsigned int),
cm->mb_rows * cm->mb_cols));
+
+ // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
+  // block, where the mi unit size is 8x8.
+ vpx_free(cpi->above_context[0]);
+ CHECK_MEM_ERROR(cm, cpi->above_context[0],
+ vpx_calloc(2 * mi_cols_aligned_to_sb(cm->mi_cols) *
+ MAX_MB_PLANE,
+ sizeof(*cpi->above_context[0])));
+
+ vpx_free(cpi->above_seg_context);
+ CHECK_MEM_ERROR(cm, cpi->above_seg_context,
+ vpx_calloc(mi_cols_aligned_to_sb(cm->mi_cols),
+ sizeof(*cpi->above_seg_context)));
}
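
This hunk replaces the per-macroblock pip allocation with one flat above_context buffer that update_frame_size() (next hunk) slices into MAX_MB_PLANE plane pointers. Note the slicing multiplies by sizeof(*cpi->above_context[0]) inside pointer arithmetic, which only stays correct while ENTROPY_CONTEXT is a one-byte type, as it is in vp9_blockd.h. A toy version of the layout, with an invented mi_cols standing in for mi_cols_aligned_to_sb(cm->mi_cols):

#include <stdio.h>
#include <stdlib.h>

typedef char ENTROPY_CONTEXT;  /* one byte per 4x4 txfm unit, per vp9_blockd.h */
#define MAX_MB_PLANE 3

/* One flat calloc, sliced into MAX_MB_PLANE plane pointers, 2 contexts
 * per mi column (an mi unit is 8x8, so two 4x4 contexts span it). */
int main(void) {
  const int mi_cols = 16;  /* invented for the demo */
  const int per_plane = 2 * mi_cols;
  ENTROPY_CONTEXT *base = calloc(MAX_MB_PLANE * per_plane, sizeof(*base));
  ENTROPY_CONTEXT *plane[MAX_MB_PLANE];
  int i;
  for (i = 0; i < MAX_MB_PLANE; i++)
    plane[i] = base + i * per_plane;  /* plane slices of one buffer */
  printf("plane offsets: %d %d %d\n",
         (int)(plane[0] - base), (int)(plane[1] - base),
         (int)(plane[2] - base));
  free(base);
  return 0;
}
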
@@ -1047,13 +1086,18 @@ static void update_frame_size(VP9_COMP *cpi) {
vp9_init_dsmotion_compensation(&cpi->mb, y_stride);
}
}
+
+ {
+ int i;
+ for (i = 1; i < MAX_MB_PLANE; ++i) {
+ cpi->above_context[i] = cpi->above_context[0] +
+ i * sizeof(*cpi->above_context[0]) * 2 *
+ mi_cols_aligned_to_sb(cm->mi_cols);
+ }
+ }
}
-// TODO perhaps change number of steps expose to outside world when setting
-// max and min limits. Also this will likely want refining for the extended Q
-// range.
-//
// Table that converts 0-63 Q range values passed in outside to the Qindex
// range used internally.
static const int q_trans[] = {
@@ -1080,11 +1124,14 @@ void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
if (framerate < 0.1)
framerate = 30;
- cpi->oxcf.framerate = framerate;
- cpi->output_framerate = cpi->oxcf.framerate;
- cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate);
- cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate);
- cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+ cpi->oxcf.framerate = framerate;
+ cpi->output_framerate = cpi->oxcf.framerate;
+ cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth
+ / cpi->output_framerate);
+ cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth
+ / cpi->output_framerate);
+ cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
+ cpi->oxcf.two_pass_vbrmin_section / 100);
cpi->min_frame_bandwidth = MAX(cpi->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
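
The framerate block above is only rewrapped, not changed. As a quick numeric check of the budget math it computes (all figures invented):

#include <stdio.h>

int main(void) {
  const double target_bandwidth = 2000000.0;  /* 2 Mbps, assumed */
  const double framerate = 30.0;
  const int two_pass_vbrmin_section = 5;      /* percent, assumed */
  const int per_frame = (int)(target_bandwidth / framerate);
  const int min_frame = per_frame * two_pass_vbrmin_section / 100;
  printf("per-frame budget %d bits, vbr floor %d bits\n",
         per_frame, min_frame);  /* 66666 and 3333 */
  return 0;
}
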
@@ -1133,7 +1180,6 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
int i;
cpi->oxcf = *oxcf;
- cpi->goldfreq = 7;
cm->version = oxcf->version;
@@ -1195,6 +1241,12 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
switch (cpi->oxcf.Mode) {
// Real time and one pass deprecated in test code base
+ case MODE_GOODQUALITY:
+ cpi->pass = 0;
+ cpi->compressor_speed = 2;
+ cpi->oxcf.cpu_used = clamp(cpi->oxcf.cpu_used, -5, 5);
+ break;
+
case MODE_FIRSTPASS:
cpi->pass = 1;
cpi->compressor_speed = 1;
@@ -1217,14 +1269,8 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
cpi->oxcf.lossless = oxcf->lossless;
- if (cpi->oxcf.lossless) {
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
- } else {
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
- }
-
+ cpi->mb.e_mbd.itxm_add = cpi->oxcf.lossless ? vp9_iwht4x4_add
+ : vp9_idct4x4_add;
cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
@@ -1237,8 +1283,8 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
cm->reset_frame_context = 0;
setup_features(cm);
- cpi->mb.e_mbd.allow_high_precision_mv = 0; // Default mv precision adaptation
- set_mvcost(&cpi->mb);
+ cpi->common.allow_high_precision_mv = 0; // Default mv precision
+ set_mvcost(cpi);
{
int i;
@@ -1390,6 +1436,94 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
} while (++i <= MV_MAX);
}
+static void init_pick_mode_context(VP9_COMP *cpi) {
+ int i;
+ MACROBLOCK *x = &cpi->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ VP9_COMMON *cm = &cpi->common;
+
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[i];
+ const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
+ if (i < BLOCK_16X16) {
+ for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+ for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
+ for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ ctx->num_4x4_blk = num_4x4_blk;
+ CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+ vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ }
+ }
+ }
+ } else if (i < BLOCK_32X32) {
+ for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+ for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
+ ++xd->mb_index) {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ ctx->num_4x4_blk = num_4x4_blk;
+ CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+ vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ }
+ }
+ } else if (i < BLOCK_64X64) {
+ for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ ctx->num_4x4_blk = num_4x4_blk;
+ CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+ vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ }
+ } else {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ ctx->num_4x4_blk = num_4x4_blk;
+ CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+ vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ }
+ }
+}
+
+static void free_pick_mode_context(MACROBLOCK *x) {
+ int i;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[i];
+ const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
+ if (i < BLOCK_16X16) {
+ for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+ for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
+ for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ vpx_free(ctx->zcoeff_blk);
+ ctx->zcoeff_blk = 0;
+ }
+ }
+ }
+ } else if (i < BLOCK_32X32) {
+ for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+ for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
+ ++xd->mb_index) {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ vpx_free(ctx->zcoeff_blk);
+ ctx->zcoeff_blk = 0;
+ }
+ }
+ } else if (i < BLOCK_64X64) {
+ for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ vpx_free(ctx->zcoeff_blk);
+ ctx->zcoeff_blk = 0;
+ }
+ } else {
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+ vpx_free(ctx->zcoeff_blk);
+ ctx->zcoeff_blk = 0;
+ }
+ }
+}
+
VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
int i, j;
volatile union {
@@ -1426,6 +1560,8 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
init_config((VP9_PTR)cpi, oxcf);
+ init_pick_mode_context(cpi);
+
cm->current_video_frame = 0;
cpi->kf_overspend_bits = 0;
cpi->kf_bitrate_adjustment = 0;
@@ -1478,7 +1614,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
/*Initialize the feed-forward activity masking.*/
cpi->activity_avg = 90 << 12;
- cpi->frames_since_key = 8; // Give a sensible default for the first frame.
+ cpi->frames_since_key = 8; // Sensible default for first frame.
cpi->key_frame_frequency = cpi->oxcf.key_freq;
cpi->this_key_frame_forced = 0;
cpi->next_key_frame_forced = 0;
@@ -1599,9 +1735,12 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
vp9_set_speed_features(cpi);
// Default rd threshold factors for mode selection
- for (i = 0; i < BLOCK_SIZES; ++i)
+ for (i = 0; i < BLOCK_SIZES; ++i) {
for (j = 0; j < MAX_MODES; ++j)
cpi->rd_thresh_freq_fact[i][j] = 32;
+ for (j = 0; j < MAX_REFS; ++j)
+ cpi->rd_thresh_freq_sub8x8[i][j] = 32;
+ }
#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
SDX3F, SDX8F, SDX4DF)\
@@ -1712,10 +1851,10 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
cpi->common.error.setjmp = 0;
- vp9_zero(cpi->y_uv_mode_count)
+ vp9_zero(cpi->y_uv_mode_count);
#ifdef MODE_TEST_HIT_STATS
- vp9_zero(cpi->mode_test_hits)
+ vp9_zero(cpi->mode_test_hits);
#endif
return (VP9_PTR) cpi;
@@ -1757,8 +1896,10 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
FILE *f = fopen("opsnr.stt", "a");
double time_encoded = (cpi->last_end_time_stamp_seen
- cpi->first_time_stamp_ever) / 10000000.000;
- double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
- double dr = (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded;
+ double total_encode_time = (cpi->time_receive_data +
+ cpi->time_compress_data) / 1000.000;
+ double dr = (double)cpi->bytes * (double) 8 / (double)1000
+ / time_encoded;
if (cpi->b_calculate_psnr) {
YV12_BUFFER_CONFIG *lst_yv12 =
@@ -1778,20 +1919,15 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
dr, cpi->total / cpi->count, total_psnr,
cpi->totalp / cpi->count, total_psnr2, total_ssim, total_ssimp,
total_encode_time);
-// fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n",
-// dr, cpi->total / cpi->count, total_psnr,
-// cpi->totalp / cpi->count, total_psnr2, total_ssim,
-// total_encode_time, cpi->tot_recode_hits);
}
if (cpi->b_calculate_ssimg) {
fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t Time(ms)\n");
fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
- cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
- cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time);
-// fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f %10ld\n", dr,
-// cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
-// cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time, cpi->tot_recode_hits);
+ cpi->total_ssimg_y / cpi->count,
+ cpi->total_ssimg_u / cpi->count,
+ cpi->total_ssimg_v / cpi->count,
+ cpi->total_ssimg_all / cpi->count, total_encode_time);
}
fclose(f);
@@ -1838,11 +1974,9 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
"[INTRA_MODES] =\n{\n");
for (i = 0; i < INTRA_MODES; i++) {
-
fprintf(fmode, " { // Above Mode : %d\n", i);
for (j = 0; j < INTRA_MODES; j++) {
-
fprintf(fmode, " {");
for (k = 0; k < INTRA_MODES; k++) {
@@ -1853,11 +1987,9 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
}
fprintf(fmode, "}, // left_mode %d\n", j);
-
}
fprintf(fmode, " },\n");
-
}
fprintf(fmode, "};\n");
@@ -1891,14 +2023,15 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
(cpi->time_receive_data + cpi->time_compress_data) / 1000);
}
#endif
-
}
+ free_pick_mode_context(&cpi->mb);
dealloc_compressor_data(cpi);
vpx_free(cpi->mb.ss);
vpx_free(cpi->tok);
- for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]); i++) {
+ for (i = 0; i < sizeof(cpi->mbgraph_stats) /
+ sizeof(cpi->mbgraph_stats[0]); ++i) {
vpx_free(cpi->mbgraph_stats[i].mb_stats);
}
@@ -1925,7 +2058,6 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
fclose(kf_list);
#endif
-
}
@@ -2246,14 +2378,15 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
cpi->frames_since_golden = 0;
// ******** Fixed Q test code only ************
- // If we are going to use the ALT reference for the next group of frames set a flag to say so.
+ // If we are going to use the ALT reference for the next group of frames
+ // set a flag to say so.
if (cpi->oxcf.fixed_q >= 0 &&
cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) {
cpi->source_alt_ref_pending = 1;
cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
- // TODO(ivan): for SVC encoder, GF automatic update is disabled by using a
- // large GF_interval
+ // TODO(ivan): For SVC encoder, GF automatic update is disabled by using
+ // a large GF_interval.
if (cpi->use_svc) {
cpi->frames_till_gf_update_due = INT_MAX;
}
@@ -2293,12 +2426,12 @@ static int find_fp_qindex() {
return i;
}
-static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) {
+static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest,
+ unsigned int *frame_flags) {
(void) size;
(void) dest;
(void) frame_flags;
-
vp9_set_quantizer(cpi, find_fp_qindex());
vp9_first_pass(cpi);
}
@@ -2306,13 +2439,11 @@ static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest,
#define WRITE_RECON_BUFFER 0
#if WRITE_RECON_BUFFER
void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-
- // write the frame
FILE *yframe;
int i;
char filename[255];
- sprintf(filename, "cx\\y%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename), "cx\\y%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->y_height; i++)
@@ -2320,7 +2451,7 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
frame->y_width, 1, yframe);
fclose(yframe);
- sprintf(filename, "cx\\u%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename), "cx\\u%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->uv_height; i++)
@@ -2328,7 +2459,7 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
frame->uv_width, 1, yframe);
fclose(yframe);
- sprintf(filename, "cx\\v%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename), "cx\\v%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->uv_height; i++)
@@ -2350,8 +2481,10 @@ static double compute_edge_pixel_proportion(YV12_BUFFER_CONFIG *frame) {
for (i = 1; i < frame->y_height - 1; i++) {
for (j = 1; j < frame->y_width - 1; j++) {
/* Sobel hor and ver gradients */
- int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) + (next[1] - next[-1]);
- int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + (prev[-1] - next[-1]);
+ int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) +
+ (next[1] - next[-1]);
+ int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) +
+ (prev[-1] - next[-1]);
h = (h < 0 ? -h : h);
v = (v < 0 ? -v : v);
if (h > EDGE_THRESH || v > EDGE_THRESH)
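
The v and h gradients in this hunk are the 3x3 Sobel kernels written with row pointers (prev/curr/next are consecutive luma rows). A standalone check on a vertical step edge, which should produce a large |v| and h == 0:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const int rows[3][3] = { { 0, 0, 255 },    /* prev */
                           { 0, 0, 255 },    /* curr */
                           { 0, 0, 255 } };  /* next */
  const int *prev = &rows[0][1];
  const int *curr = &rows[1][1];
  const int *next = &rows[2][1];
  const int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) +
                (next[1] - next[-1]);        /* horizontal derivative */
  const int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) +
                (prev[-1] - next[-1]);       /* vertical derivative */
  printf("|v|=%d |h|=%d\n", abs(v), abs(h)); /* 1020 and 0 */
  return 0;
}
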
@@ -2387,10 +2520,9 @@ static int recode_loop_test(VP9_COMP *cpi,
if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
((cpi->projected_frame_size < low_limit) && (q > minq))) {
force_recode = 1;
- }
- // Special Constrained quality tests
- else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
- // Undershoot and below auto cq level
+ } else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+ // Deal with frame undershoot and whether or not we are
+ // below the automatically set cq level.
if (q > cpi->cq_target_quality &&
cpi->projected_frame_size < ((cpi->this_frame_target * 7) >> 3)) {
force_recode = 1;
@@ -2551,159 +2683,81 @@ static void full_to_model_counts(
}
}
+#if 0 && CONFIG_INTERNAL_STATS
+static void output_frame_level_debug_stats(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
+ int recon_err;
-static void encode_frame_to_data_rate(VP9_COMP *cpi,
- unsigned long *size,
- unsigned char *dest,
- unsigned int *frame_flags) {
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
- TX_SIZE t;
- int q;
- int frame_over_shoot_limit;
- int frame_under_shoot_limit;
-
- int loop = 0;
- int loop_count;
-
- int q_low;
- int q_high;
-
- int top_index;
- int bottom_index;
- int active_worst_qchanged = 0;
-
- int overshoot_seen = 0;
- int undershoot_seen = 0;
-
- SPEED_FEATURES *sf = &cpi->sf;
- unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
- struct segmentation *seg = &cm->seg;
-
- /* Scale the source buffer, if required */
- if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
- cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
- scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);
- cpi->Source = &cpi->scaled_source;
- } else {
- cpi->Source = cpi->un_scaled_source;
- }
-
- scale_references(cpi);
-
- // Clear down mmx registers to allow floating point in what follows
- vp9_clear_system_state();
-
-
- // For an alt ref frame in 2 pass we skip the call to the second
- // pass function that sets the target bandwidth so must set it here
- if (cpi->refresh_alt_ref_frame) {
- // Per frame bit target for the alt ref frame
- cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
- // per second target bitrate
- cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
- cpi->output_framerate);
- }
-
- // Clear zbin over-quant value and mode boost values.
- cpi->zbin_mode_boost = 0;
-
- // Enable or disable mode based tweaking of the zbin
- // For 2 Pass Only used where GF/ARF prediction quality
- // is above a threshold
- cpi->zbin_mode_boost = 0;
-
- // if (cpi->oxcf.lossless)
- cpi->zbin_mode_boost_enabled = 0;
- // else
- // cpi->zbin_mode_boost_enabled = 1;
-
- // Current default encoder behaviour for the altref sign bias
- cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->source_alt_ref_active;
-
- // Check to see if a key frame is signaled
- // For two pass with auto key frame enabled cm->frame_type may already be set, but not for one pass.
- if ((cm->current_video_frame == 0) ||
- (cm->frame_flags & FRAMEFLAGS_KEY) ||
- (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0))) {
- // Key frame from VFW/auto-keyframe/first frame
- cm->frame_type = KEY_FRAME;
- }
+ vp9_clear_system_state(); // __asm emms;
- // Set default state for segment based loop filter update flags
- cm->lf.mode_ref_delta_update = 0;
+ recon_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
+
+ if (cpi->twopass.total_left_stats.coded_error != 0.0)
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
+ "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
+ "%6d %6d %5d %5d %5d %8.2f %10d %10.3f"
+ "%10.3f %8d %10d %10d %10d\n",
+ cpi->common.current_video_frame, cpi->this_frame_target,
+ cpi->projected_frame_size, 0,
+ (cpi->projected_frame_size - cpi->this_frame_target),
+ (int)cpi->total_target_vs_actual,
+ (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
+ (int)cpi->total_actual_bits, cm->base_qindex,
+ vp9_convert_qindex_to_q(cm->base_qindex),
+ (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
+ vp9_convert_qindex_to_q(cpi->active_best_quality),
+ vp9_convert_qindex_to_q(cpi->active_worst_quality), cpi->avg_q,
+ vp9_convert_qindex_to_q(cpi->ni_av_qi),
+ vp9_convert_qindex_to_q(cpi->cq_target_quality),
+ cpi->refresh_last_frame, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost,
+ cpi->twopass.est_max_qcorrection_factor, (int)cpi->twopass.bits_left,
+ cpi->twopass.total_left_stats.coded_error,
+ (double)cpi->twopass.bits_left /
+ (1 + cpi->twopass.total_left_stats.coded_error),
+ cpi->tot_recode_hits, recon_err, cpi->kf_boost, cpi->kf_zeromotion_pct);
- // Initialize cpi->mv_step_param to default based on max resolution
- cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def);
- // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
- if (sf->auto_mv_step_size) {
- if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only) {
- // initialize max_mv_magnitude for use in the first INTER frame
- // after a key/intra-only frame
- cpi->max_mv_magnitude = max_mv_def;
- } else {
- if (cm->show_frame)
- // allow mv_steps to correspond to twice the max mv magnitude found
- // in the previous frame, capped by the default max_mv_magnitude based
- // on resolution
- cpi->mv_step_param = vp9_init_search_range(
- cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
- cpi->max_mv_magnitude = 0;
- }
- }
+ fclose(f);
- // Set various flags etc to special state if it is a key frame
- if (cm->frame_type == KEY_FRAME) {
- // Reset the loop filter deltas and segmentation map
- setup_features(cm);
+ if (0) {
+ FILE *const fmodes = fopen("Modes.stt", "a");
+ int i;
- // If segmentation is enabled force a map update for key frames
- if (seg->enabled) {
- seg->update_map = 1;
- seg->update_data = 1;
- }
+ fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame,
+ cm->frame_type, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame);
- // The alternate reference frame cannot be active for a key frame
- cpi->source_alt_ref_active = 0;
+ for (i = 0; i < MAX_MODES; ++i)
+ fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+ for (i = 0; i < MAX_REFS; ++i)
+ fprintf(fmodes, "%5d ", cpi->sub8x8_mode_chosen_counts[i]);
- cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
- cm->frame_parallel_decoding_mode =
- (cpi->oxcf.frame_parallel_decoding_mode != 0);
- if (cm->error_resilient_mode) {
- cm->frame_parallel_decoding_mode = 1;
- cm->reset_frame_context = 0;
- cm->refresh_frame_context = 0;
- }
- }
+ fprintf(fmodes, "\n");
- // Configure experimental use of segmentation for enhanced coding of
- // static regions if indicated.
- // Only allowed for now in second pass of two pass (as requires lagged coding)
- // and if the relevant speed feature flag is set.
- if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {
- configure_static_seg_features(cpi);
+ fclose(fmodes);
}
+}
+#endif
- // Decide how big to make the frame
- vp9_pick_frame_size(cpi);
-
- vp9_clear_system_state();
-
+static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
+                                      int *bottom_index, int *top_index) {
// Set an active best quality and if necessary active worst quality
- q = cpi->active_worst_quality;
+ int q = cpi->active_worst_quality;
+ VP9_COMMON *const cm = &cpi->common;
- if (cm->frame_type == KEY_FRAME) {
+ if (frame_is_intra_only(cm)) {
#if !CONFIG_MULTIPLE_ARF
- // Special case for key frames forced because we have reached
- // the maximum key frame interval. Here force the Q to a range
- // based on the ambient Q to reduce the risk of popping
+      // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
if (cpi->this_key_frame_forced) {
int delta_qindex;
int qindex = cpi->last_boosted_qindex;
double last_boosted_q = vp9_convert_qindex_to_q(qindex);
- delta_qindex = compute_qdelta(cpi, last_boosted_q,
- (last_boosted_q * 0.75));
+ delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q,
+ (last_boosted_q * 0.75));
cpi->active_best_quality = MAX(qindex + delta_qindex,
cpi->best_quality);
@@ -2714,18 +2768,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
double q_val;
// Baseline value derived from cpi->active_worst_quality and kf boost
- if (cpi->kf_boost > high) {
- cpi->active_best_quality = kf_low_motion_minq[q];
- } else if (cpi->kf_boost < low) {
- cpi->active_best_quality = kf_high_motion_minq[q];
- } else {
- const int gap = high - low;
- const int offset = high - cpi->kf_boost;
- const int qdiff = kf_high_motion_minq[q] - kf_low_motion_minq[q];
- const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
- cpi->active_best_quality = kf_low_motion_minq[q] + adjustment;
- }
+ cpi->active_best_quality = get_active_quality(q, cpi->kf_boost,
+ low, high,
+ kf_low_motion_minq,
+ kf_high_motion_minq);
// Allow somewhat lower kf minq with small image formats.
if ((cm->width * cm->height) <= (352 * 288)) {
@@ -2739,84 +2785,78 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// on active_best_quality.
q_val = vp9_convert_qindex_to_q(cpi->active_best_quality);
cpi->active_best_quality +=
- compute_qdelta(cpi, q_val, (q_val * q_adj_factor));
+ vp9_compute_qdelta(cpi, q_val, (q_val * q_adj_factor));
}
#else
double current_q;
// Force the KF quantizer to be 30% of the active_worst_quality.
current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality);
cpi->active_best_quality = cpi->active_worst_quality
- + compute_qdelta(cpi, current_q, current_q * 0.3);
+ + vp9_compute_qdelta(cpi, current_q, current_q * 0.3);
#endif
- } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
+ } else if (!cpi->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
int high = 2000;
int low = 400;
// Use the lower of cpi->active_worst_quality and recent
- // average Q as basis for GF/ARF Q limit unless last frame was
+ // average Q as basis for GF/ARF best Q limit unless last frame was
// a key frame.
if (cpi->frames_since_key > 1 &&
cpi->avg_frame_qindex < cpi->active_worst_quality) {
q = cpi->avg_frame_qindex;
}
// For constrained quality dont allow Q less than the cq level
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
- q < cpi->cq_target_quality) {
- q = cpi->cq_target_quality;
- }
- if (cpi->gfu_boost > high) {
- cpi->active_best_quality = gf_low_motion_minq[q];
- } else if (cpi->gfu_boost < low) {
- cpi->active_best_quality = gf_high_motion_minq[q];
- } else {
- const int gap = high - low;
- const int offset = high - cpi->gfu_boost;
- const int qdiff = gf_high_motion_minq[q] - gf_low_motion_minq[q];
- const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
- cpi->active_best_quality = gf_low_motion_minq[q] + adjustment;
- }
-
- // Constrained quality use slightly lower active best.
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+ if (q < cpi->cq_target_quality)
+ q = cpi->cq_target_quality;
+ if (cpi->frames_since_key > 1) {
+ cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+ low, high,
+ afq_low_motion_minq,
+ afq_high_motion_minq);
+ } else {
+ cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+ low, high,
+ gf_low_motion_minq,
+ gf_high_motion_minq);
+ }
+ // Constrained quality use slightly lower active best.
cpi->active_best_quality = cpi->active_best_quality * 15 / 16;
- // TODO(debargha): Refine the logic below
- if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+ } else if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
if (!cpi->refresh_alt_ref_frame) {
cpi->active_best_quality = cpi->cq_target_quality;
} else {
if (cpi->frames_since_key > 1) {
- if (cpi->gfu_boost > high) {
- cpi->active_best_quality = cpi->cq_target_quality * 6 / 16;
- } else if (cpi->gfu_boost < low) {
- cpi->active_best_quality = cpi->cq_target_quality * 11 / 16;
- } else {
- const int gap = high - low;
- const int offset = high - cpi->gfu_boost;
- const int qdiff = cpi->cq_target_quality * 5 / 16;
- const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
- cpi->active_best_quality = cpi->cq_target_quality * 6 / 16
- + adjustment;
- }
+ cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+ low, high,
+ afq_low_motion_minq,
+ afq_high_motion_minq);
+ } else {
+ cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+ low, high,
+ gf_low_motion_minq,
+ gf_high_motion_minq);
}
}
+ } else {
+ cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
+ low, high,
+ gf_low_motion_minq,
+ gf_high_motion_minq);
}
} else {
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
cpi->active_best_quality = cpi->cq_target_quality;
} else {
-#ifdef ONE_SHOT_Q_ESTIMATE
-#ifdef STRICT_ONE_SHOT_Q
- cpi->active_best_quality = q;
-#else
- cpi->active_best_quality = inter_minq[q];
-#endif
-#else
cpi->active_best_quality = inter_minq[q];
-#endif
+      // 1-pass: for now, use the average Q for the active_best, if it's
+      // lower than active_worst.
+ if (cpi->pass == 0 && (cpi->avg_frame_qindex < q))
+ cpi->active_best_quality = inter_minq[cpi->avg_frame_qindex];
- // For the constant/constrained quality mode we don't want
+ // For the constrained quality mode we don't want
// q to fall below the cq level.
if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
(cpi->active_best_quality < cpi->cq_target_quality)) {
@@ -2844,16 +2884,189 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
if (cpi->active_worst_quality < cpi->active_best_quality)
cpi->active_worst_quality = cpi->active_best_quality;
- // Special case code to try and match quality with forced key frames
+ // Limit Q range for the adaptive loop.
+ if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) {
+ *top_index =
+ (cpi->active_worst_quality + cpi->active_best_quality * 3) / 4;
+ // If this is the first (key) frame in 1-pass, active best is the user
+ // best-allowed, and leave the top_index to active_worst.
+ if (cpi->pass == 0 && cpi->common.current_video_frame == 0) {
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ *top_index = cpi->oxcf.worst_allowed_q;
+ }
+ } else if (!cpi->is_src_frame_alt_ref &&
+ (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ *top_index =
+ (cpi->active_worst_quality + cpi->active_best_quality) / 2;
+ } else {
+ *top_index = cpi->active_worst_quality;
+ }
+ *bottom_index = cpi->active_best_quality;
+
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
q = cpi->active_best_quality;
+ // Special case code to try and match quality with forced key frames
} else if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
q = cpi->last_boosted_qindex;
} else {
- // Determine initial Q to try
- q = vp9_regulate_q(cpi, cpi->this_frame_target);
+ // Determine initial Q to try.
+ if (cpi->pass == 0) {
+      // 1-pass: for now, use per-frame-bw for the target size of the frame,
+      // scaled up (currently 5x) for a key frame.
+ int scale = (cm->frame_type == KEY_FRAME) ? 5 : 1;
+ q = vp9_regulate_q(cpi, scale * cpi->av_per_frame_bandwidth);
+ } else {
+ q = vp9_regulate_q(cpi, cpi->this_frame_target);
+ }
+ if (q > *top_index)
+ q = *top_index;
}
+ return q;
+}
+static void encode_frame_to_data_rate(VP9_COMP *cpi,
+ unsigned long *size,
+ unsigned char *dest,
+ unsigned int *frame_flags) {
+ VP9_COMMON *const cm = &cpi->common;
+ TX_SIZE t;
+ int q;
+ int frame_over_shoot_limit;
+ int frame_under_shoot_limit;
+
+ int loop = 0;
+ int loop_count;
+
+ int q_low;
+ int q_high;
+
+ int top_index;
+ int bottom_index;
+ int active_worst_qchanged = 0;
+
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+
+ SPEED_FEATURES *const sf = &cpi->sf;
+ unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
+ struct segmentation *const seg = &cm->seg;
+
+ /* Scale the source buffer, if required. */
+ if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
+ cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
+ scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);
+ cpi->Source = &cpi->scaled_source;
+ } else {
+ cpi->Source = cpi->un_scaled_source;
+ }
+ scale_references(cpi);
+
+ // Clear down mmx registers to allow floating point in what follows.
+ vp9_clear_system_state();
+
+ // For an alt ref frame in 2 pass we skip the call to the second
+ // pass function that sets the target bandwidth so we must set it here.
+ if (cpi->refresh_alt_ref_frame) {
+ // Set a per frame bit target for the alt ref frame.
+ cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+ // Set a per second target bitrate.
+ cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * cpi->output_framerate);
+ }
+
+ // Clear zbin over-quant value and mode boost values.
+ cpi->zbin_mode_boost = 0;
+
+ // Enable or disable mode based tweaking of the zbin.
+ // For 2 pass only used where GF/ARF prediction quality
+ // is above a threshold.
+ cpi->zbin_mode_boost = 0;
+ cpi->zbin_mode_boost_enabled = 0;
+
+ // Current default encoder behavior for the altref sign bias.
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->source_alt_ref_active;
+
+ // Check to see if a key frame is signaled.
+  // For two-pass with auto key frame enabled, cm->frame_type may already be
+  // set, but not for one-pass.
+ if ((cm->current_video_frame == 0) ||
+ (cm->frame_flags & FRAMEFLAGS_KEY) ||
+ (cpi->oxcf.auto_key && (cpi->frames_since_key %
+ cpi->key_frame_frequency == 0))) {
+  // Set the frame type to key frame for a forced key frame, if we exceed the
+  // maximum distance in automatic keyframe selection, or for the first
+  // frame.
+ cm->frame_type = KEY_FRAME;
+ }
+
+ // Set default state for segment based loop filter update flags.
+ cm->lf.mode_ref_delta_update = 0;
+
+ // Initialize cpi->mv_step_param to default based on max resolution.
+ cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def);
+ // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
+ if (sf->auto_mv_step_size) {
+ if (frame_is_intra_only(&cpi->common)) {
+ // Initialize max_mv_magnitude for use in the first INTER frame
+ // after a key/intra-only frame.
+ cpi->max_mv_magnitude = max_mv_def;
+ } else {
+ if (cm->show_frame)
+ // Allow mv_steps to correspond to twice the max mv magnitude found
+ // in the previous frame, capped by the default max_mv_magnitude based
+ // on resolution.
+ cpi->mv_step_param = vp9_init_search_range(
+ cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+ cpi->max_mv_magnitude = 0;
+ }
+ }
+
+  // Set various flags etc. to special state if it is a key frame.
+ if (frame_is_intra_only(cm)) {
+ vp9_setup_key_frame(cpi);
+ // Reset the loop filter deltas and segmentation map.
+ setup_features(cm);
+
+ // If segmentation is enabled force a map update for key frames.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+ }
+
+ // The alternate reference frame cannot be active for a key frame.
+ cpi->source_alt_ref_active = 0;
+
+ cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
+ cm->frame_parallel_decoding_mode =
+ (cpi->oxcf.frame_parallel_decoding_mode != 0);
+ if (cm->error_resilient_mode) {
+ cm->frame_parallel_decoding_mode = 1;
+ cm->reset_frame_context = 0;
+ cm->refresh_frame_context = 0;
+ } else if (cm->intra_only) {
+ // Only reset the current context.
+ cm->reset_frame_context = 2;
+ }
+ }
+
+ // Configure experimental use of segmentation for enhanced coding of
+ // static regions if indicated.
+ // Only allowed in second pass of two pass (as requires lagged coding)
+ // and if the relevant speed feature flag is set.
+ if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {
+ configure_static_seg_features(cpi);
+ }
+
+ // Decide how big to make the frame.
+ vp9_pick_frame_size(cpi);
+
+ vp9_clear_system_state();
+
+ q = pick_q_and_adjust_q_bounds(cpi, &bottom_index, &top_index);
+
+ q_high = top_index;
+ q_low = bottom_index;
+
vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
&frame_over_shoot_limit);
@@ -2868,7 +3081,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// Set quantizer steps at 10% increments.
new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
- q = cpi->active_worst_quality + compute_qdelta(cpi, current_q, new_q);
+ q = cpi->active_worst_quality + vp9_compute_qdelta(cpi, current_q, new_q);
bottom_index = q;
top_index = q;
@@ -2876,24 +3089,17 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
q_high = q;
printf("frame:%d q:%d\n", cm->current_video_frame, q);
- } else {
-#endif
- // Limit Q range for the adaptive loop.
- bottom_index = cpi->active_best_quality;
- top_index = cpi->active_worst_quality;
- q_low = cpi->active_best_quality;
- q_high = cpi->active_worst_quality;
-#if CONFIG_MULTIPLE_ARF
}
#endif
+
loop_count = 0;
vp9_zero(cpi->rd_tx_select_threshes);
- if (cm->frame_type != KEY_FRAME) {
+ if (!frame_is_intra_only(cm)) {
cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
/* TODO: Decide this more intelligently */
- xd->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH;
- set_mvcost(&cpi->mb);
+ cm->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH;
+ set_mvcost(cpi);
}
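frame_is_intra_only() replaces several direct KEY_FRAME checks in this patch
and, judging from the branch above that handles cm->intra_only alongside key
frames, presumably reduces to the following (an assumption; its definition is
outside this diff):

    static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
      // Key frames and intra-only inter frames code no motion vectors,
      // so both take the intra-only paths guarded by this predicate.
      return cm->frame_type == KEY_FRAME || cm->intra_only;
    }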
#if CONFIG_VP9_POSTPROC
@@ -2935,25 +3141,25 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
vp9_set_quantizer(cpi, q);
if (loop_count == 0) {
-
- // Set up entropy depending on frame type.
+ // Set up entropy context depending on frame type. The decoder mandates
+ // the use of the default context, index 0, for keyframes and inter
+ // frames where the error_resilient_mode or intra_only flag is set. For
+ // other inter-frames the encoder currently uses only two contexts;
+ // context 1 for ALTREF frames and context 0 for the others.
if (cm->frame_type == KEY_FRAME) {
- /* Choose which entropy context to use. When using a forward reference
- * frame, it immediately follows the keyframe, and thus benefits from
- * using the same entropy context established by the keyframe.
- * Otherwise, use the default context 0.
- */
- cm->frame_context_idx = cpi->oxcf.play_alternate;
vp9_setup_key_frame(cpi);
} else {
- /* Choose which entropy context to use. Currently there are only two
- * contexts used, one for normal frames and one for alt ref frames.
- */
- cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
+ if (!cm->intra_only && !cm->error_resilient_mode) {
+ cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
+ }
vp9_setup_inter_frame(cpi);
}
}
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_vaq_frame_setup(cpi);
+ }
+
// transform / motion compensation build reconstruction frame
vp9_encode_frame(cpi);
@@ -2977,14 +3183,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
frame_over_shoot_limit = 1;
active_worst_qchanged = 0;
- // Special case handling for forced key frames
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
loop = 0;
} else {
+ // Special case handling for forced key frames
if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
int last_q = q;
- int kf_err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
+ int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
int high_err_target = cpi->ambient_err;
int low_err_target = cpi->ambient_err >> 1;
@@ -3120,14 +3325,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// fixed interval. Note the reconstruction error if it is the frame before
// the forced key frame.
if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) {
- cpi->ambient_err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
+ cpi->ambient_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
}
if (cm->frame_type == KEY_FRAME)
cpi->refresh_last_frame = 1;
- cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+ cm->frame_to_show = get_frame_new_buffer(cm);
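Each replacement of &cm->yv12_fb[cm->new_fb_idx] with get_frame_new_buffer(cm)
in this patch points at a small accessor; a sketch consistent with the
expressions being replaced (the actual definition is not part of this diff):

    static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
      // The buffer the current frame is being reconstructed into.
      return &cm->yv12_fb[cm->new_fb_idx];
    }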
#if WRITE_RECON_BUFFER
if (cm->show_frame)
@@ -3168,7 +3372,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
vp9_adapt_coef_probs(&cpi->common);
}
- if (cpi->common.frame_type != KEY_FRAME) {
+ if (!frame_is_intra_only(&cpi->common)) {
FRAME_COUNTS *counts = &cpi->common.counts;
vp9_copy(counts->y_mode, cpi->y_mode_count);
@@ -3182,7 +3386,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
if (!cpi->common.error_resilient_mode &&
!cpi->common.frame_parallel_decoding_mode) {
vp9_adapt_mode_probs(&cpi->common);
- vp9_adapt_mv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
+ vp9_adapt_mv_probs(&cpi->common, cpi->common.allow_high_precision_mv);
}
}
@@ -3198,8 +3402,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cpi->total_byte_count += (*size);
cpi->projected_frame_size = (*size) << 3;
+ // Post encode loop adjustment of Q prediction.
if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 2);
+ vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop) ? 2 : 0);
cpi->last_q[cm->frame_type] = cm->base_qindex;
@@ -3222,9 +3427,11 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// Keep a record of ambient average Q.
if (cm->frame_type != KEY_FRAME)
- cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
+ cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex +
+ cm->base_qindex) >> 2;
- // Keep a record from which we can calculate the average Q excluding GF updates and key frames
+ // Keep a record from which we can calculate the average Q excluding GF
+ // updates and key frames.
if (cm->frame_type != KEY_FRAME &&
!cpi->refresh_golden_frame &&
!cpi->refresh_alt_ref_frame) {
@@ -3242,7 +3449,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
if (!cm->show_frame)
cpi->bits_off_target -= cpi->projected_frame_size;
else
- cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
+ cpi->bits_off_target += cpi->av_per_frame_bandwidth -
+ cpi->projected_frame_size;
// Clip the buffer level at the maximum buffer size
if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
@@ -3266,125 +3474,30 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cpi->total_actual_bits += cpi->projected_frame_size;
// Debug stats
- cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
+ cpi->total_target_vs_actual += (cpi->this_frame_target -
+ cpi->projected_frame_size);
cpi->buffer_level = cpi->bits_off_target;
- // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames
+#ifndef DISABLE_RC_LONG_TERM_MEM
+ // Update bits left to the kf and gf groups to account for overshoot or
+ // undershoot on these frames
if (cm->frame_type == KEY_FRAME) {
- cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+ cpi->twopass.kf_group_bits += cpi->this_frame_target -
+ cpi->projected_frame_size;
cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
} else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
- cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+ cpi->twopass.gf_group_bits += cpi->this_frame_target -
+ cpi->projected_frame_size;
cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
}
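One subtlety, inferred purely from the hunks shown in this patch:
vp9_onyx_int.h keeps DISABLE_RC_LONG_TERM_MEM defined (to 0), so the #ifndef
guard added above is false wherever that header is included and this kf/gf
group-bits update compiles out. A guard that honored the macro's 0/1 value
would instead read:

    #if !DISABLE_RC_LONG_TERM_MEM
      /* ... kf/gf group-bits update ... */
    #endif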
-
- // Update the skip mb flag probabilities based on the distribution seen
- // in this frame.
- // update_base_skip_probs(cpi);
-
-#if CONFIG_INTERNAL_STATS
- {
- FILE *f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
- int recon_err;
-
- vp9_clear_system_state(); // __asm emms;
-
- recon_err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
-
- if (cpi->twopass.total_left_stats.coded_error != 0.0)
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
- "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
- "%6d %6d %5d %5d %5d %8.2f %10d %10.3f"
- "%10.3f %8d %10d %10d %10d\n",
- cpi->common.current_video_frame, cpi->this_frame_target,
- cpi->projected_frame_size, 0, //loop_size_estimate,
- (cpi->projected_frame_size - cpi->this_frame_target),
- (int)cpi->total_target_vs_actual,
- (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
- (int)cpi->total_actual_bits,
- cm->base_qindex,
- vp9_convert_qindex_to_q(cm->base_qindex),
- (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
- vp9_convert_qindex_to_q(cpi->active_best_quality),
- vp9_convert_qindex_to_q(cpi->active_worst_quality),
- cpi->avg_q,
- vp9_convert_qindex_to_q(cpi->ni_av_qi),
- vp9_convert_qindex_to_q(cpi->cq_target_quality),
- cpi->refresh_last_frame,
- cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,
- cm->frame_type, cpi->gfu_boost,
- cpi->twopass.est_max_qcorrection_factor,
- (int)cpi->twopass.bits_left,
- cpi->twopass.total_left_stats.coded_error,
- (double)cpi->twopass.bits_left /
- cpi->twopass.total_left_stats.coded_error,
- cpi->tot_recode_hits, recon_err, cpi->kf_boost,
- cpi->kf_zeromotion_pct);
- else
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
- "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
- "%5d %5d %5d %8d %8d %8.2f %10d %10.3f"
- "%8d %10d %10d %10d\n",
- cpi->common.current_video_frame,
- cpi->this_frame_target, cpi->projected_frame_size,
- 0, //loop_size_estimate,
- (cpi->projected_frame_size - cpi->this_frame_target),
- (int)cpi->total_target_vs_actual,
- (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
- (int)cpi->total_actual_bits,
- cm->base_qindex,
- vp9_convert_qindex_to_q(cm->base_qindex),
- (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
- vp9_convert_qindex_to_q(cpi->active_best_quality),
- vp9_convert_qindex_to_q(cpi->active_worst_quality),
- cpi->avg_q,
- vp9_convert_qindex_to_q(cpi->ni_av_qi),
- vp9_convert_qindex_to_q(cpi->cq_target_quality),
- cpi->refresh_last_frame,
- cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,
- cm->frame_type, cpi->gfu_boost,
- cpi->twopass.est_max_qcorrection_factor,
- (int)cpi->twopass.bits_left,
- cpi->twopass.total_left_stats.coded_error,
- cpi->tot_recode_hits, recon_err, cpi->kf_boost,
- cpi->kf_zeromotion_pct);
-
- fclose(f);
-
- if (0) {
- FILE *fmodes = fopen("Modes.stt", "a");
- int i;
-
- fprintf(fmodes, "%6d:%1d:%1d:%1d ",
- cpi->common.current_video_frame,
- cm->frame_type, cpi->refresh_golden_frame,
- cpi->refresh_alt_ref_frame);
-
- for (i = 0; i < MAX_MODES; i++)
- fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
-
- fprintf(fmodes, "\n");
-
- fclose(fmodes);
- }
- }
-
#endif
#if 0
- // Debug stats for segment feature experiments.
- print_seg_map(cpi);
+ output_frame_level_debug_stats(cpi);
#endif
-
- // If this was a kf or Gf note the Q
- if ((cm->frame_type == KEY_FRAME)
- || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
- cm->last_kf_gf_q = cm->base_qindex;
-
if (cpi->refresh_golden_frame == 1)
cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
else
@@ -3463,7 +3576,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
#endif
}
- // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
+ // Clear the one shot update flags for segmentation map and mode/ref loop
+ // filter deltas.
cm->seg.update_map = 0;
cm->seg.update_data = 0;
cm->lf.mode_ref_delta_update = 0;
@@ -3487,6 +3601,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cm->mi = cm->mip + cm->mode_info_stride + 1;
cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1;
+ cpi->mb.e_mbd.mi_8x8 = cm->mi_grid_visible;
+ cpi->mb.e_mbd.mi_8x8[0] = cm->mi;
+
// Don't increment frame counters if this was an altref buffer
// update not a real frame
++cm->current_video_frame;
@@ -3495,28 +3612,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// restore prev_mi
cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1;
-
- #if 0
- {
- char filename[512];
- FILE *recon_file;
- sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
- recon_file = fopen(filename, "wb");
- fwrite(cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].buffer_alloc,
- cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].frame_size,
- 1, recon_file);
- fclose(recon_file);
- }
-#endif
-#ifdef OUTPUT_YUV_REC
- vp9_write_yuv_rec_frame(cm);
-#endif
-
}
static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
unsigned char *dest, unsigned int *frame_flags) {
-
cpi->enable_encode_breakout = 1;
if (!cpi->refresh_alt_ref_frame)
@@ -3533,12 +3632,14 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
if (!cpi->refresh_alt_ref_frame) {
double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
- * cpi->oxcf.two_pass_vbrmin_section / 100);
+ * cpi->oxcf.two_pass_vbrmin_section
+ / 100);
if (two_pass_min_rate < lower_bounds_min_rate)
two_pass_min_rate = lower_bounds_min_rate;
- cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.framerate);
+ cpi->twopass.bits_left += (int64_t)(two_pass_min_rate
+ / cpi->oxcf.framerate);
}
}
@@ -3612,8 +3713,8 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
cpi->source = NULL;
- cpi->mb.e_mbd.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV;
- set_mvcost(&cpi->mb);
+ cpi->common.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV;
+ set_mvcost(cpi);
// Should we code an alternate reference frame.
if (cpi->oxcf.play_alternate && cpi->source_alt_ref_pending) {
@@ -3651,7 +3752,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
}
cm->show_frame = 0;
- cm->intra_only = 0;
cpi->refresh_alt_ref_frame = 1;
cpi->refresh_golden_frame = 0;
cpi->refresh_last_frame = 0;
@@ -3673,6 +3773,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
#endif
if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) {
cm->show_frame = 1;
+ cm->intra_only = 0;
#if CONFIG_MULTIPLE_ARF
// Is this frame the ARF overlay.
@@ -3829,7 +3930,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
cm->frame_flags = *frame_flags;
// Reset the frame pointers to the current frame size
- vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
+ vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
VP9BORDERINPIXELS);
@@ -3840,6 +3941,10 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_vaq_init();
+ }
+
if (cpi->pass == 1) {
Pass1Encode(cpi, size, dest, frame_flags);
} else if (cpi->pass == 2) {
@@ -3876,7 +3981,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
cpi->bytes += *size;
if (cm->show_frame) {
-
cpi->count++;
if (cpi->b_calculate_psnr) {
@@ -3986,9 +4090,9 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
vp9_ppflags_t *flags) {
VP9_COMP *cpi = (VP9_COMP *) comp;
- if (!cpi->common.show_frame)
+ if (!cpi->common.show_frame) {
return -1;
- else {
+ } else {
int ret;
#if CONFIG_VP9_POSTPROC
ret = vp9_post_proc_frame(&cpi->common, dest, flags);
@@ -4138,37 +4242,9 @@ int vp9_set_size_literal(VP9_PTR comp, unsigned int width,
return 0;
}
-int vp9_switch_layer(VP9_PTR comp, int layer) {
- VP9_COMP *cpi = (VP9_COMP *)comp;
-
- if (cpi->use_svc) {
- cpi->current_layer = layer;
-
- // Use buffer i for layer i LST
- cpi->lst_fb_idx = layer;
-
- // Use buffer i-1 for layer i Alt (Inter-layer prediction)
- if (layer != 0) cpi->alt_fb_idx = layer - 1;
-
- // Use the rest for Golden
- if (layer < 2 * cpi->number_spatial_layers - NUM_REF_FRAMES)
- cpi->gld_fb_idx = cpi->lst_fb_idx;
- else
- cpi->gld_fb_idx = 2 * cpi->number_spatial_layers - 1 - layer;
-
- printf("Switching to layer %d:\n", layer);
- printf("Using references: LST/GLD/ALT [%d|%d|%d]\n", cpi->lst_fb_idx,
- cpi->gld_fb_idx, cpi->alt_fb_idx);
- } else {
- printf("Switching layer not supported. Enable SVC first \n");
- }
- return 0;
-}
-
void vp9_set_svc(VP9_PTR comp, int use_svc) {
VP9_COMP *cpi = (VP9_COMP *)comp;
cpi->use_svc = use_svc;
- if (cpi->use_svc) printf("Enabled SVC encoder \n");
return;
}
diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_onyx_int.h
index 3e5796f..9429c7f 100644
--- a/libvpx/vp9/encoder/vp9_onyx_int.h
+++ b/libvpx/vp9/encoder/vp9_onyx_int.h
@@ -29,12 +29,7 @@
#include "vp9/common/vp9_findnearmv.h"
#include "vp9/encoder/vp9_lookahead.h"
-// Experimental rate control switches
-#if CONFIG_ONESHOTQ
-#define ONE_SHOT_Q_ESTIMATE 0
-#define STRICT_ONE_SHOT_Q 0
#define DISABLE_RC_LONG_TERM_MEM 0
-#endif
// #define MODE_TEST_HIT_STATS
@@ -49,7 +44,8 @@
#define KEY_FRAME_CONTEXT 5
-#define MAX_MODES 36
+#define MAX_MODES 30
+#define MAX_REFS 6
#define MIN_THRESHMULT 32
#define MAX_THRESHMULT 512
@@ -61,16 +57,11 @@
#define INTRA_ZBIN_BOOST 0
typedef struct {
- nmv_context nmvc;
int nmvjointcost[MV_JOINTS];
int nmvcosts[2][MV_VALS];
int nmvcosts_hp[2][MV_VALS];
vp9_prob segment_pred_probs[PREDICTION_PROBS];
- vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
- vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
- vp9_prob single_ref_prob[REF_CONTEXTS][2];
- vp9_prob comp_ref_prob[REF_CONTEXTS];
unsigned char *last_frame_seg_map_copy;
@@ -79,20 +70,8 @@ typedef struct {
// 0 = ZERO_MV, MV
signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
- vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
-
- vp9_prob y_mode_prob[4][INTRA_MODES - 1];
- vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
- vp9_prob partition_prob[2][NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
-
- vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1]
- [SWITCHABLE_FILTERS - 1];
-
int inter_mode_counts[INTER_MODE_CONTEXTS][INTER_MODES - 1][2];
- vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
-
- struct tx_probs tx_probs;
- vp9_prob mbskip_probs[MBSKIP_CONTEXTS];
+ FRAME_CONTEXT fc;
} CODING_CONTEXT;
typedef struct {
@@ -169,19 +148,12 @@ typedef enum {
THR_COMP_NEARGA,
THR_COMP_NEWGA,
- THR_SPLITMV,
- THR_SPLITG,
- THR_SPLITA,
- THR_COMP_SPLITLA,
- THR_COMP_SPLITGA,
-
THR_ZEROMV,
THR_ZEROG,
THR_ZEROA,
THR_COMP_ZEROLA,
THR_COMP_ZEROGA,
- THR_B_PRED,
THR_H_PRED,
THR_V_PRED,
THR_D135_PRED,
@@ -193,6 +165,15 @@ typedef enum {
} THR_MODES;
typedef enum {
+ THR_LAST,
+ THR_GOLD,
+ THR_ALTR,
+ THR_COMP_LA,
+ THR_COMP_GA,
+ THR_INTRA,
+} THR_MODES_SUB8X8;
+
+typedef enum {
DIAMOND = 0,
NSTEP = 1,
HEX = 2,
@@ -244,8 +225,15 @@ typedef enum {
#define ALL_INTRA_MODES 0x3FF
#define INTRA_DC_ONLY 0x01
#define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED))
+#define INTRA_DC_H_V ((1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED))
#define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED))
+typedef enum {
+ LAST_FRAME_PARTITION_OFF = 0,
+ LAST_FRAME_PARTITION_LOW_MOTION = 1,
+ LAST_FRAME_PARTITION_ALL = 2
+} LAST_FRAME_PARTITION_METHOD;
+
typedef struct {
int RD;
SEARCH_METHODS search_method;
@@ -254,21 +242,21 @@ typedef struct {
SUBPEL_SEARCH_METHODS subpel_search_method;
int subpel_iters_per_step;
int thresh_mult[MAX_MODES];
+ int thresh_mult_sub8x8[MAX_REFS];
int max_step_search_steps;
int reduce_first_step_size;
int auto_mv_step_size;
int optimize_coefficients;
int static_segmentation;
+ int variance_adaptive_quantization;
int comp_inter_joint_search_thresh;
int adaptive_rd_thresh;
int skip_encode_sb;
int skip_encode_frame;
- int use_lastframe_partitioning;
+ LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
TX_SIZE_SEARCH_METHOD tx_size_search_method;
int use_lp32x32fdct;
int use_avoid_tested_higherror;
- int skip_lots_of_modes;
- int partition_by_variance;
int use_one_partition_size_always;
int less_rectangular_check;
int use_square_partition_only;
@@ -276,13 +264,11 @@ typedef struct {
int reference_masking;
BLOCK_SIZE always_this_block_size;
int auto_min_max_partition_size;
- int auto_min_max_partition_interval;
- int auto_min_max_partition_count;
BLOCK_SIZE min_partition_size;
BLOCK_SIZE max_partition_size;
int adjust_partitioning_from_last_frame;
int last_partitioning_redo_frequency;
- int disable_splitmv;
+ int disable_split_mask;
int using_small_partition_info;
// TODO(jingning): combine the related motion search speed features
int adaptive_motion_search;
@@ -296,8 +282,8 @@ typedef struct {
// A source variance threshold below which filter search is disabled
// Choose a very large value (UINT_MAX) to use 8-tap always
unsigned int disable_filter_search_var_thresh;
- int intra_y_mode_mask;
- int intra_uv_mode_mask;
+ int intra_y_mode_mask[TX_SIZES];
+ int intra_uv_mode_mask[TX_SIZES];
int use_rd_breakout;
int use_uv_intra_rd_estimate;
int use_fast_lpf_pick;
@@ -325,6 +311,7 @@ typedef struct VP9_COMP {
MACROBLOCK mb;
VP9_COMMON common;
VP9_CONFIG oxcf;
+ struct rdcost_block_args rdcost_stack;
struct lookahead_ctx *lookahead;
struct lookahead_entry *source;
@@ -339,13 +326,13 @@ typedef struct VP9_COMP {
YV12_BUFFER_CONFIG scaled_source;
unsigned int frames_till_alt_ref_frame;
- int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref
- int source_alt_ref_active; // an alt ref frame has been encoded and is usable
+ int source_alt_ref_pending;
+ int source_alt_ref_active;
- int is_src_frame_alt_ref; // source of frame to encode is an exact copy of an alt ref frame
+ int is_src_frame_alt_ref;
- int gold_is_last; // golden frame same as last frame ( short circuit gold searches)
- int alt_is_last; // Alt reference frame same as last ( short circuit altref search)
+ int gold_is_last; // gold same as last frame ( short circuit gold searches)
+ int alt_is_last; // Alt same as last ( short circuit altref search)
int gold_is_alt; // don't do both alt and gold search ( just do gold).
int scaled_ref_idx[3];
@@ -382,19 +369,19 @@ typedef struct VP9_COMP {
// Ambient reconstruction err target for force key frames
int ambient_err;
- unsigned int mode_check_freq[MAX_MODES];
- unsigned int mode_test_hit_counts[MAX_MODES];
unsigned int mode_chosen_counts[MAX_MODES];
+ unsigned int sub8x8_mode_chosen_counts[MAX_REFS];
int64_t mode_skip_mask;
int ref_frame_mask;
int set_ref_frame_mask;
- int rd_threshes[BLOCK_SIZES][MAX_MODES];
+ int rd_threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+ int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS];
+ int rd_thresh_freq_sub8x8[BLOCK_SIZES][MAX_REFS];
int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
- // FIXME(rbultje) int64_t?
- int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
+ int64_t rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2];
unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2];
unsigned int single_ref_count[REF_CONTEXTS][2][2];
@@ -404,9 +391,9 @@ typedef struct VP9_COMP {
// FIXME(rbultje) can this overflow?
int rd_tx_select_threshes[4][TX_MODES];
- int64_t rd_filter_diff[SWITCHABLE_FILTERS + 1];
- int64_t rd_filter_threshes[4][SWITCHABLE_FILTERS + 1];
- int64_t rd_filter_cache[SWITCHABLE_FILTERS + 1];
+ int64_t rd_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+ int64_t rd_filter_threshes[4][SWITCHABLE_FILTER_CONTEXTS];
+ int64_t rd_filter_cache[SWITCHABLE_FILTER_CONTEXTS];
int RDMULT;
int RDDIV;
@@ -424,14 +411,14 @@ typedef struct VP9_COMP {
double gf_rate_correction_factor;
unsigned int frames_since_golden;
- int frames_till_gf_update_due; // Count down till next GF
+ int frames_till_gf_update_due; // Count down till next GF
- int gf_overspend_bits; // Total bits overspent becasue of GF boost (cumulative)
+ int gf_overspend_bits; // cumulative bits overspent because of GF boost
- int non_gf_bitrate_adjustment; // Used in the few frames following a GF to recover the extra bits spent in that GF
+ int non_gf_bitrate_adjustment; // Following GF to recover extra bits spent
- int kf_overspend_bits; // Extra bits spent on key frames that need to be recovered on inter frames
- int kf_bitrate_adjustment; // Current number of bit s to try and recover on each inter frame.
+ int kf_overspend_bits; // Bits spent on key frames to be recovered on inters
+ int kf_bitrate_adjustment; // number of bits to recover on each inter frame.
int max_gf_interval;
int baseline_gf_interval;
int active_arnr_frames; // <= cpi->oxcf.arnr_max_frames
@@ -439,9 +426,9 @@ typedef struct VP9_COMP {
int64_t key_frame_count;
int prior_key_frame_distance[KEY_FRAME_CONTEXT];
- int per_frame_bandwidth; // Current section per frame bandwidth target
- int av_per_frame_bandwidth; // Average frame size target for clip
- int min_frame_bandwidth; // Minimum allocation that should be used for any frame
+ int per_frame_bandwidth; // Current section per frame bandwidth target
+ int av_per_frame_bandwidth; // Average frame size target for clip
+ int min_frame_bandwidth; // Minimum allocation used for any frame
int inter_frame_target;
double output_framerate;
int64_t last_time_stamp_seen;
@@ -483,7 +470,7 @@ typedef struct VP9_COMP {
int y_mode_count[4][INTRA_MODES];
int y_uv_mode_count[INTRA_MODES][INTRA_MODES];
- unsigned int partition_count[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
+ unsigned int partition_count[PARTITION_CONTEXTS][PARTITION_TYPES];
nmv_context_counts NMVcount;
@@ -514,14 +501,9 @@ typedef struct VP9_COMP {
int decimation_count;
// for real time encoding
- int avg_encode_time; // microsecond
- int avg_pick_mode_time; // microsecond
int speed;
- unsigned int cpu_freq; // Mhz
int compressor_speed;
- int interquantizer;
- int goldfreq;
int auto_worst_q;
int cpu_used;
int pass;
@@ -537,11 +519,6 @@ typedef struct VP9_COMP {
unsigned int max_mv_magnitude;
int mv_step_param;
- // Data used for real time conferencing mode to help determine if it would be good to update the gf
- int inter_zz_count;
- int gf_bad_count;
- int gf_update_recommended;
-
unsigned char *segmentation_map;
  // segment threshold for encode breakout
@@ -648,10 +625,10 @@ typedef struct VP9_COMP {
int dummy_packing; /* flag to indicate if packing is dummy */
- unsigned int switchable_interp_count[SWITCHABLE_FILTERS + 1]
+ unsigned int switchable_interp_count[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS];
- unsigned int txfm_stepdown_count[TX_SIZES];
+ unsigned int tx_stepdown_count[TX_SIZES];
int initial_width;
int initial_height;
@@ -682,6 +659,13 @@ typedef struct VP9_COMP {
// Debug / test stats
int64_t mode_test_hits[BLOCK_SIZES];
#endif
+
+ /* Y,U,V,(A) */
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
+
+ PARTITION_CONTEXT *above_seg_context;
+ PARTITION_CONTEXT left_seg_context[8];
} VP9_COMP;
static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
@@ -714,9 +698,14 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x);
void vp9_set_speed_features(VP9_COMP *cpi);
-extern int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *dest);
+int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
+
+void vp9_alloc_compressor_data(VP9_COMP *cpi);
-extern void vp9_alloc_compressor_data(VP9_COMP *cpi);
+int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget);
+
+static int get_token_alloc(int mb_rows, int mb_cols) {
+ return mb_rows * mb_cols * (48 * 16 + 4);
+}
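vp9_compute_qdelta(), declared just above and used throughout the
requantization paths earlier in this patch, returns a qindex offset rather
than a q value. A sketch of the behavior implied by its call sites; the table
walk over vp9_convert_qindex_to_q() and the cpi->best_quality /
cpi->worst_quality bounds are assumptions:

    int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
      int start_index = cpi->worst_quality;
      int target_index = cpi->worst_quality;
      int i;

      // Find the first qindex whose real Q reaches qstart...
      for (i = cpi->best_quality; i < cpi->worst_quality; ++i) {
        start_index = i;
        if (vp9_convert_qindex_to_q(i) >= qstart)
          break;
      }

      // ...and the first qindex whose real Q reaches qtarget.
      for (i = cpi->best_quality; i < cpi->worst_quality; ++i) {
        target_index = i;
        if (vp9_convert_qindex_to_q(i) >= qtarget)
          break;
      }

      // Delta to add to a qindex so that Q moves from qstart toward qtarget.
      return target_index - start_index;
    }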
#endif // VP9_ENCODER_VP9_ONYX_INT_H_
diff --git a/libvpx/vp9/encoder/vp9_picklpf.c b/libvpx/vp9/encoder/vp9_picklpf.c
index 239fd6b..476ecaa 100644
--- a/libvpx/vp9/encoder/vp9_picklpf.c
+++ b/libvpx/vp9/encoder/vp9_picklpf.c
@@ -54,7 +54,8 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
src += srcoffset;
dst += dstoffset;
- // Loop through the Y plane raw and reconstruction data summing (square differences)
+ // Loop through the raw Y plane and reconstruction data summing the square
+ // differences.
for (i = 0; i < linestocopy; i += 16) {
for (j = 0; j < source->y_width; j += 16) {
unsigned int sse;
@@ -72,20 +73,6 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
// Enforce a minimum filter level based upon baseline Q
static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
int min_filter_level;
- /*int q = (int) vp9_convert_qindex_to_q(base_qindex);
-
- if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame)
- min_filter_level = 0;
- else
- {
- if (q <= 10)
- min_filter_level = 0;
- else if (q <= 64)
- min_filter_level = 1;
- else
- min_filter_level = (q >> 6);
- }
- */
min_filter_level = 0;
return min_filter_level;
@@ -93,11 +80,7 @@ static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
// Enforce a maximum filter level based upon baseline Q
static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) {
- // PGW August 2006: Highest filter values almost always a bad idea
-
- // jbb chg: 20100118 - not so any more with this overquant stuff allow high values
- // with lots of intra coming in.
- int max_filter_level = MAX_LOOP_FILTER;// * 3 / 4;
+ int max_filter_level = MAX_LOOP_FILTER;
(void)base_qindex;
if (cpi->twopass.section_intra_rating > 8)
@@ -128,7 +111,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
int filt_best;
int filt_direction = 0;
- int Bias = 0; // Bias against raising loop filter and in favour of lowering it
+ int Bias = 0; // Bias against raising loop filter in favor of lowering it.
// Make a copy of the unfiltered / processed recon buffer
vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
@@ -136,7 +119,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0
: cpi->oxcf.Sharpness;
- // Start the search at the previous frame filter level unless it is now out of range.
+ // Start the search at the previous frame filter level unless it is now out of
+ // range.
filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
// Define the initial step size
@@ -153,9 +137,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
while (filter_step > 0) {
- Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images
+ Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
- // jbb chg: 20100118 - in sections with lots of new material coming in don't bias as much to a low filter value
if (cpi->twopass.section_intra_rating < 20)
Bias = Bias * cpi->twopass.section_intra_rating / 20;
@@ -163,8 +146,12 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
if (cpi->common.tx_mode != ONLY_4X4)
Bias >>= 1;
- filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
- filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
+ filt_high = ((filt_mid + filter_step) > max_filter_level)
+ ? max_filter_level
+ : (filt_mid + filter_step);
+ filt_low = ((filt_mid - filter_step) < min_filter_level)
+ ? min_filter_level
+ : (filt_mid - filter_step);
if ((filt_direction <= 0) && (filt_low != filt_mid)) {
// Get Low filter error score
@@ -176,7 +163,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
// Re-instate the unfiltered frame
vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
- // If value is close to the best so far then bias towards a lower loop filter value.
+ // If value is close to the best so far then bias towards a lower loop
+ // filter value.
if ((filt_err - Bias) < best_err) {
// Was it actually better than the previous best?
if (filt_err < best_err)
@@ -215,4 +203,3 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
lf->filter_level = filt_best;
}
-
diff --git a/libvpx/vp9/encoder/vp9_psnr.c b/libvpx/vp9/encoder/vp9_psnr.c
index 9439434..58294e1 100644
--- a/libvpx/vp9/encoder/vp9_psnr.c
+++ b/libvpx/vp9/encoder/vp9_psnr.c
@@ -8,9 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <math.h>
#include "vpx_scale/yv12config.h"
-#include "math.h"
#define MAX_PSNR 100
diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c
index 6c8b2a0..fca7525 100644
--- a/libvpx/vp9/encoder/vp9_quantize.c
+++ b/libvpx/vp9/encoder/vp9_quantize.c
@@ -12,6 +12,7 @@
#include "vpx_mem/vpx_mem.h"
#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/common/vp9_quant_common.h"
@@ -21,12 +22,14 @@
extern int enc_debug;
#endif
-void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
- int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
- int16_t *dqcoeff_ptr, int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
+void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr,
+ int zbin_oq_value, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
int i, rc, eob;
int zbins[2], nzbins[2], zbin;
int x, y, z, sz;
@@ -85,14 +88,15 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
*eob_ptr = eob + 1;
}
-void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
- int16_t *zbin_ptr, int16_t *round_ptr,
- int16_t *quant_ptr, int16_t *quant_shift_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
- int16_t *dequant_ptr, int zbin_oq_value,
- uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
+ const int16_t *dequant_ptr,
+ int zbin_oq_value, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
int i, rc, eob;
int zbins[2], nzbins[2];
int x, y, z, sz;
@@ -173,25 +177,19 @@ static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
return res;
}
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
- int y_blocks) {
- MACROBLOCKD *const xd = &mb->e_mbd;
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+ const int16_t *scan, const int16_t *iscan) {
+ MACROBLOCKD *const xd = &x->e_mbd;
const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
- const int16_t *scan = get_scan_4x4(tx_type);
- const int16_t *iscan = get_iscan_4x4(tx_type);
-
- vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block),
- 16, mb->skip_block,
- mb->plane[pb_idx.plane].zbin,
- mb->plane[pb_idx.plane].round,
- mb->plane[pb_idx.plane].quant,
- mb->plane[pb_idx.plane].quant_shift,
- BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block),
- BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block),
- xd->plane[pb_idx.plane].dequant,
- mb->plane[pb_idx.plane].zbin_extra,
- &xd->plane[pb_idx.plane].eobs[pb_idx.block],
- scan, iscan);
+ struct macroblock_plane* p = &x->plane[pb_idx.plane];
+ struct macroblockd_plane* pd = &xd->plane[pb_idx.plane];
+
+ vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block),
+ 16, x->skip_block,
+ p->zbin, p->round, p->quant, p->quant_shift,
+ BLOCK_OFFSET(pd->qcoeff, pb_idx.block),
+ BLOCK_OFFSET(pd->dqcoeff, pb_idx.block),
+ pd->dequant, p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan);
}
static void invert_quant(int16_t *quant, int16_t *shift, int d) {
@@ -271,12 +269,15 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
int i;
+ VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
int zbin_extra;
- int segment_id = xd->this_mi->mbmi.segment_id;
+ int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
const int qindex = vp9_get_qindex(&cpi->common.seg, segment_id,
cpi->common.base_qindex);
+ int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
+
// Y
zbin_extra = (cpi->common.y_dequant[qindex][1] *
(cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
@@ -315,6 +316,12 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
/* save this macroblock QIndex for vp9_update_zbin_extra() */
x->e_mbd.q_index = qindex;
+
+ /* R/D setup */
+ cpi->mb.errorperbit = rdmult >> 6;
+ cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
+
+ vp9_initialize_me_consts(cpi, xd->q_index);
}
void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
@@ -337,10 +344,10 @@ void vp9_frame_init_quantizer(VP9_COMP *cpi) {
vp9_mb_init_quantizer(cpi, &cpi->mb);
}
-void vp9_set_quantizer(struct VP9_COMP *cpi, int Q) {
+void vp9_set_quantizer(struct VP9_COMP *cpi, int q) {
VP9_COMMON *cm = &cpi->common;
- cm->base_qindex = Q;
+ cm->base_qindex = q;
// if any of the delta_q values are changing update flag will
// have to be set.
diff --git a/libvpx/vp9/encoder/vp9_quantize.h b/libvpx/vp9/encoder/vp9_quantize.h
index 3229eaa..c078e1d 100644
--- a/libvpx/vp9/encoder/vp9_quantize.h
+++ b/libvpx/vp9/encoder/vp9_quantize.h
@@ -13,31 +13,19 @@
#include "vp9/encoder/vp9_block.h"
-#define prototype_quantize_block(sym) \
- void (sym)(MACROBLOCK *mb, int b_idx)
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+ const int16_t *scan, const int16_t *iscan);
-#define prototype_quantize_block_pair(sym) \
- void (sym)(MACROBLOCK *mb, int b_idx1, int b_idx2)
-
-#define prototype_quantize_mb(sym) \
- void (sym)(MACROBLOCK *x)
-
-void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2,
- int y_blocks);
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
- int y_blocks);
-void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
- int y_blocks);
struct VP9_COMP;
-extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
+void vp9_set_quantizer(struct VP9_COMP *cpi, int q);
-extern void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
+void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
-extern void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
+void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
-extern void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x);
+void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x);
-extern void vp9_init_quantizer(struct VP9_COMP *cpi);
+void vp9_init_quantizer(struct VP9_COMP *cpi);
#endif // VP9_ENCODER_VP9_QUANTIZE_H_
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c
index 2d12ba9..0aa3a68 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -59,9 +59,8 @@ static int kfboost_qadjust(int qindex) {
int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
double correction_factor) {
-
const double q = vp9_convert_qindex_to_q(qindex);
- int enumerator = frame_type == KEY_FRAME ? 4000000 : 2500000;
+ int enumerator = frame_type == KEY_FRAME ? 3300000 : 2250000;
// q based adjustment to baseline enumerator
enumerator += (int)(enumerator * q) >> 12;
@@ -76,35 +75,19 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
// restored with a call to vp9_restore_coding_context. These functions are
// intended for use in a re-code loop in vp9_compress_frame where the
// quantizer value is adjusted between loop iterations.
-
- cc->nmvc = cm->fc.nmvc;
vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost);
vp9_copy(cc->nmvcosts, cpi->mb.nmvcosts);
vp9_copy(cc->nmvcosts_hp, cpi->mb.nmvcosts_hp);
- vp9_copy(cc->inter_mode_probs, cm->fc.inter_mode_probs);
-
- vp9_copy(cc->y_mode_prob, cm->fc.y_mode_prob);
- vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
- vp9_copy(cc->partition_prob, cm->fc.partition_prob);
-
vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
- vp9_copy(cc->intra_inter_prob, cm->fc.intra_inter_prob);
- vp9_copy(cc->comp_inter_prob, cm->fc.comp_inter_prob);
- vp9_copy(cc->single_ref_prob, cm->fc.single_ref_prob);
- vp9_copy(cc->comp_ref_prob, cm->fc.comp_ref_prob);
-
vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
- vp9_copy(cc->coef_probs, cm->fc.coef_probs);
- vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
- cc->tx_probs = cm->fc.tx_probs;
- vp9_copy(cc->mbskip_probs, cm->fc.mbskip_probs);
+ cc->fc = cm->fc;
}
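Collapsing the per-field vp9_copy() calls into cc->fc = cm->fc relies on C
struct assignment copying the whole FRAME_CONTEXT by value. That is safe here
because, judging by the fields just removed from CODING_CONTEXT, FRAME_CONTEXT
holds only embedded probability arrays and counts, with no pointers to
external storage; the restore path below mirrors it with cm->fc = cc->fc.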
void vp9_restore_coding_context(VP9_COMP *cpi) {
@@ -113,25 +96,12 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
// Restore key state variables to the snapshot state stored in the
// previous call to vp9_save_coding_context.
-
- cm->fc.nmvc = cc->nmvc;
vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
- vp9_copy(cm->fc.inter_mode_probs, cc->inter_mode_probs);
-
- vp9_copy(cm->fc.y_mode_prob, cc->y_mode_prob);
- vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
- vp9_copy(cm->fc.partition_prob, cc->partition_prob);
-
vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
- vp9_copy(cm->fc.intra_inter_prob, cc->intra_inter_prob);
- vp9_copy(cm->fc.comp_inter_prob, cc->comp_inter_prob);
- vp9_copy(cm->fc.single_ref_prob, cc->single_ref_prob);
- vp9_copy(cm->fc.comp_ref_prob, cc->comp_ref_prob);
-
vpx_memcpy(cm->last_frame_seg_map,
cpi->coding_context.last_frame_seg_map_copy,
(cm->mi_rows * cm->mi_cols));
@@ -139,10 +109,7 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
- vp9_copy(cm->fc.coef_probs, cc->coef_probs);
- vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
- cm->fc.tx_probs = cc->tx_probs;
- vp9_copy(cm->fc.mbskip_probs, cc->mbskip_probs);
+ cm->fc = cc->fc;
}
void vp9_setup_key_frame(VP9_COMP *cpi) {
@@ -224,11 +191,12 @@ static void calc_pframe_target_size(VP9_COMP *cpi) {
cpi->this_frame_target = cpi->per_frame_bandwidth;
}
- // Sanity check that the total sum of adjustments is not above the maximum allowed
- // That is that having allowed for KF and GF penalties we have not pushed the
- // current interframe target to low. If the adjustment we apply here is not capable of recovering
- // all the extra bits we have spent in the KF or GF then the remainder will have to be recovered over
- // a longer time span via other buffer / rate control mechanisms.
+ // Check that the total sum of adjustments is not above the maximum allowed.
+ // That is, having allowed for the KF and GF penalties, we have not pushed
+ // the current inter-frame target too low. If the adjustment we apply here is
+ // not capable of recovering all the extra bits we have spent in the KF or GF,
+ // then the remainder will have to be recovered over a longer time span via
+ // other buffer / rate control mechanisms.
if (cpi->this_frame_target < min_frame_target)
cpi->this_frame_target = min_frame_target;
@@ -297,12 +265,12 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
rate_correction_factor);
// Work out a size correction factor.
- // if ( cpi->this_frame_target > 0 )
- // correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target;
if (projected_size_based_on_q > 0)
- correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
+ correction_factor =
+ (100 * cpi->projected_frame_size) / projected_size_based_on_q;
- // More heavily damped adjustment used if we have been oscillating either side of target
+ // More heavily damped adjustment used if we have been oscillating either side
+ // of target.
switch (damp_var) {
case 0:
adjustment_limit = 0.75;
@@ -319,27 +287,29 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
// if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
if (correction_factor > 102) {
// We are not already at the worst allowable quality
- correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
- rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+ correction_factor =
+ (int)(100 + ((correction_factor - 100) * adjustment_limit));
+ rate_correction_factor =
+ ((rate_correction_factor * correction_factor) / 100);
// Keep rate_correction_factor within limits
if (rate_correction_factor > MAX_BPB_FACTOR)
rate_correction_factor = MAX_BPB_FACTOR;
- }
- // else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) )
- else if (correction_factor < 99) {
+ } else if (correction_factor < 99) {
// We are not already at the best allowable quality
- correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
- rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+ correction_factor =
+ (int)(100 - ((100 - correction_factor) * adjustment_limit));
+ rate_correction_factor =
+ ((rate_correction_factor * correction_factor) / 100);
// Keep rate_correction_factor within limits
if (rate_correction_factor < MIN_BPB_FACTOR)
rate_correction_factor = MIN_BPB_FACTOR;
}
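A worked example of the damping above: if the frame overshoots so that
correction_factor computes to 120 while adjustment_limit is 0.75, the damped
value is 100 + (120 - 100) * 0.75 = 115, and rate_correction_factor is scaled
by 115 / 100 = 1.15 before being clamped to MAX_BPB_FACTOR. Dropping the old
100.5 / 99.5 rounding offsets in this patch leaves the damped factor landing
slightly closer to 100, i.e. toward "no change", than before.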
- if (cpi->common.frame_type == KEY_FRAME)
+ if (cpi->common.frame_type == KEY_FRAME) {
cpi->key_frame_rate_correction_factor = rate_correction_factor;
- else {
+ } else {
if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
cpi->gf_rate_correction_factor = rate_correction_factor;
else
@@ -358,20 +328,24 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
double correction_factor;
// Select the appropriate correction factor based upon type of frame.
- if (cpi->common.frame_type == KEY_FRAME)
+ if (cpi->common.frame_type == KEY_FRAME) {
correction_factor = cpi->key_frame_rate_correction_factor;
- else {
+ } else {
if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
correction_factor = cpi->gf_rate_correction_factor;
else
correction_factor = cpi->rate_correction_factor;
}
- // Calculate required scaling factor based on target frame size and size of frame produced using previous Q
+ // Calculate required scaling factor based on target frame size and size of
+ // frame produced using previous Q.
if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
- target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS; // Case where we would overflow int
+ target_bits_per_mb =
+ (target_bits_per_frame / cpi->common.MBs)
+ << BPER_MB_NORMBITS; // Case where we would overflow int
else
- target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
+ target_bits_per_mb =
+ (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
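The branch above exists to avoid 32-bit overflow: target_bits_per_frame <<
BPER_MB_NORMBITS is computed in int arithmetic, so once target_bits_per_frame
reaches INT_MAX >> BPER_MB_NORMBITS the shift would overflow. For such large
targets the division by cpi->common.MBs is done first, trading a little
fixed-point precision for safety.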
i = cpi->active_best_quality;
@@ -437,7 +411,6 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) {
}
av_key_frame_frequency /= total_weight;
-
}
return av_key_frame_frequency;
}
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.h b/libvpx/vp9/encoder/vp9_ratectrl.h
index 4733176..ddda713 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -32,8 +32,8 @@ int vp9_pick_frame_size(VP9_COMP *cpi);
double vp9_convert_qindex_to_q(int qindex);
int vp9_gfboost_qadjust(int qindex);
-extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
- double correction_factor);
+int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+ double correction_factor);
void vp9_setup_inter_frame(VP9_COMP *cpi);
#endif // VP9_ENCODER_VP9_RATECTRL_H_
diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c
index df00334..993919e 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/libvpx/vp9/encoder/vp9_rdopt.c
@@ -36,7 +36,7 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_entropy.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_common.h"
@@ -45,58 +45,59 @@
/* Factor to weigh the rate for switchable interp filters */
#define SWITCHABLE_INTERP_RATE_FACTOR 1
-DECLARE_ALIGNED(16, extern const uint8_t,
- vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+#define LAST_FRAME_MODE_MASK 0xFFEDCD60
+#define GOLDEN_FRAME_MODE_MASK 0xFFDA3BB0
+#define ALT_REF_MODE_MASK 0xFFC648D0
-#define LAST_FRAME_MODE_MASK 0xFFDADCD60
-#define GOLDEN_FRAME_MODE_MASK 0xFFB5A3BB0
-#define ALT_REF_MODE_MASK 0xFF8C648D0
+#define MIN_EARLY_TERM_INDEX 3
const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
- {RD_NEARESTMV, LAST_FRAME, NONE},
- {RD_NEARESTMV, ALTREF_FRAME, NONE},
- {RD_NEARESTMV, GOLDEN_FRAME, NONE},
-
- {RD_DC_PRED, INTRA_FRAME, NONE},
-
- {RD_NEWMV, LAST_FRAME, NONE},
- {RD_NEWMV, ALTREF_FRAME, NONE},
- {RD_NEWMV, GOLDEN_FRAME, NONE},
-
- {RD_NEARMV, LAST_FRAME, NONE},
- {RD_NEARMV, ALTREF_FRAME, NONE},
- {RD_NEARESTMV, LAST_FRAME, ALTREF_FRAME},
- {RD_NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {RD_TM_PRED, INTRA_FRAME, NONE},
-
- {RD_NEARMV, LAST_FRAME, ALTREF_FRAME},
- {RD_NEWMV, LAST_FRAME, ALTREF_FRAME},
- {RD_NEARMV, GOLDEN_FRAME, NONE},
- {RD_NEARMV, GOLDEN_FRAME, ALTREF_FRAME},
- {RD_NEWMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {RD_SPLITMV, LAST_FRAME, NONE},
- {RD_SPLITMV, GOLDEN_FRAME, NONE},
- {RD_SPLITMV, ALTREF_FRAME, NONE},
- {RD_SPLITMV, LAST_FRAME, ALTREF_FRAME},
- {RD_SPLITMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {RD_ZEROMV, LAST_FRAME, NONE},
- {RD_ZEROMV, GOLDEN_FRAME, NONE},
- {RD_ZEROMV, ALTREF_FRAME, NONE},
- {RD_ZEROMV, LAST_FRAME, ALTREF_FRAME},
- {RD_ZEROMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {RD_I4X4_PRED, INTRA_FRAME, NONE},
- {RD_H_PRED, INTRA_FRAME, NONE},
- {RD_V_PRED, INTRA_FRAME, NONE},
- {RD_D135_PRED, INTRA_FRAME, NONE},
- {RD_D207_PRED, INTRA_FRAME, NONE},
- {RD_D153_PRED, INTRA_FRAME, NONE},
- {RD_D63_PRED, INTRA_FRAME, NONE},
- {RD_D117_PRED, INTRA_FRAME, NONE},
- {RD_D45_PRED, INTRA_FRAME, NONE},
+ {NEARESTMV, LAST_FRAME, NONE},
+ {NEARESTMV, ALTREF_FRAME, NONE},
+ {NEARESTMV, GOLDEN_FRAME, NONE},
+
+ {DC_PRED, INTRA_FRAME, NONE},
+
+ {NEWMV, LAST_FRAME, NONE},
+ {NEWMV, ALTREF_FRAME, NONE},
+ {NEWMV, GOLDEN_FRAME, NONE},
+
+ {NEARMV, LAST_FRAME, NONE},
+ {NEARMV, ALTREF_FRAME, NONE},
+ {NEARESTMV, LAST_FRAME, ALTREF_FRAME},
+ {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
+
+ {TM_PRED, INTRA_FRAME, NONE},
+
+ {NEARMV, LAST_FRAME, ALTREF_FRAME},
+ {NEWMV, LAST_FRAME, ALTREF_FRAME},
+ {NEARMV, GOLDEN_FRAME, NONE},
+ {NEARMV, GOLDEN_FRAME, ALTREF_FRAME},
+ {NEWMV, GOLDEN_FRAME, ALTREF_FRAME},
+
+ {ZEROMV, LAST_FRAME, NONE},
+ {ZEROMV, GOLDEN_FRAME, NONE},
+ {ZEROMV, ALTREF_FRAME, NONE},
+ {ZEROMV, LAST_FRAME, ALTREF_FRAME},
+ {ZEROMV, GOLDEN_FRAME, ALTREF_FRAME},
+
+ {H_PRED, INTRA_FRAME, NONE},
+ {V_PRED, INTRA_FRAME, NONE},
+ {D135_PRED, INTRA_FRAME, NONE},
+ {D207_PRED, INTRA_FRAME, NONE},
+ {D153_PRED, INTRA_FRAME, NONE},
+ {D63_PRED, INTRA_FRAME, NONE},
+ {D117_PRED, INTRA_FRAME, NONE},
+ {D45_PRED, INTRA_FRAME, NONE},
+};
+
+const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
+ {LAST_FRAME, NONE},
+ {GOLDEN_FRAME, NONE},
+ {ALTREF_FRAME, NONE},
+ {LAST_FRAME, ALTREF_FRAME},
+ {GOLDEN_FRAME, ALTREF_FRAME},
+ {INTRA_FRAME, NONE},
};
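
With the five RD_SPLITMV entries and RD_I4X4_PRED dropped from vp9_mode_order (the sub-8x8 reference combinations move to the new vp9_ref_order table), the list shrinks from 36 to 30 entries, so the per-reference mode masks fit in 32 bits again; the old nine-hex-digit constants did not. Assuming bit i of a mask corresponds to entry i of vp9_mode_order and a set bit marks an entry to skip, which is how such masks are normally consumed, the test looks like:

    #include <stdint.h>

    #define LAST_FRAME_MODE_MASK 0xFFEDCD60  /* value from the hunk above */

    /* Assumed semantics: bit i set means entry i of vp9_mode_order is
     * skipped for the given reference-frame setup. */
    static int mode_is_masked(uint32_t mode_skip_mask, int mode_index) {
      return (mode_skip_mask >> mode_index) & 1;
    }
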
// The baseline rd thresholds for breaking out of the rd loop for
@@ -106,8 +107,13 @@ const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
static int rd_thresh_block_size_factor[BLOCK_SIZES] =
{2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
-#define MAX_RD_THRESH_FACT 64
-#define RD_THRESH_INC 1
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC 1
+#define RD_THRESH_POW 1.25
+#define RD_MULT_EPB_RATIO 64
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
static void fill_token_costs(vp9_coeff_cost *c,
vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
@@ -155,18 +161,26 @@ void vp9_init_me_luts() {
}
}
-static int compute_rd_mult(int qindex) {
+int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex) {
const int q = vp9_dc_quant(qindex, 0);
- return (11 * q * q) >> 2;
+ // TODO(debargha): Adjust the function below
+ int rdmult = 88 * q * q / 25;
+ if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
+ if (cpi->twopass.next_iiratio > 31)
+ rdmult += (rdmult * rd_iifactor[31]) >> 4;
+ else
+ rdmult += (rdmult * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
+ }
+ return rdmult;
}
-static MB_PREDICTION_MODE rd_mode_to_mode(RD_PREDICTION_MODE rd_mode) {
- if (rd_mode == RD_SPLITMV || rd_mode == RD_I4X4_PRED) {
- assert(!"Invalid rd_mode");
- return MB_MODE_COUNT;
- }
- assert((int)rd_mode < (int)MB_MODE_COUNT);
- return (MB_PREDICTION_MODE)rd_mode;
+static int compute_rd_thresh_factor(int qindex) {
+ int q;
+ // TODO(debargha): Adjust the function below
+ q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
+ if (q < 8)
+ q = 8;
+ return q;
}
void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
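
vp9_compute_rd_mult raises the multiplier from (11 * q * q) >> 2 to 88 * q * q / 25 (a factor of 3.52 instead of 2.75 on the squared DC quantizer) and folds the two-pass next_iiratio adjustment into the same function, while compute_rd_thresh_factor collapses the old shift-and-pow sequence into one floating-point expression floored at 8. A standalone arithmetic check; the DC quantizer value is illustrative and vp9_dc_quant() is not reproduced:

    #include <math.h>
    #include <stdio.h>

    #define RD_THRESH_POW 1.25

    int main(void) {
      const int q = 52;  /* illustrative vp9_dc_quant() output */
      const int rdmult = 88 * q * q / 25;        /* new formula: 9518 */
      const int rdmult_old = (11 * q * q) >> 2;  /* old formula: 7436 */
      int thresh = (int)(pow(q / 4.0, RD_THRESH_POW) * 5.12);
      if (thresh < 8)  /* floor from compute_rd_thresh_factor() */
        thresh = 8;
      printf("rdmult=%d (was %d), thresh_factor=%d\n",
             rdmult, rdmult_old, thresh);
      return 0;
    }
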
@@ -174,102 +188,90 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
}
+static void set_block_thresholds(VP9_COMP *cpi) {
+ int i, bsize, segment_id;
+ VP9_COMMON *cm = &cpi->common;
-void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
- int q, i, bsize;
-
- vp9_clear_system_state(); // __asm emms;
-
- // Further tests required to see if optimum is different
- // for key frames, golden frames and arf frames.
- // if (cpi->common.refresh_golden_frame ||
- // cpi->common.refresh_alt_ref_frame)
- qindex = clamp(qindex, 0, MAXQ);
-
- cpi->RDMULT = compute_rd_mult(qindex);
- if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
- if (cpi->twopass.next_iiratio > 31)
- cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
- else
- cpi->RDMULT +=
- (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
- }
- cpi->mb.errorperbit = cpi->RDMULT >> 6;
- cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
-
- vp9_set_speed_features(cpi);
-
- q = (int)pow(vp9_dc_quant(qindex, 0) >> 2, 1.25);
- q <<= 2;
- if (q < 8)
- q = 8;
-
- if (cpi->RDMULT > 1000) {
- cpi->RDDIV = 1;
- cpi->RDMULT /= 100;
+ for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
+ int q;
+ int segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+ segment_qindex = clamp(segment_qindex + cm->y_dc_delta_q, 0, MAXQ);
+ q = compute_rd_thresh_factor(segment_qindex);
for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
+      // Thresholds here seem unnecessarily harsh but are fine given the
+      // actual range of values used for cpi->sf.thresh_mult[].
+ int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
+
for (i = 0; i < MAX_MODES; ++i) {
- // Threshold here seem unecessarily harsh but fine given actual
- // range of values used for cpi->sf.thresh_mult[]
- int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
-
- // *4 relates to the scaling of rd_thresh_block_size_factor[]
- if ((int64_t)cpi->sf.thresh_mult[i] < thresh_max) {
- cpi->rd_threshes[bsize][i] =
- cpi->sf.thresh_mult[i] * q *
- rd_thresh_block_size_factor[bsize] / (4 * 100);
+ if (cpi->sf.thresh_mult[i] < thresh_max) {
+ cpi->rd_threshes[segment_id][bsize][i] =
+ cpi->sf.thresh_mult[i] * q *
+ rd_thresh_block_size_factor[bsize] / 4;
} else {
- cpi->rd_threshes[bsize][i] = INT_MAX;
+ cpi->rd_threshes[segment_id][bsize][i] = INT_MAX;
}
}
- }
- } else {
- cpi->RDDIV = 100;
-
- for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
- for (i = 0; i < MAX_MODES; i++) {
- // Threshold here seem unecessarily harsh but fine given actual
- // range of values used for cpi->sf.thresh_mult[]
- int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
- if (cpi->sf.thresh_mult[i] < thresh_max) {
- cpi->rd_threshes[bsize][i] =
- cpi->sf.thresh_mult[i] * q *
- rd_thresh_block_size_factor[bsize] / 4;
+ for (i = 0; i < MAX_REFS; ++i) {
+ if (cpi->sf.thresh_mult_sub8x8[i] < thresh_max) {
+ cpi->rd_thresh_sub8x8[segment_id][bsize][i] =
+ cpi->sf.thresh_mult_sub8x8[i] * q *
+ rd_thresh_block_size_factor[bsize] / 4;
} else {
- cpi->rd_threshes[bsize][i] = INT_MAX;
+ cpi->rd_thresh_sub8x8[segment_id][bsize][i] = INT_MAX;
}
}
}
}
+}
+
+void vp9_initialize_rd_consts(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ int qindex, i;
+
+ vp9_clear_system_state(); // __asm emms;
+
+ // Further tests required to see if optimum is different
+ // for key frames, golden frames and arf frames.
+ // if (cpi->common.refresh_golden_frame ||
+ // cpi->common.refresh_alt_ref_frame)
+ qindex = clamp(cm->base_qindex + cm->y_dc_delta_q, 0, MAXQ);
+
+ cpi->RDDIV = RDDIV_BITS; // in bits (to multiply D by 128)
+ cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex);
+
+ cpi->mb.errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
+ cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
+
+ vp9_set_speed_features(cpi);
- fill_token_costs(cpi->mb.token_costs, cpi->common.fc.coef_probs);
+ set_block_thresholds(cpi);
- for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
- vp9_cost_tokens(cpi->mb.partition_cost[i],
- cpi->common.fc.partition_prob[cpi->common.frame_type][i],
+ fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs);
+
+ for (i = 0; i < PARTITION_CONTEXTS; i++)
+ vp9_cost_tokens(cpi->mb.partition_cost[i], get_partition_probs(cm, i),
vp9_partition_tree);
/*rough estimate for costing*/
vp9_init_mode_costs(cpi);
- if (cpi->common.frame_type != KEY_FRAME) {
+ if (!frame_is_intra_only(cm)) {
vp9_build_nmv_cost_table(
cpi->mb.nmvjointcost,
- cpi->mb.e_mbd.allow_high_precision_mv ?
- cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
- &cpi->common.fc.nmvc,
- cpi->mb.e_mbd.allow_high_precision_mv, 1, 1);
+ cm->allow_high_precision_mv ? cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
+ &cm->fc.nmvc,
+ cm->allow_high_precision_mv, 1, 1);
for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
MB_PREDICTION_MODE m;
for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
- cpi->mb.inter_mode_cost[i][m - NEARESTMV] =
+ cpi->mb.inter_mode_cost[i][inter_mode_offset(m)] =
cost_token(vp9_inter_mode_tree,
- cpi->common.fc.inter_mode_probs[i],
- vp9_inter_mode_encodings - NEARESTMV + m);
+ cm->fc.inter_mode_probs[i],
+ &vp9_inter_mode_encodings[inter_mode_offset(m)]);
}
}
}
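
set_block_thresholds replaces the old RDMULT > 1000 special case with one loop that is now also per-segment: each segment derives its own qindex, including the y_dc_delta_q offset, and the thresh_max bound proves the multiply cannot overflow before it is performed. The guard in isolation:

    #include <limits.h>

    /* Overflow-guarded rd threshold: the product is formed only when
     * thresh_mult * q * size_factor is known to fit in an int. */
    static int scaled_rd_thresh(int thresh_mult, int q, int size_factor) {
      const int thresh_max = INT_MAX / (q * size_factor);
      return thresh_mult < thresh_max
                 ? thresh_mult * q * size_factor / 4
                 : INT_MAX;
    }
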
@@ -369,8 +371,8 @@ static void model_rd_from_var_lapndz(int var, int n, int qstep,
double s2 = (double) var / n;
double x = qstep / sqrt(s2);
model_rd_norm(x, &R, &D);
- *rate = ((n << 8) * R + 0.5);
- *dist = (var * D + 0.5);
+ *rate = (int)((n << 8) * R + 0.5);
+ *dist = (int)(var * D + 0.5);
}
vp9_clear_system_state();
}
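
The casts added above only make the narrowing explicit; the + 0.5 they wrap is the usual round-to-nearest idiom for non-negative doubles:

    /* Round a non-negative double to the nearest int, as in the
     * *rate and *dist assignments above. */
    static int round_nonneg(double v) {
      return (int)(v + 0.5);
    }
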
@@ -397,7 +399,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
pd->dequant[1] >> 3, &rate, &dist);
rate_sum += rate;
- dist_sum += dist;
+ dist_sum += (int)dist;
}
*out_rate_sum = rate_sum;
@@ -479,13 +481,13 @@ static const int16_t band_counts[TX_SIZES][8] = {
{ 1, 2, 3, 4, 11, 1024 - 21, 0 },
};
-static INLINE int cost_coeffs(MACROBLOCK *mb,
+static INLINE int cost_coeffs(MACROBLOCK *x,
int plane, int block,
ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
TX_SIZE tx_size,
const int16_t *scan, const int16_t *nb) {
- MACROBLOCKD *const xd = &mb->e_mbd;
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
struct macroblockd_plane *pd = &xd->plane[plane];
const PLANE_TYPE type = pd->plane_type;
const int16_t *band_count = &band_counts[tx_size][1];
@@ -493,9 +495,9 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
- mb->token_costs[tx_size][type][ref];
+ x->token_costs[tx_size][type][ref];
const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
- uint8_t token_cache[1024];
+ uint8_t *p_tok = x->token_cache;
int pt = combine_entropy_contexts(above_ec, left_ec);
int c, cost;
@@ -514,7 +516,7 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
int v = qcoeff_ptr[0];
int prev_t = vp9_dct_value_tokens_ptr[v].token;
cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
- token_cache[0] = vp9_pt_energy_class[prev_t];
+ p_tok[0] = vp9_pt_energy_class[prev_t];
++token_costs;
// ac tokens
@@ -524,9 +526,9 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
v = qcoeff_ptr[rc];
t = vp9_dct_value_tokens_ptr[v].token;
- pt = get_coef_context(nb, token_cache, c);
+ pt = get_coef_context(nb, p_tok, c);
cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
- token_cache[rc] = vp9_pt_energy_class[t];
+ p_tok[rc] = vp9_pt_energy_class[t];
prev_t = t;
if (!--band_left) {
band_left = *band_count++;
@@ -536,7 +538,7 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
// eob token
if (band_left) {
- pt = get_coef_context(nb, token_cache, c);
+ pt = get_coef_context(nb, p_tok, c);
cost += (*token_costs)[0][pt][DCT_EOB_TOKEN];
}
}
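
Across the cost_coeffs hunks the 1024-byte token_cache moves from the function's stack frame into the MACROBLOCK, so the hot RD loop stops re-touching a kilobyte of stack on every call; p_tok is simply an alias for the shared buffer. The shape of the change, with only the token_cache name taken from the diff:

    #include <stdint.h>

    #define MAX_COEFFS 1024  /* enough for a 32x32 transform block */

    typedef struct {
      uint8_t token_cache[MAX_COEFFS];  /* scratch reused across calls */
      /* ... other per-macroblock state ... */
    } MACROBLOCK_sketch;

    /* cost_coeffs()-style access: write through an alias instead of a
     * fresh local array. */
    static void mark_token(MACROBLOCK_sketch *x, int rc, uint8_t energy) {
      uint8_t *p_tok = x->token_cache;
      p_tok[rc] = energy;
    }
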
@@ -547,21 +549,6 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
return cost;
}
-struct rdcost_block_args {
- MACROBLOCK *x;
- ENTROPY_CONTEXT t_above[16];
- ENTROPY_CONTEXT t_left[16];
- TX_SIZE tx_size;
- int bw;
- int bh;
- int rate;
- int64_t dist;
- int64_t sse;
- int64_t best_rd;
- int skip;
- const int16_t *scan, *nb;
-};
-
static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) {
const int ss_txfrm_size = tx_size << 1;
struct rdcost_block_args* args = arg;
@@ -573,16 +560,16 @@ static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) {
int shift = args->tx_size == TX_32X32 ? 0 : 2;
int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
- &this_sse) >> shift;
- args->sse += this_sse >> shift;
+ args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+ &this_sse) >> shift;
+ args->sse = this_sse >> shift;
if (x->skip_encode &&
- xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
+ xd->mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME) {
// TODO(jingning): tune the model to better capture the distortion.
int64_t p = (pd->dequant[1] * pd->dequant[1] *
- (1 << ss_txfrm_size)) >> shift;
- args->dist += p;
+ (1 << ss_txfrm_size)) >> (shift + 2);
+ args->dist += (p >> 4);
args->sse += p;
}
}
@@ -594,10 +581,9 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
int x_idx, y_idx;
txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx);
- args->rate += cost_coeffs(args->x, plane, block,
- args->t_above + x_idx,
- args->t_left + y_idx, args->tx_size,
- args->scan, args->nb);
+ args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
+ args->t_left + y_idx, args->tx_size,
+ args->scan, args->nb);
}
static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -610,85 +596,114 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
if (args->skip)
return;
- rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
- rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
- rd = MIN(rd1, rd2);
- if (rd > args->best_rd) {
- args->skip = 1;
- args->rate = INT_MAX;
- args->dist = INT64_MAX;
- args->sse = INT64_MAX;
- return;
- }
- if (!is_inter_block(&xd->this_mi->mbmi))
+ if (!is_inter_block(&xd->mi_8x8[0]->mbmi))
vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &encode_args);
else
vp9_xform_quant(plane, block, plane_bsize, tx_size, &encode_args);
dist_block(plane, block, tx_size, args);
rate_block(plane, block, plane_bsize, tx_size, args);
-}
+ rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
-static void txfm_rd_in_plane(MACROBLOCK *x,
- int *rate, int64_t *distortion,
- int *skippable, int64_t *sse,
- int64_t ref_best_rd, int plane,
- BLOCK_SIZE bsize, TX_SIZE tx_size) {
- MACROBLOCKD *const xd = &x->e_mbd;
- struct macroblockd_plane *const pd = &xd->plane[plane];
- const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
- const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs];
- const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs];
- int i;
- struct rdcost_block_args args = { x, { 0 }, { 0 }, tx_size,
- num_4x4_blocks_wide, num_4x4_blocks_high,
- 0, 0, 0, ref_best_rd, 0 };
+ // TODO(jingning): temporarily enabled only for luma component
+ rd = MIN(rd1, rd2);
if (plane == 0)
- xd->this_mi->mbmi.tx_size = tx_size;
+ x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block];
+
+ args->this_rate += args->rate;
+ args->this_dist += args->dist;
+ args->this_sse += args->sse;
+ args->this_rd += rd;
+ if (args->this_rd > args->best_rd) {
+ args->skip = 1;
+ return;
+ }
+}
+
+void vp9_get_entropy_contexts(TX_SIZE tx_size,
+ ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16],
+ const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left,
+ int num_4x4_w, int num_4x4_h) {
+ int i;
switch (tx_size) {
case TX_4X4:
- vpx_memcpy(&args.t_above, pd->above_context,
- sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide);
- vpx_memcpy(&args.t_left, pd->left_context,
- sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high);
- get_scan_nb_4x4(get_tx_type_4x4(pd->plane_type, xd, 0),
- &args.scan, &args.nb);
+ vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
break;
case TX_8X8:
- for (i = 0; i < num_4x4_blocks_wide; i += 2)
- args.t_above[i] = !!*(uint16_t *)&pd->above_context[i];
- for (i = 0; i < num_4x4_blocks_high; i += 2)
- args.t_left[i] = !!*(uint16_t *)&pd->left_context[i];
- get_scan_nb_8x8(get_tx_type_8x8(pd->plane_type, xd),
- &args.scan, &args.nb);
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
break;
case TX_16X16:
- for (i = 0; i < num_4x4_blocks_wide; i += 4)
- args.t_above[i] = !!*(uint32_t *)&pd->above_context[i];
- for (i = 0; i < num_4x4_blocks_high; i += 4)
- args.t_left[i] = !!*(uint32_t *)&pd->left_context[i];
- get_scan_nb_16x16(get_tx_type_16x16(pd->plane_type, xd),
- &args.scan, &args.nb);
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
break;
case TX_32X32:
- for (i = 0; i < num_4x4_blocks_wide; i += 8)
- args.t_above[i] = !!*(uint64_t *)&pd->above_context[i];
- for (i = 0; i < num_4x4_blocks_high; i += 8)
- args.t_left[i] = !!*(uint64_t *)&pd->left_context[i];
- args.scan = vp9_default_scan_32x32;
- args.nb = vp9_default_scan_32x32_neighbors;
+ for (i = 0; i < num_4x4_w; i += 8)
+ t_above[i] = !!*(const uint64_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 8)
+ t_left[i] = !!*(const uint64_t *)&left[i];
break;
default:
- assert(0);
+ assert(!"Invalid transform size.");
}
+}
+
+static void init_rdcost_stack(MACROBLOCK *x, TX_SIZE tx_size,
+ const int num_4x4_w, const int num_4x4_h,
+ const int64_t ref_rdcost,
+ struct rdcost_block_args *arg) {
+ vpx_memset(arg, 0, sizeof(struct rdcost_block_args));
+ arg->x = x;
+ arg->tx_size = tx_size;
+ arg->bw = num_4x4_w;
+ arg->bh = num_4x4_h;
+ arg->best_rd = ref_rdcost;
+}
+
+static void txfm_rd_in_plane(MACROBLOCK *x,
+ struct rdcost_block_args *rd_stack,
+ int *rate, int64_t *distortion,
+ int *skippable, int64_t *sse,
+ int64_t ref_best_rd, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bs];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bs];
+
+ init_rdcost_stack(x, tx_size, num_4x4_w, num_4x4_h,
+ ref_best_rd, rd_stack);
+ if (plane == 0)
+ xd->mi_8x8[0]->mbmi.tx_size = tx_size;
- foreach_transformed_block_in_plane(xd, bsize, plane, block_yrd_txfm, &args);
- *distortion = args.dist;
- *rate = args.rate;
- *sse = args.sse;
- *skippable = vp9_is_skippable_in_plane(xd, bsize, plane) && (!args.skip);
+ vp9_get_entropy_contexts(tx_size, rd_stack->t_above, rd_stack->t_left,
+ pd->above_context, pd->left_context,
+ num_4x4_w, num_4x4_h);
+
+ get_scan(xd, tx_size, pd->plane_type, 0, &rd_stack->scan, &rd_stack->nb);
+
+ foreach_transformed_block_in_plane(xd, bsize, plane,
+ block_yrd_txfm, rd_stack);
+ if (rd_stack->skip) {
+ *rate = INT_MAX;
+ *distortion = INT64_MAX;
+ *sse = INT64_MAX;
+ *skippable = 0;
+ } else {
+ *distortion = rd_stack->this_dist;
+ *rate = rd_stack->this_rate;
+ *sse = rd_stack->this_sse;
+ *skippable = vp9_is_skippable_in_plane(xd, bsize, plane);
+ }
}
static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
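
vp9_get_entropy_contexts, factored out above, collapses runs of per-4x4 context bytes into one flag per transform block by loading two, four, or eight of them as a single wide integer and testing for nonzero; since only zero-versus-nonzero matters, the result is endian-independent. A standalone check of the 8x8 case (ENTROPY_CONTEXT is assumed to be a one-byte type and the context rows are assumed suitably aligned for the wide load):

    #include <assert.h>
    #include <stdint.h>

    typedef int8_t ENTROPY_CONTEXT;  /* assumed one-byte type */

    int main(void) {
      const ENTROPY_CONTEXT above[4] = { 0, 1, 0, 0 };
      ENTROPY_CONTEXT t_above[4] = { 0 };
      int i;
      /* TX_8X8: two 4x4 context bytes collapse into one flag via a
       * single 16-bit load, as in the hunk above. */
      for (i = 0; i < 4; i += 2)
        t_above[i] = !!*(const uint16_t *)&above[i];
      assert(t_above[0] == 1);  /* above[1] is set */
      assert(t_above[2] == 0);  /* both bytes clear */
      return 0;
    }
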
@@ -696,28 +711,18 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
int *skip, int64_t *sse,
int64_t ref_best_rd,
BLOCK_SIZE bs) {
- const TX_SIZE max_txfm_size = max_txsize_lookup[bs];
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
+ const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
- if (max_txfm_size == TX_32X32 &&
- (cm->tx_mode == ALLOW_32X32 ||
- cm->tx_mode == TX_MODE_SELECT)) {
- mbmi->tx_size = TX_32X32;
- } else if (max_txfm_size >= TX_16X16 &&
- (cm->tx_mode == ALLOW_16X16 ||
- cm->tx_mode == ALLOW_32X32 ||
- cm->tx_mode == TX_MODE_SELECT)) {
- mbmi->tx_size = TX_16X16;
- } else if (cm->tx_mode != ONLY_4X4) {
- mbmi->tx_size = TX_8X8;
- } else {
- mbmi->tx_size = TX_4X4;
- }
- txfm_rd_in_plane(x, rate, distortion, skip,
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+
+ mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
+
+ txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
&sse[mbmi->tx_size], ref_best_rd, 0, bs,
mbmi->tx_size);
- cpi->txfm_stepdown_count[0]++;
+ cpi->tx_stepdown_count[0]++;
}
static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
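
choose_largest_txfm_size now reads the frame-level cap from tx_mode_to_biggest_tx_size[] and clamps it with the block's max_txsize_lookup[] entry; the MIN of the two reproduces the removed three-way if chain exactly. Reconstructed from those branches, with the enum values assumed:

    typedef enum { ONLY_4X4, ALLOW_8X8, ALLOW_16X16, ALLOW_32X32,
                   TX_MODE_SELECT, TX_MODES } TX_MODE;
    typedef enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32 } TX_SIZE;

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
      TX_4X4,    /* ONLY_4X4 */
      TX_8X8,    /* ALLOW_8X8 */
      TX_16X16,  /* ALLOW_16X16 */
      TX_32X32,  /* ALLOW_32X32 */
      TX_32X32,  /* TX_MODE_SELECT */
    };

    /* Largest transform both the block and the frame's tx_mode allow. */
    static TX_SIZE pick_tx_size(TX_SIZE max_tx_size, TX_MODE tx_mode) {
      return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
    }
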
@@ -729,13 +734,13 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
const TX_SIZE max_tx_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
int64_t rd[TX_SIZES][2];
int n, m;
int s0, s1;
- const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->this_mi);
+ const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]);
for (n = TX_4X4; n <= max_tx_size; n++) {
r[n][1] = r[n][0];
@@ -811,15 +816,15 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
rd[TX_32X32][1] < rd[TX_16X16][1] &&
rd[TX_32X32][1] < rd[TX_8X8][1] &&
rd[TX_32X32][1] < rd[TX_4X4][1]) {
- cpi->txfm_stepdown_count[0]++;
+ cpi->tx_stepdown_count[0]++;
} else if (max_tx_size >= TX_16X16 &&
rd[TX_16X16][1] < rd[TX_8X8][1] &&
rd[TX_16X16][1] < rd[TX_4X4][1]) {
- cpi->txfm_stepdown_count[max_tx_size - TX_16X16]++;
+ cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
} else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
- cpi->txfm_stepdown_count[max_tx_size - TX_8X8]++;
+ cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
} else {
- cpi->txfm_stepdown_count[max_tx_size - TX_4X4]++;
+ cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
}
}
@@ -829,10 +834,10 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
int *s, int *skip, int64_t *sse,
int64_t ref_best_rd,
BLOCK_SIZE bs) {
- const TX_SIZE max_txfm_size = max_txsize_lookup[bs];
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
int64_t rd[TX_SIZES][2];
int n, m;
@@ -840,14 +845,14 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
// double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00};
- const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->this_mi);
+ const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]);
// for (n = TX_4X4; n <= max_txfm_size; n++)
// r[n][0] = (r[n][0] * scale_r[n]);
- for (n = TX_4X4; n <= max_txfm_size; n++) {
+ for (n = TX_4X4; n <= max_tx_size; n++) {
r[n][1] = r[n][0];
- for (m = 0; m <= n - (n == max_txfm_size); m++) {
+ for (m = 0; m <= n - (n == max_tx_size); m++) {
if (m == n)
r[n][1] += vp9_cost_zero(tx_probs[m]);
else
@@ -859,7 +864,7 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
s0 = vp9_cost_bit(skip_prob, 0);
s1 = vp9_cost_bit(skip_prob, 1);
- for (n = TX_4X4; n <= max_txfm_size; n++) {
+ for (n = TX_4X4; n <= max_tx_size; n++) {
if (s[n]) {
rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
} else {
@@ -867,19 +872,19 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
}
}
- for (n = TX_4X4; n <= max_txfm_size; n++) {
- rd[n][0] = (scale_rd[n] * rd[n][0]);
- rd[n][1] = (scale_rd[n] * rd[n][1]);
+ for (n = TX_4X4; n <= max_tx_size; n++) {
+ rd[n][0] = (int64_t)(scale_rd[n] * rd[n][0]);
+ rd[n][1] = (int64_t)(scale_rd[n] * rd[n][1]);
}
- if (max_txfm_size == TX_32X32 &&
+ if (max_tx_size == TX_32X32 &&
(cm->tx_mode == ALLOW_32X32 ||
(cm->tx_mode == TX_MODE_SELECT &&
rd[TX_32X32][1] <= rd[TX_16X16][1] &&
rd[TX_32X32][1] <= rd[TX_8X8][1] &&
rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
mbmi->tx_size = TX_32X32;
- } else if (max_txfm_size >= TX_16X16 &&
+ } else if (max_tx_size >= TX_16X16 &&
(cm->tx_mode == ALLOW_16X16 ||
cm->tx_mode == ALLOW_32X32 ||
(cm->tx_mode == TX_MODE_SELECT &&
@@ -898,22 +903,22 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
// Actually encode using the chosen mode if a model was used, but do not
// update the r, d costs
- txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size],
- ref_best_rd, 0, bs, mbmi->tx_size);
+ txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
+ &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size);
- if (max_txfm_size == TX_32X32 &&
+ if (max_tx_size == TX_32X32 &&
rd[TX_32X32][1] <= rd[TX_16X16][1] &&
rd[TX_32X32][1] <= rd[TX_8X8][1] &&
rd[TX_32X32][1] <= rd[TX_4X4][1]) {
- cpi->txfm_stepdown_count[0]++;
- } else if (max_txfm_size >= TX_16X16 &&
+ cpi->tx_stepdown_count[0]++;
+ } else if (max_tx_size >= TX_16X16 &&
rd[TX_16X16][1] <= rd[TX_8X8][1] &&
rd[TX_16X16][1] <= rd[TX_4X4][1]) {
- cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++;
+ cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
} else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
- cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++;
+ cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
} else {
- cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++;
+ cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
}
}
@@ -925,15 +930,17 @@ static void super_block_yrd(VP9_COMP *cpi,
int r[TX_SIZES][2], s[TX_SIZES];
int64_t d[TX_SIZES], sse[TX_SIZES];
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+ struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack;
+ const int b_inter_mode = is_inter_block(mbmi);
assert(bs == mbmi->sb_type);
- if (mbmi->ref_frame[0] > INTRA_FRAME)
+ if (b_inter_mode)
vp9_subtract_sby(x, bs);
if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
(cpi->sf.tx_size_search_method != USE_FULL_RD &&
- mbmi->ref_frame[0] == INTRA_FRAME)) {
+ !b_inter_mode)) {
vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
ref_best_rd, bs);
@@ -943,7 +950,7 @@ static void super_block_yrd(VP9_COMP *cpi,
}
if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
- mbmi->ref_frame[0] > INTRA_FRAME) {
+ b_inter_mode) {
if (bs >= BLOCK_32X32)
model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
&r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
@@ -961,14 +968,16 @@ static void super_block_yrd(VP9_COMP *cpi,
skip, sse, ref_best_rd, bs);
} else {
if (bs >= BLOCK_32X32)
- txfm_rd_in_plane(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
- &sse[TX_32X32], ref_best_rd, 0, bs, TX_32X32);
+ txfm_rd_in_plane(x, rdcost_stack, &r[TX_32X32][0], &d[TX_32X32],
+ &s[TX_32X32], &sse[TX_32X32],
+ ref_best_rd, 0, bs, TX_32X32);
if (bs >= BLOCK_16X16)
- txfm_rd_in_plane(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
- &sse[TX_16X16], ref_best_rd, 0, bs, TX_16X16);
- txfm_rd_in_plane(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+ txfm_rd_in_plane(x, rdcost_stack, &r[TX_16X16][0], &d[TX_16X16],
+ &s[TX_16X16], &sse[TX_16X16],
+ ref_best_rd, 0, bs, TX_16X16);
+ txfm_rd_in_plane(x, rdcost_stack, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
&sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8);
- txfm_rd_in_plane(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+ txfm_rd_in_plane(x, rdcost_stack, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
&sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4);
choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
skip, txfm_cache, bs);
@@ -1022,23 +1031,23 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
ENTROPY_CONTEXT ta[2], tempa[2];
ENTROPY_CONTEXT tl[2], templ[2];
- TX_TYPE tx_type = DCT_DCT;
+
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
- int idx, idy, block;
+ int idx, idy;
uint8_t best_dst[8 * 8];
assert(ib < 4);
vpx_memcpy(ta, a, sizeof(ta));
vpx_memcpy(tl, l, sizeof(tl));
- xd->this_mi->mbmi.tx_size = TX_4X4;
+ xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
int64_t this_rd;
int ratey = 0;
- if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
+ if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
continue;
// Only do the oblique modes if the best so far is
@@ -1058,11 +1067,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
int64_t ssz;
const int16_t *scan;
+ const int16_t *nb;
uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
-
- block = ib + idy * 2 + idx;
- xd->this_mi->bmi[block].as_mode = mode;
+ const int block = ib + idy * 2 + idx;
+ TX_TYPE tx_type;
+ xd->mi_8x8[0]->bmi[block].as_mode = mode;
src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
vp9_predict_intra_block(xd, block, 1,
@@ -1075,29 +1085,28 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
dst, dst_stride);
tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block);
- if (tx_type != DCT_DCT) {
+ get_scan_nb_4x4(tx_type, &scan, &nb);
+
+ if (tx_type != DCT_DCT)
vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
- x->quantize_b_4x4(x, block, tx_type, 16);
- } else {
- x->fwd_txm4x4(src_diff, coeff, 16);
- x->quantize_b_4x4(x, block, tx_type, 16);
- }
+ else
+ x->fwd_txm4x4(src_diff, coeff, 8);
+
+ vp9_regular_quantize_b_4x4(x, 4, block, scan, get_iscan_4x4(tx_type));
- scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block));
ratey += cost_coeffs(x, 0, block,
- tempa + idx, templ + idy, TX_4X4, scan,
- vp9_get_coef_neighbors_handle(scan));
+ tempa + idx, templ + idy, TX_4X4, scan, nb);
distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
16, &ssz) >> 2;
if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
goto next;
if (tx_type != DCT_DCT)
- vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
+ vp9_iht4x4_16_add(BLOCK_OFFSET(pd->dqcoeff, block),
dst, pd->dst.stride, tx_type);
else
- xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
- dst, pd->dst.stride);
+ xd->itxm_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride,
+ 16);
}
}
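
In the reworked 4x4 intra loop the scan and neighbor tables come from a single get_scan_nb_4x4() call, the hybrid transform runs only when tx_type != DCT_DCT, and quantization goes through vp9_regular_quantize_b_4x4 with the matching scan/iscan pair; note the forward-transform stride also drops from 16 to 8, the width of the BLOCK_8X8 residual buffer. The dispatch reduced to a sketch, with stand-in kernel bodies rather than the real transforms:

    #include <string.h>

    typedef enum { DCT_DCT, ADST_DCT, DCT_ADST, ADST_ADST } TX_TYPE;

    /* Stand-ins for vp9_short_fht4x4() and x->fwd_txm4x4(). */
    static void fht4x4(const short *in, short *out, int stride, TX_TYPE t) {
      (void)stride; (void)t;
      memcpy(out, in, 16 * sizeof(*out));
    }
    static void fdct4x4(const short *in, short *out, int stride) {
      (void)stride;
      memcpy(out, in, 16 * sizeof(*out));
    }

    /* One kernel pick per 4x4 block; stride 8 matches the 8-pixel-wide
     * BLOCK_8X8 source-difference buffer. */
    static void fwd_txfm_4x4(const short *src_diff, short *coeff, TX_TYPE t) {
      if (t != DCT_DCT)
        fht4x4(src_diff, coeff, 8, t);
      else
        fdct4x4(src_diff, coeff, 8);
    }
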
@@ -1138,10 +1147,10 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi,
int64_t best_rd) {
int i, j;
MACROBLOCKD *const xd = &mb->e_mbd;
- MODE_INFO *const mic = xd->this_mi;
+ MODE_INFO *const mic = xd->mi_8x8[0];
const MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
- const MODE_INFO *left_mi = xd->mi_8x8[-1];
- const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type;
+ const MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
+ const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
int idx, idy;
@@ -1166,9 +1175,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi,
i = idy * 2 + idx;
if (cpi->common.frame_type == KEY_FRAME) {
const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, i);
- const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
- left_block_mode(mic, left_mi, i) :
- DC_PRED;
+ const MB_PREDICTION_MODE L = left_block_mode(mic, left_mi, i);
bmode_costs = mb->y_mode_costs[A][L];
}
@@ -1212,7 +1219,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
MB_PREDICTION_MODE mode;
MB_PREDICTION_MODE mode_selected = DC_PRED;
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *const mic = xd->this_mi;
+ MODE_INFO *const mic = xd->mi_8x8[0];
int this_rate, this_rate_tokenonly, s;
int64_t this_distortion, this_rd;
TX_SIZE best_tx = TX_4X4;
@@ -1227,15 +1234,14 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
int64_t local_tx_cache[TX_MODES];
MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
- MODE_INFO *left_mi = xd->mi_8x8[-1];
+ MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
- if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
+ if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode)))
continue;
if (cpi->common.frame_type == KEY_FRAME) {
const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, 0);
- const MB_PREDICTION_MODE L = xd->left_available ?
- left_block_mode(mic, left_mi, 0) : DC_PRED;
+ const MB_PREDICTION_MODE L = left_block_mode(mic, left_mi, 0);
bmode_costs = x->y_mode_costs[A][L];
}
@@ -1277,12 +1283,12 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
return best_rd;
}
-static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
+static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
int *rate, int64_t *distortion, int *skippable,
int64_t *sse, BLOCK_SIZE bsize,
int64_t ref_best_rd) {
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
int plane;
int pnrate = 0, pnskip = 1;
@@ -1300,7 +1306,7 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
*skippable = 1;
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
- txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
+ txfm_rd_in_plane(x, &cpi->rdcost_stack, &pnrate, &pndist, &pnskip, &pnsse,
ref_best_rd, plane, bsize, uv_txfm_size);
if (pnrate == INT_MAX)
goto term;
@@ -1332,14 +1338,15 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
// int mode_mask = (bsize <= BLOCK_8X8)
// ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask;
- for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ for (mode = DC_PRED; mode <= TM_PRED; mode ++) {
// if (!(mode_mask & (1 << mode)))
- if (!(cpi->sf.intra_uv_mode_mask & (1 << mode)))
+ if (!(cpi->sf.intra_uv_mode_mask[max_uv_txsize_lookup[bsize]]
+ & (1 << mode)))
continue;
x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode;
- super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
+ super_block_uvrd(cpi, x, &this_rate_tokenonly,
&this_distortion, &s, &this_sse, bsize, best_rd);
if (this_rate_tokenonly == INT_MAX)
continue;
@@ -1370,8 +1377,8 @@ static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
int64_t this_sse;
x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED;
- super_block_uvrd(&cpi->common, x, rate_tokenonly,
- distortion, skippable, &this_sse, bsize, INT64_MAX);
+ super_block_uvrd(cpi, x, rate_tokenonly, distortion,
+ skippable, &this_sse, bsize, INT64_MAX);
*rate = *rate_tokenonly +
x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED];
this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
@@ -1404,12 +1411,12 @@ static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
int mode_context) {
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- const int segment_id = xd->this_mi->mbmi.segment_id;
+ const int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
// Don't account for mode here if segment skip is enabled.
if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
assert(is_inter_mode(mode));
- return x->inter_mode_cost[mode_context][mode - NEARESTMV];
+ return x->inter_mode_cost[mode_context][inter_mode_offset(mode)];
} else {
return 0;
}
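
inter_mode_offset(), used here and in the segmentation hunks below, simply names the mode - NEARESTMV arithmetic it replaces; the cost-table indexing is unchanged. Assuming the usual contiguous ordering of the VP9 inter modes:

    /* Inter modes are contiguous in MB_PREDICTION_MODE, NEARESTMV first
     * (the enum value is assumed). */
    enum { NEARESTMV = 10, NEARMV, ZEROMV, NEWMV };

    static int inter_mode_offset(int mode) {
      return mode - NEARESTMV;  /* NEARESTMV->0 ... NEWMV->3 */
    }
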
@@ -1426,10 +1433,6 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
int *rate_mv);
-static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize,
- int mi_row, int mi_col,
- int_mv *tmp_mv, int *rate_mv);
static int labels2mode(MACROBLOCK *x, int i,
MB_PREDICTION_MODE this_mode,
@@ -1440,12 +1443,13 @@ static int labels2mode(MACROBLOCK *x, int i,
int_mv *second_best_ref_mv,
int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *const mic = xd->this_mi;
+ MODE_INFO *const mic = xd->mi_8x8[0];
MB_MODE_INFO *mbmi = &mic->mbmi;
int cost = 0, thismvcost = 0;
int idx, idy;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+ const int has_second_rf = has_second_ref(mbmi);
/* We have to be careful retrieving previously-encoded motion vectors.
Ones from this macroblock have to be pulled from the BLOCKD array
@@ -1457,29 +1461,30 @@ static int labels2mode(MACROBLOCK *x, int i,
switch (m = this_mode) {
case NEWMV:
this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
- thismvcost = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
- 102);
- if (mbmi->ref_frame[1] > 0) {
+ thismvcost = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ if (has_second_rf) {
this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
- thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
- mvjcost, mvcost, 102);
+ thismvcost += vp9_mv_bit_cost(&this_second_mv->as_mv,
+ &second_best_ref_mv->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
}
break;
case NEARESTMV:
this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int;
- if (mbmi->ref_frame[1] > 0)
+ if (has_second_rf)
this_second_mv->as_int =
frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
break;
case NEARMV:
this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int;
- if (mbmi->ref_frame[1] > 0)
+ if (has_second_rf)
this_second_mv->as_int =
frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
break;
case ZEROMV:
this_mv->as_int = 0;
- if (mbmi->ref_frame[1] > 0)
+ if (has_second_rf)
this_second_mv->as_int = 0;
break;
default:
@@ -1490,10 +1495,11 @@ static int labels2mode(MACROBLOCK *x, int i,
mbmi->mode_context[mbmi->ref_frame[0]]);
mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
- if (mbmi->ref_frame[1] > 0)
+ if (has_second_rf)
mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
- x->partition_info->bmi[i].mode = m;
+ mic->bmi[i].as_mode = m;
+
for (idy = 0; idy < num_4x4_blocks_high; ++idy)
for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
@@ -1514,25 +1520,21 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
int k;
MACROBLOCKD *xd = &x->e_mbd;
struct macroblockd_plane *const pd = &xd->plane[0];
- MODE_INFO *const mi = xd->this_mi;
+ struct macroblock_plane *const p = &x->plane[0];
+ MODE_INFO *const mi = xd->mi_8x8[0];
const BLOCK_SIZE bsize = mi->mbmi.sb_type;
const int width = plane_block_width(bsize, pd);
const int height = plane_block_height(bsize, pd);
int idx, idy;
- const int src_stride = x->plane[0].src.stride;
- uint8_t* const src = raster_block_offset_uint8(BLOCK_8X8, i,
- x->plane[0].src.buf,
- src_stride);
- int16_t* src_diff = raster_block_offset_int16(BLOCK_8X8, i,
- x->plane[0].src_diff);
- int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, i);
- uint8_t* const dst = raster_block_offset_uint8(BLOCK_8X8, i,
+
+ uint8_t *const src = raster_block_offset_uint8(BLOCK_8X8, i,
+ p->src.buf, p->src.stride);
+ uint8_t *const dst = raster_block_offset_uint8(BLOCK_8X8, i,
pd->dst.buf, pd->dst.stride);
int64_t thisdistortion = 0, thissse = 0;
- int thisrate = 0;
- int ref, second_ref = has_second_ref(&mi->mbmi);
-
- for (ref = 0; ref < 1 + second_ref; ++ref) {
+ int thisrate = 0, ref;
+ const int is_compound = has_second_ref(&mi->mbmi);
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
const uint8_t *pre = raster_block_offset_uint8(BLOCK_8X8, i,
pd->pre[ref].buf, pd->pre[ref].stride);
vp9_build_inter_predictor(pre, pd->pre[ref].stride,
@@ -1542,20 +1544,23 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
width, height, ref, &xd->subpix, MV_PRECISION_Q3);
}
- vp9_subtract_block(height, width, src_diff, 8, src, src_stride,
+ vp9_subtract_block(height, width,
+ raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
+ src, p->src.stride,
dst, pd->dst.stride);
k = i;
for (idy = 0; idy < height / 4; ++idy) {
for (idx = 0; idx < width / 4; ++idx) {
int64_t ssz, rd, rd1, rd2;
+ int16_t* coeff;
k += (idy * 2 + idx);
- src_diff = raster_block_offset_int16(BLOCK_8X8, k,
- x->plane[0].src_diff);
- coeff = BLOCK_OFFSET(x->plane[0].coeff, k);
- x->fwd_txm4x4(src_diff, coeff, 16);
- x->quantize_b_4x4(x, k, DCT_DCT, 16);
+ coeff = BLOCK_OFFSET(p->coeff, k);
+ x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
+ coeff, 8);
+ vp9_regular_quantize_b_4x4(x, 4, k, get_scan_4x4(DCT_DCT),
+ get_iscan_4x4(DCT_DCT));
thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
16, &ssz);
thissse += ssz;
@@ -1571,6 +1576,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
return INT64_MAX;
}
}
+
*distortion = thisdistortion >> 2;
*labelyrate = thisrate;
*sse = thissse >> 2;
@@ -1623,7 +1629,7 @@ static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
pd->pre[0].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[0].buf,
pd->pre[0].stride);
- if (mbmi->ref_frame[1])
+ if (has_second_ref(mbmi))
pd->pre[1].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[1].buf,
pd->pre[1].stride);
}
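
From here on the diff swaps the three spellings of the compound-prediction test (ref_frame[1] > 0, <= 0, and plain truthiness) for one has_second_ref() predicate, often hoisted into a local has_second_rf. A minimal sketch of the predicate, with the reference-frame enum values assumed:

    typedef enum { NONE = -1, INTRA_FRAME, LAST_FRAME, GOLDEN_FRAME,
                   ALTREF_FRAME } MV_REFERENCE_FRAME;

    typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } MB_MODE_INFO_sketch;

    /* A block is compound-predicted iff its second reference names a
     * real frame. */
    static int has_second_ref(const MB_MODE_INFO_sketch *mbmi) {
      return mbmi->ref_frame[1] > INTRA_FRAME;
    }
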
@@ -1633,19 +1639,21 @@ static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
MB_MODE_INFO *mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
x->plane[0].src = orig_src;
x->e_mbd.plane[0].pre[0] = orig_pre[0];
- if (mbmi->ref_frame[1])
+ if (has_second_ref(mbmi))
x->e_mbd.plane[0].pre[1] = orig_pre[1];
}
static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
BEST_SEG_INFO *bsi_buf, int filter_idx,
int_mv seg_mvs[4][MAX_REF_FRAMES],
int mi_row, int mi_col) {
- int i, j, br = 0, idx, idy;
+ int i, br = 0, idx, idy;
int64_t bd = 0, block_sse = 0;
MB_PREDICTION_MODE this_mode;
MODE_INFO *mi = x->e_mbd.mi_8x8[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
+ struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
const int label_count = 4;
int64_t this_segment_rd = 0;
int label_mv_thresh;
@@ -1658,9 +1666,10 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
int mode_idx;
int subpelmv = 1, have_ref = 0;
+ const int has_second_rf = has_second_ref(mbmi);
- vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
- vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
+ vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
+ vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
v_fn_ptr = &cpi->fn_ptr[bsize];
@@ -1682,17 +1691,17 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
i = idy * 2 + idx;
frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0;
- frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0;
- vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd,
+ vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile,
&frame_mv[NEARESTMV][mbmi->ref_frame[0]],
&frame_mv[NEARMV][mbmi->ref_frame[0]],
i, 0, mi_row, mi_col);
- if (mbmi->ref_frame[1] > 0)
- vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd,
- &frame_mv[NEARESTMV][mbmi->ref_frame[1]],
- &frame_mv[NEARMV][mbmi->ref_frame[1]],
- i, 1, mi_row, mi_col);
-
+ if (has_second_rf) {
+ frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0;
+ vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile,
+ &frame_mv[NEARESTMV][mbmi->ref_frame[1]],
+ &frame_mv[NEARMV][mbmi->ref_frame[1]],
+ i, 1, mi_row, mi_col);
+ }
// search for the best motion vector on this segment
for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
const struct buf_2d orig_src = x->plane[0].src;
@@ -1705,7 +1714,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
if ((this_mode == NEARMV || this_mode == NEARESTMV ||
this_mode == ZEROMV) &&
frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 &&
- (mbmi->ref_frame[1] <= 0 ||
+ (!has_second_rf ||
frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) {
int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
int c1 = cost_mv_ref(cpi, NEARMV, rfc);
@@ -1720,7 +1729,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
continue;
} else {
assert(this_mode == ZEROMV);
- if (mbmi->ref_frame[1] <= 0) {
+ if (!has_second_rf) {
if ((c3 >= c2 &&
frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
(c3 >= c1 &&
@@ -1738,14 +1747,14 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
+ vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
sizeof(bsi->rdstat[i][mode_idx].ta));
vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
sizeof(bsi->rdstat[i][mode_idx].tl));
// motion search for newmv (single predictor case only)
- if (mbmi->ref_frame[1] <= 0 && this_mode == NEWMV &&
+ if (!has_second_rf && this_mode == NEWMV &&
seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
int step_param = 0;
int further_steps;
@@ -1795,20 +1804,23 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
// adjust src pointer for this block
mi_buf_shift(x, i);
if (cpi->sf.search_method == HEX) {
- bestsme = vp9_hex_search(x, &mvp_full,
+ bestsme = vp9_hex_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1, v_fn_ptr, 1,
- bsi->ref_mv, &mode_mv[NEWMV]);
+ &bsi->ref_mv->as_mv,
+ &mode_mv[NEWMV].as_mv);
} else if (cpi->sf.search_method == SQUARE) {
- bestsme = vp9_square_search(x, &mvp_full,
+ bestsme = vp9_square_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1, v_fn_ptr, 1,
- bsi->ref_mv, &mode_mv[NEWMV]);
+ &bsi->ref_mv->as_mv,
+ &mode_mv[NEWMV].as_mv);
} else if (cpi->sf.search_method == BIGDIA) {
- bestsme = vp9_bigdia_search(x, &mvp_full,
+ bestsme = vp9_bigdia_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1, v_fn_ptr, 1,
- bsi->ref_mv, &mode_mv[NEWMV]);
+ &bsi->ref_mv->as_mv,
+ &mode_mv[NEWMV].as_mv);
} else {
bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
sadpb, further_steps, 0, v_fn_ptr,
@@ -1839,8 +1851,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
if (bestsme < INT_MAX) {
int distortion;
unsigned int sse;
- cpi->find_fractional_mv_step(x, &mode_mv[NEWMV],
- bsi->ref_mv, x->errorperbit, v_fn_ptr,
+ cpi->find_fractional_mv_step(x,
+ &mode_mv[NEWMV].as_mv,
+ &bsi->ref_mv->as_mv,
+ cpi->common.allow_high_precision_mv,
+ x->errorperbit, v_fn_ptr,
0, cpi->sf.subpel_iters_per_step,
x->nmvjointcost, x->mvcost,
&distortion, &sse);
@@ -1856,12 +1871,14 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
mi_buf_restore(x, orig_src, orig_pre);
}
- if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV &&
- mbmi->interp_filter == EIGHTTAP) {
+ if (has_second_rf) {
if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
continue;
+ }
+ if (has_second_rf && this_mode == NEWMV &&
+ mbmi->interp_filter == EIGHTTAP) {
// adjust src pointers
mi_buf_shift(x, i);
if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
@@ -1891,7 +1908,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
if (num_4x4_blocks_high > 1)
bsi->rdstat[i + 2][mode_idx].mvs[0].as_int =
mode_mv[this_mode].as_int;
- if (mbmi->ref_frame[1] > 0) {
+ if (has_second_rf) {
bsi->rdstat[i][mode_idx].mvs[1].as_int =
second_mode_mv[this_mode].as_int;
if (num_4x4_blocks_wide > 1)
@@ -1905,7 +1922,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
// Trap vectors that reach beyond the UMV borders
if (mv_check_bounds(x, &mode_mv[this_mode]))
continue;
- if (mbmi->ref_frame[1] > 0 &&
+ if (has_second_rf &&
mv_check_bounds(x, &second_mode_mv[this_mode]))
continue;
@@ -1915,7 +1932,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
(mode_mv[this_mode].as_mv.col & 0x0f);
have_ref = mode_mv[this_mode].as_int ==
ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
- if (mbmi->ref_frame[1] > 0) {
+ if (has_second_rf) {
subpelmv |= (second_mode_mv[this_mode].as_mv.row & 0x0f) ||
(second_mode_mv[this_mode].as_mv.col & 0x0f);
have_ref &= second_mode_mv[this_mode].as_int ==
@@ -1926,7 +1943,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
ref_bsi = bsi_buf + 1;
have_ref = mode_mv[this_mode].as_int ==
ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
- if (mbmi->ref_frame[1] > 0) {
+ if (has_second_rf) {
have_ref &= second_mode_mv[this_mode].as_int ==
ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
}
@@ -1936,6 +1953,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
sizeof(SEG_RDSTAT));
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[i + 1][mode_idx].eobs =
+ ref_bsi->rdstat[i + 1][mode_idx].eobs;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[i + 2][mode_idx].eobs =
+ ref_bsi->rdstat[i + 2][mode_idx].eobs;
+
if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
mode_selected = this_mode;
best_rd = bsi->rdstat[i][mode_idx].brdcost;
@@ -1956,7 +1980,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
bsi->rdstat[i][mode_idx].brate, 0);
bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
- bsi->rdstat[i][mode_idx].eobs = x->e_mbd.plane[0].eobs[i];
+ bsi->rdstat[i][mode_idx].eobs = pd->eobs[i];
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[i + 1][mode_idx].eobs = pd->eobs[i + 1];
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[i + 2][mode_idx].eobs = pd->eobs[i + 2];
}
if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
@@ -1997,15 +2025,6 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
bsi->segment_rd = INT64_MAX;
return;
}
-
- for (j = 1; j < num_4x4_blocks_high; ++j)
- vpx_memcpy(&x->partition_info->bmi[i + j * 2],
- &x->partition_info->bmi[i],
- sizeof(x->partition_info->bmi[i]));
- for (j = 1; j < num_4x4_blocks_wide; ++j)
- vpx_memcpy(&x->partition_info->bmi[i + j],
- &x->partition_info->bmi[i],
- sizeof(x->partition_info->bmi[i]));
}
} /* for each label */
@@ -2017,10 +2036,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
// update the coding decisions
for (i = 0; i < 4; ++i)
- bsi->modes[i] = x->partition_info->bmi[i].mode;
+ bsi->modes[i] = mi->bmi[i].as_mode;
}
static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
int_mv *best_ref_mv,
int_mv *second_best_ref_mv,
int64_t best_rd,
@@ -2036,7 +2056,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
int i;
BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
MACROBLOCKD *xd = &x->e_mbd;
- MODE_INFO *mi = xd->this_mi;
+ MODE_INFO *mi = xd->mi_8x8[0];
MB_MODE_INFO *mbmi = &mi->mbmi;
int mode_idx;
@@ -2051,7 +2071,8 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < 4; i++)
bsi->modes[i] = ZEROMV;
- rd_check_segment_txsize(cpi, x, bsi_buf, filter_idx, seg_mvs, mi_row, mi_col);
+ rd_check_segment_txsize(cpi, x, tile, bsi_buf, filter_idx, seg_mvs,
+ mi_row, mi_col);
if (bsi->segment_rd > best_rd)
return INT64_MAX;
@@ -2059,10 +2080,10 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < 4; i++) {
mode_idx = inter_mode_offset(bsi->modes[i]);
mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
- if (mbmi->ref_frame[1] > 0)
+ if (has_second_ref(mbmi))
mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
- x->partition_info->bmi[i].mode = bsi->modes[i];
+ mi->bmi[i].as_mode = bsi->modes[i];
}
/*
@@ -2082,7 +2103,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *ref_y_buffer, int ref_y_stride,
int ref_frame, BLOCK_SIZE block_size ) {
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
int_mv this_mv;
int i;
int zero_seen = 0;
@@ -2195,22 +2216,18 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int mode_index,
- PARTITION_INFO *partition,
int_mv *ref_mv,
int_mv *second_ref_mv,
int64_t comp_pred_diff[NB_PREDICTION_TYPES],
int64_t tx_size_diff[TX_MODES],
- int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]) {
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
MACROBLOCKD *const xd = &x->e_mbd;
// Take a snapshot of the coding context so it can be
// restored if we decide to encode this way
ctx->skip = x->skip;
ctx->best_mode_index = mode_index;
- ctx->mic = *xd->this_mi;
-
- if (partition)
- ctx->partition_info = *partition;
+ ctx->mic = *xd->mi_8x8[0];
ctx->best_ref_mv.as_int = ref_mv->as_int;
ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
@@ -2219,11 +2236,9 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
ctx->comp_pred_diff = (int)comp_pred_diff[COMP_PREDICTION_ONLY];
ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION];
- // FIXME(rbultje) does this memcpy the whole array? I believe sizeof()
- // doesn't actually work this way
- memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
- memcpy(ctx->best_filter_diff, best_filter_diff,
- sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1));
+ vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
+ vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
+ sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
}
static void setup_pred_block(const MACROBLOCKD *xd,
@@ -2253,6 +2268,7 @@ static void setup_pred_block(const MACROBLOCKD *xd,
}
static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
int idx, MV_REFERENCE_FRAME frame_type,
BLOCK_SIZE block_size,
int mi_row, int mi_col,
@@ -2263,17 +2279,13 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
VP9_COMMON *cm = &cpi->common;
YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
// set up scaling factors
scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
- scale[frame_type].x_offset_q4 =
- ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].x_scale_fp,
- REF_SCALE_SHIFT) & 0xf;
- scale[frame_type].y_offset_q4 =
- ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].y_scale_fp,
- REF_SCALE_SHIFT) & 0xf;
+ scale[frame_type].sfc->set_scaled_offsets(&scale[frame_type],
+ mi_row * MI_SIZE, mi_col * MI_SIZE);
// TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
// use the UV scaling factors.
@@ -2281,13 +2293,13 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
&scale[frame_type], &scale[frame_type]);
// Gets an initial list of candidate vectors from neighbours and orders them
- vp9_find_mv_refs(&cpi->common, xd, xd->this_mi,
+ vp9_find_mv_refs(cm, xd, tile, xd->mi_8x8[0],
xd->last_mi,
frame_type,
mbmi->ref_mvs[frame_type], mi_row, mi_col);
// Candidate refinement carried out at encoder and decoder
- vp9_find_best_ref_mvs(xd,
+ vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv,
mbmi->ref_mvs[frame_type],
&frame_nearest_mv[frame_type],
&frame_near_mv[frame_type]);
@@ -2295,7 +2307,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
// Further refinement that is encode side only to test the top few candidates
// in full and choose the best as the centre point for subsequent searches.
// The current implementation doesn't support scaling.
- if (!vp9_is_scaled(&scale[frame_type]))
+ if (!vp9_is_scaled(scale[frame_type].sfc) && block_size >= BLOCK_8X8)
mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride,
frame_type, block_size);
}
@@ -2311,19 +2323,20 @@ static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
static INLINE int get_switchable_rate(const MACROBLOCK *x) {
const MACROBLOCKD *const xd = &x->e_mbd;
- const MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ const MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
const int ctx = vp9_get_pred_context_switchable_interp(xd);
return SWITCHABLE_INTERP_RATE_FACTOR *
x->switchable_interp_costs[ctx][mbmi->interp_filter];
}
static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
BLOCK_SIZE bsize,
int mi_row, int mi_col,
int_mv *tmp_mv, int *rate_mv) {
MACROBLOCKD *xd = &x->e_mbd;
VP9_COMMON *cm = &cpi->common;
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
int bestsme = INT_MAX;
int further_steps, step_param;
@@ -2402,23 +2415,23 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
if (cpi->sf.search_method == HEX) {
- bestsme = vp9_hex_search(x, &mvp_full,
+ bestsme = vp9_hex_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1,
&cpi->fn_ptr[block_size], 1,
- &ref_mv, tmp_mv);
+ &ref_mv.as_mv, &tmp_mv->as_mv);
} else if (cpi->sf.search_method == SQUARE) {
- bestsme = vp9_square_search(x, &mvp_full,
+ bestsme = vp9_square_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1,
&cpi->fn_ptr[block_size], 1,
- &ref_mv, tmp_mv);
+ &ref_mv.as_mv, &tmp_mv->as_mv);
} else if (cpi->sf.search_method == BIGDIA) {
- bestsme = vp9_bigdia_search(x, &mvp_full,
+ bestsme = vp9_bigdia_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1,
&cpi->fn_ptr[block_size], 1,
- &ref_mv, tmp_mv);
+ &ref_mv.as_mv, &tmp_mv->as_mv);
} else {
bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
sadpb, further_steps, 1,
@@ -2434,16 +2447,16 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
if (bestsme < INT_MAX) {
int dis; /* TODO: use dis in distortion calculation later. */
unsigned int sse;
- cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv,
+ cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
+ cm->allow_high_precision_mv,
x->errorperbit,
&cpi->fn_ptr[block_size],
0, cpi->sf.subpel_iters_per_step,
x->nmvjointcost, x->mvcost,
&dis, &sse);
}
- *rate_mv = vp9_mv_bit_cost(tmp_mv, &ref_mv,
- x->nmvjointcost, x->mvcost,
- 96);
+ *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
if (cpi->sf.adaptive_motion_search && cpi->common.show_frame)
x->pred_mv[ref].as_int = tmp_mv->as_int;
@@ -2463,7 +2476,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int *rate_mv) {
int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
int refs[2] = { mbmi->ref_frame[0],
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int_mv ref_mv[2];
@@ -2499,12 +2512,12 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < MAX_MB_PLANE; i++)
backup_second_yv12[i] = xd->plane[i].pre[1];
- setup_pre_planes(xd, 0, scaled_ref_frame[1], mi_row, mi_col, NULL);
+ setup_pre_planes(xd, 1, scaled_ref_frame[1], mi_row, mi_col, NULL);
}
- xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+ xd->scale_factor[0].sfc->set_scaled_offsets(&xd->scale_factor[0],
mi_row, mi_col);
- xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+ xd->scale_factor[1].sfc->set_scaled_offsets(&xd->scale_factor[1],
mi_row, mi_col);
scaled_first_yv12 = xd->plane[0].pre[0];
@@ -2569,8 +2582,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
unsigned int sse;
bestsme = cpi->find_fractional_mv_step_comp(
- x, &tmp_mv,
- &ref_mv[id],
+ x, &tmp_mv.as_mv,
+ &ref_mv[id].as_mv,
+ cpi->common.allow_high_precision_mv,
x->errorperbit,
&cpi->fn_ptr[block_size],
0, cpi->sf.subpel_iters_per_step,
@@ -2602,17 +2616,18 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < MAX_MB_PLANE; i++)
xd->plane[i].pre[1] = backup_second_yv12[i];
}
- *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]],
- &mbmi->ref_mvs[refs[0]][0],
- x->nmvjointcost, x->mvcost, 96);
- *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]],
- &mbmi->ref_mvs[refs[1]][0],
- x->nmvjointcost, x->mvcost, 96);
+ *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
+ &mbmi->ref_mvs[refs[0]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
+ &mbmi->ref_mvs[refs[1]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
vpx_free(second_pred);
}
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
BLOCK_SIZE bsize,
int64_t txfm_cache[],
int *rate2, int64_t *distortion,
@@ -2620,7 +2635,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int *rate_y, int64_t *distortion_y,
int *rate_uv, int64_t *distortion_uv,
int *mode_excluded, int *disable_skip,
- INTERPOLATIONFILTERTYPE *best_filter,
+ INTERPOLATION_TYPE *best_filter,
int_mv (*mode_mv)[MAX_REF_FRAMES],
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
@@ -2628,8 +2643,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
const int64_t ref_best_rd) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
- const int is_comp_pred = (mbmi->ref_frame[1] > 0);
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+ const int is_comp_pred = has_second_ref(mbmi);
const int num_refs = is_comp_pred ? 2 : 1;
const int this_mode = mbmi->mode;
int_mv *frame_mv = mode_mv[this_mode];
@@ -2647,6 +2662,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int orig_dst_stride[MAX_MB_PLANE];
int rs = 0;
+ if (is_comp_pred) {
+ if (frame_mv[refs[0]].as_int == INVALID_MV ||
+ frame_mv[refs[1]].as_int == INVALID_MV)
+ return INT64_MAX;
+ }
+
if (this_mode == NEWMV) {
int rate_mv;
if (is_comp_pred) {
@@ -2658,23 +2679,21 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
joint_motion_search(cpi, x, bsize, frame_mv,
mi_row, mi_col, single_newmv, &rate_mv);
} else {
- rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]],
- &mbmi->ref_mvs[refs[0]][0],
- x->nmvjointcost, x->mvcost, 96);
- rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]],
- &mbmi->ref_mvs[refs[1]][0],
- x->nmvjointcost, x->mvcost, 96);
+ rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
+ &mbmi->ref_mvs[refs[0]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
+ &mbmi->ref_mvs[refs[1]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
- if (frame_mv[refs[0]].as_int == INVALID_MV ||
- frame_mv[refs[1]].as_int == INVALID_MV)
- return INT64_MAX;
*rate2 += rate_mv;
} else {
int_mv tmp_mv;
- single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
+ single_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+ &tmp_mv, &rate_mv);
*rate2 += rate_mv;
frame_mv[refs[0]].as_int =
- xd->this_mi->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+ xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
single_newmv[refs[0]].as_int = tmp_mv.as_int;
}
}
@@ -2904,7 +2923,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
unsigned int thresh_ac;
// The encode_breakout input
unsigned int encode_breakout = x->encode_breakout << 4;
- int max_thresh = 36000;
+ unsigned int max_thresh = 36000;
// Use extreme low threshold for static frames to limit skipping.
if (cpi->enable_encode_breakout == 2)
@@ -3001,7 +3020,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
- super_block_uvrd(cm, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
+ super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
bsize, ref_best_rd - rdcosty);
if (*rate_uv == INT_MAX) {
*rate2 = INT_MAX;
@@ -3038,7 +3057,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
x->skip_encode = 0;
ctx->skip = 0;
- xd->this_mi->mbmi.ref_frame[0] = INTRA_FRAME;
+ xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
if (bsize >= BLOCK_8X8) {
if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
&dist_y, &y_skip, bsize, tx_cache,
@@ -3070,14 +3089,19 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0);
*returndist = dist_y + dist_uv;
if (cpi->sf.tx_size_search_method == USE_FULL_RD)
- for (i = 0; i < TX_MODES; i++)
- ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
+ for (i = 0; i < TX_MODES; i++) {
+ if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX)
+ ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
+ else
+ ctx->tx_rd_diff[i] = 0;
+ }
}
- ctx->mic = *xd->this_mi;
+ ctx->mic = *xd->mi_8x8[0];
}
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
int mi_row, int mi_col,
int *returnrate,
int64_t *returndistortion,
@@ -3086,10 +3110,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_rd_so_far) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
const struct segmentation *seg = &cm->seg;
const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
- RD_PREDICTION_MODE this_mode;
+ MB_PREDICTION_MODE this_mode;
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
unsigned char segment_id = mbmi->segment_id;
int comp_pred, i;
@@ -3103,13 +3127,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
cpi->gld_fb_idx,
cpi->alt_fb_idx};
int64_t best_rd = best_rd_so_far;
- int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise
int64_t best_tx_rd[TX_MODES];
int64_t best_tx_diff[TX_MODES];
int64_t best_pred_diff[NB_PREDICTION_TYPES];
int64_t best_pred_rd[NB_PREDICTION_TYPES];
- int64_t best_filter_rd[SWITCHABLE_FILTERS + 1];
- int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+ int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
MB_MODE_INFO best_mbmode = { 0 };
int j;
int mode_index, best_mode_index = 0;
@@ -3118,9 +3141,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_intra_rd = INT64_MAX;
int64_t best_inter_rd = INT64_MAX;
MB_PREDICTION_MODE best_intra_mode = DC_PRED;
- // MB_PREDICTION_MODE best_inter_mode = ZEROMV;
MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
- INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
+ INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE;
int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
int64_t dist_uv[TX_SIZES];
int skip_uv[TX_SIZES];
@@ -3130,22 +3152,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
unsigned int mode_mask = 0;
int64_t mode_distortions[MB_MODE_COUNT] = {-1};
int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
- int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
- cpi->common.y_dc_delta_q);
- int_mv seg_mvs[4][MAX_REF_FRAMES];
- union b_mode_info best_bmodes[4];
- PARTITION_INFO best_partition;
+ int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
int best_skip2 = 0;
x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
- for (i = 0; i < 4; i++) {
- int j;
- for (j = 0; j < MAX_REF_FRAMES; j++)
- seg_mvs[i][j].as_int = INVALID_MV;
- }
// Everywhere the flag is set the error is much higher than its neighbors.
ctx->frames_with_high_error = 0;
ctx->modes_with_high_error = 0;
@@ -3157,7 +3170,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_pred_rd[i] = INT64_MAX;
for (i = 0; i < TX_MODES; i++)
best_tx_rd[i] = INT64_MAX;
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = INT64_MAX;
for (i = 0; i < TX_SIZES; i++)
rate_uv_intra[i] = INT_MAX;
@@ -3199,8 +3212,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
- setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
- mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
+ setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame,
+ block_size, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV],
yv12_mb, scale_factor);
}
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
@@ -3251,7 +3265,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
assert(!"Invalid Reference frame");
}
}
- if (cpi->mode_skip_mask & (1 << mode_index))
+ if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
continue;
}
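
The widening cast above matters: the mode skip mask is indexed by mode_index values that can reach past bit 30, and shifting a plain int 1 by 31 or more is undefined behavior in C. A hedged sketch of the fixed test (illustrative names, not the encoder's actual declarations):

    #include <stdint.h>

    static int is_mode_skipped(int64_t mode_skip_mask, int mode_index) {
      /* (1 << mode_index) is undefined for mode_index >= 31 with a
       * 32-bit int; widening the constant first keeps all 64 mask
       * bits usable. */
      return (mode_skip_mask & ((int64_t)1 << mode_index)) != 0;
    }
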
@@ -3261,9 +3275,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
continue;
// Test best rd so far against threshold for trying this mode.
- if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] *
- cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 5)) ||
- cpi->rd_threshes[bsize][mode_index] == INT_MAX)
+ if ((best_rd < ((int64_t)cpi->rd_threshes[segment_id][bsize][mode_index] *
+ cpi->rd_thresh_freq_fact[bsize][mode_index] >> 5)) ||
+ cpi->rd_threshes[segment_id][bsize][mode_index] == INT_MAX)
continue;
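
Two things changed in that test: the threshold table gained a segment_id dimension, and the product is now computed in 64 bits. rd_thresh_freq_fact appears to act as a Q5 multiplier (32 meaning 1.0), hence the shift by 5. A sketch under those assumptions, with illustrative names:

    #include <limits.h>
    #include <stdint.h>

    static int skip_mode_early(int64_t best_rd, int thresh, int freq_fact) {
      if (thresh == INT_MAX)
        return 1;  /* mode disabled outright */
      /* Cast before multiplying: thresh * freq_fact can overflow int. */
      return best_rd < (((int64_t)thresh * freq_fact) >> 5);
    }
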
// Do not allow compound prediction if the segment level reference
@@ -3313,25 +3327,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
second_ref_frame != best_inter_ref_frame)
continue;
}
- // TODO(jingning, jkoleszar): scaling reference frame not supported for
- // SPLITMV.
- if (ref_frame > 0 &&
- vp9_is_scaled(&scale_factor[ref_frame]) &&
- this_mode == RD_SPLITMV)
- continue;
-
- if (second_ref_frame > 0 &&
- vp9_is_scaled(&scale_factor[second_ref_frame]) &&
- this_mode == RD_SPLITMV)
- continue;
-
- if (bsize >= BLOCK_8X8 &&
- (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV))
- continue;
-
- if (bsize < BLOCK_8X8 &&
- !(this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV))
- continue;
set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
mbmi->uv_mode = DC_PRED;
@@ -3339,7 +3334,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// Evaluate all sub-pel filters irrespective of whether we can use
// them for this frame.
mbmi->interp_filter = cm->mcomp_filter_type;
- vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+ vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
if (comp_pred) {
if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
@@ -3373,7 +3368,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// If the segment skip feature is enabled....
// then do nothing if the current mode is not allowed..
} else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
- (this_mode != RD_ZEROMV && ref_frame != INTRA_FRAME)) {
+ (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
continue;
// Disable this drop out case if the ref frame
// segment level feature is enabled for this segment. This is to
@@ -3385,11 +3380,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// an unfiltered alternative. We allow near/nearest as well
// because they may result in zero-zero MVs but be cheaper.
if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
- if ((this_mode != RD_ZEROMV &&
- !(this_mode == RD_NEARMV &&
- frame_mv[RD_NEARMV][ALTREF_FRAME].as_int == 0) &&
- !(this_mode == RD_NEARESTMV &&
- frame_mv[RD_NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
+ if ((this_mode != ZEROMV &&
+ !(this_mode == NEARMV &&
+ frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) &&
+ !(this_mode == NEARESTMV &&
+ frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
ref_frame != ALTREF_FRAME) {
continue;
}
@@ -3401,7 +3396,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// a representative block in the boundary ( first ) and then implement a
// function that does sads when inside the border..
if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
- this_mode == RD_NEWMV) {
+ this_mode == NEWMV) {
continue;
}
@@ -3411,58 +3406,27 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
cpi->mode_test_hits[bsize]++;
#endif
- if (this_mode == RD_I4X4_PRED) {
- int rate;
-
- /*
- if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
- (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME))
- continue;
- */
- // RD_I4X4_PRED is only considered for block sizes less than 8x8.
- mbmi->tx_size = TX_4X4;
- if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
- &distortion_y, best_rd) >= best_rd)
- continue;
- rate2 += rate;
- rate2 += intra_cost_penalty;
- distortion2 += distortion_y;
-
- if (rate_uv_intra[TX_4X4] == INT_MAX) {
- choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4],
- &rate_uv_tokenonly[TX_4X4],
- &dist_uv[TX_4X4], &skip_uv[TX_4X4],
- &mode_uv[TX_4X4]);
- }
- rate2 += rate_uv_intra[TX_4X4];
- rate_uv = rate_uv_tokenonly[TX_4X4];
- distortion2 += dist_uv[TX_4X4];
- distortion_uv = dist_uv[TX_4X4];
- mbmi->uv_mode = mode_uv[TX_4X4];
- tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
- for (i = 0; i < TX_MODES; ++i)
- tx_cache[i] = tx_cache[ONLY_4X4];
- } else if (ref_frame == INTRA_FRAME) {
+ if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
// Disable intra modes other than DC_PRED for blocks with low variance
// Threshold for intra skipping based on source variance
// TODO(debargha): Specialize the threshold for super block sizes
- static const int skip_intra_var_thresh[BLOCK_SIZES] = {
+ static const unsigned int skip_intra_var_thresh[BLOCK_SIZES] = {
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
};
if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
- this_mode != RD_DC_PRED &&
+ this_mode != DC_PRED &&
x->source_variance < skip_intra_var_thresh[mbmi->sb_type])
continue;
// Only search the oblique modes if the best so far is
// one of the neighboring directional modes
if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
- (this_mode >= RD_D45_PRED && this_mode <= RD_TM_PRED)) {
+ (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)
continue;
}
- mbmi->mode = rd_mode_to_mode(this_mode);
+ mbmi->mode = this_mode;
if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
if (conditional_skipintra(mbmi->mode, best_intra_mode))
continue;
@@ -3488,11 +3452,631 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->uv_mode = mode_uv[uv_tx];
rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
- if (this_mode != RD_DC_PRED && this_mode != RD_TM_PRED)
+ if (this_mode != DC_PRED && this_mode != TM_PRED)
rate2 += intra_cost_penalty;
distortion2 = distortion_y + distortion_uv;
- } else if (this_mode == RD_SPLITMV) {
- const int is_comp_pred = second_ref_frame > 0;
+ } else {
+ mbmi->mode = this_mode;
+ compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
+ this_rd = handle_inter_mode(cpi, x, tile, bsize,
+ tx_cache,
+ &rate2, &distortion2, &skippable,
+ &rate_y, &distortion_y,
+ &rate_uv, &distortion_uv,
+ &mode_excluded, &disable_skip,
+ &tmp_best_filter, frame_mv,
+ mi_row, mi_col,
+ single_newmv, &total_sse, best_rd);
+ if (this_rd == INT64_MAX)
+ continue;
+ }
+
+ if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+ rate2 += compmode_cost;
+ }
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ if (second_ref_frame > INTRA_FRAME) {
+ rate2 += ref_costs_comp[ref_frame];
+ } else {
+ rate2 += ref_costs_single[ref_frame];
+ }
+
+ if (!disable_skip) {
+ // Test for the condition where skip block will be activated
+ // because there are no non zero coefficients and make any
+ // necessary adjustment for rate. Ignore if skip is coded at
+ // segment level as the cost won't have been added in.
+ // Is MB level skip allowed (i.e. not coded at segment level).
+ const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
+ SEG_LVL_SKIP);
+
+ if (skippable) {
+ // Back out the coefficient coding costs
+ rate2 -= (rate_y + rate_uv);
+ // for best yrd calculation
+ rate_uv = 0;
+
+ if (mb_skip_allowed) {
+ int prob_skip_cost;
+
+ // Cost the skip mb case
+ vp9_prob skip_prob =
+ vp9_get_pred_prob_mbskip(cm, xd);
+
+ if (skip_prob) {
+ prob_skip_cost = vp9_cost_bit(skip_prob, 1);
+ rate2 += prob_skip_cost;
+ }
+ }
+ } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+ // Add in the cost of the no skip flag.
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
+ 0);
+ rate2 += prob_skip_cost;
+ } else {
+ // FIXME(rbultje) make this work for splitmv also
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
+ 1);
+ rate2 += prob_skip_cost;
+ distortion2 = total_sse;
+ assert(total_sse >= 0);
+ rate2 -= (rate_y + rate_uv);
+ rate_y = 0;
+ rate_uv = 0;
+ this_skip2 = 1;
+ }
+ } else if (mb_skip_allowed) {
+ // Add in the cost of the no skip flag.
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
+ 0);
+ rate2 += prob_skip_cost;
+ }
+
+ // Calculate the final RD estimate for this mode.
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ }
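
In effect the skip accounting above keeps the coefficient bits only when they pay for themselves. Ignoring the cost of the skip flag itself, the block is coded as skipped roughly when

    RDCOST(rdmult, rddiv, rate_y + rate_uv, distortion2)
        >= RDCOST(rdmult, rddiv, 0, total_sse)

that is, when the bits spent on coefficients no longer buy enough distortion reduction over falling back to the raw prediction error (total_sse) alone.
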
+
+ // Keep record of best intra rd
+ if (!is_inter_block(&xd->mi_8x8[0]->mbmi) &&
+ this_rd < best_intra_rd) {
+ best_intra_rd = this_rd;
+ best_intra_mode = xd->mi_8x8[0]->mbmi.mode;
+ }
+
+ // Keep record of best inter rd with single reference
+ if (is_inter_block(&xd->mi_8x8[0]->mbmi) &&
+ !has_second_ref(&xd->mi_8x8[0]->mbmi) &&
+ !mode_excluded && this_rd < best_inter_rd) {
+ best_inter_rd = this_rd;
+ best_inter_ref_frame = ref_frame;
+ }
+
+ if (!disable_skip && ref_frame == INTRA_FRAME) {
+ for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+ best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
+ }
+
+ // Store the respective mode distortions for later use.
+ if (mode_distortions[this_mode] == -1
+ || distortion2 < mode_distortions[this_mode]) {
+ mode_distortions[this_mode] = distortion2;
+ }
+ if (frame_distortions[ref_frame] == -1
+ || distortion2 < frame_distortions[ref_frame]) {
+ frame_distortions[ref_frame] = distortion2;
+ }
+
+ // Did this mode help, i.e. is it the new best mode?
+ if (this_rd < best_rd || x->skip) {
+ if (!mode_excluded) {
+ // Note index of best mode so far
+ best_mode_index = mode_index;
+
+ if (ref_frame == INTRA_FRAME) {
+ /* required for left and above block mv */
+ mbmi->mv[0].as_int = 0;
+ }
+
+ *returnrate = rate2;
+ *returndistortion = distortion2;
+ best_rd = this_rd;
+ best_mbmode = *mbmi;
+ best_skip2 = this_skip2;
+ vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+
+ // TODO(debargha): enhance this test with a better distortion prediction
+ // based on qp, activity mask and history
+ if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+ (mode_index > MIN_EARLY_TERM_INDEX)) {
+ const int qstep = xd->plane[0].dequant[1];
+ // TODO(debargha): Enhance this by specializing for each mode_index
+ int scale = 4;
+ if (x->source_variance < UINT_MAX) {
+ const int var_adjust = (x->source_variance < 16);
+ scale -= var_adjust;
+ }
+ if (ref_frame > INTRA_FRAME &&
+ distortion2 * scale < qstep * qstep) {
+ early_term = 1;
+ }
+ }
+ }
+ }
+
+ /* keep record of best compound/single-only prediction */
+ if (!disable_skip && ref_frame != INTRA_FRAME) {
+ int single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+ single_rate = rate2 - compmode_cost;
+ hybrid_rate = rate2;
+ } else {
+ single_rate = rate2;
+ hybrid_rate = rate2 + compmode_cost;
+ }
+
+ single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+ hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+ if (second_ref_frame <= INTRA_FRAME &&
+ single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
+ best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
+ } else if (second_ref_frame > INTRA_FRAME &&
+ single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
+ best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
+ }
+ if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
+ best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+ }
+
+ /* keep record of best filter type */
+ if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
+ cm->mcomp_filter_type != BILINEAR) {
+ int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
+ SWITCHABLE_FILTERS : cm->mcomp_filter_type];
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ int64_t adj_rd;
+ // In cases of poor prediction, filter_cache[] can contain really big
+ // values, which actually are bigger than this_rd itself. This can
+ // cause negative best_filter_rd[] values, which is obviously silly.
+ // Therefore, if filter_cache < ref, we do an adjusted calculation.
+ if (cpi->rd_filter_cache[i] >= ref) {
+ adj_rd = this_rd + cpi->rd_filter_cache[i] - ref;
+ } else {
+ // FIXME(rbultje) do this for compound prediction also
+ //
+ // To prevent out-of-range computation in
+ // adj_rd = cpi->rd_filter_cache[i] * this_rd / ref
+ // cpi->rd_filter_cache[i] / ref is converted to a 256 based ratio.
+ int tmp = cpi->rd_filter_cache[i] * 256 / ref;
+ adj_rd = (this_rd * tmp) >> 8;
+ }
+ best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
+ }
+ }
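
The 256-based trick in the comment is worth spelling out: computing filter_cache[i] * this_rd / ref directly can overflow 64 bits when both RD values are large, so the ratio is first reduced to 8 fractional bits (at most 256, since filter_cache < ref in that branch). A sketch with illustrative names:

    #include <stdint.h>

    static int64_t scaled_filter_rd(int64_t this_rd, int64_t cache,
                                    int64_t ref) {
      if (cache >= ref)
        return this_rd + cache - ref;           /* additive adjustment */
      {
        const int64_t tmp = cache * 256 / ref;  /* ratio in Q8, <= 256 */
        return (this_rd * tmp) >> 8;            /* scale this_rd by it */
      }
    }
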
+
+ /* keep record of best txfm size */
+ if (bsize < BLOCK_32X32) {
+ if (bsize < BLOCK_16X16)
+ tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
+
+ tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
+ }
+ if (!mode_excluded && this_rd != INT64_MAX) {
+ for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
+ int64_t adj_rd = INT64_MAX;
+ adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
+
+ if (adj_rd < best_tx_rd[i])
+ best_tx_rd[i] = adj_rd;
+ }
+ }
+
+ if (early_term)
+ break;
+
+ if (x->skip && !comp_pred)
+ break;
+ }
+
+ if (best_rd >= best_rd_so_far)
+ return INT64_MAX;
+
+ // If we used an estimate for the uv intra rd in the loop above...
+ if (cpi->sf.use_uv_intra_rd_estimate) {
+ // Do Intra UV best rd mode selection if best mode choice above was intra.
+ if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
+ TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
+ rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
+ &rate_uv_tokenonly[uv_tx_size],
+ &dist_uv[uv_tx_size],
+ &skip_uv[uv_tx_size],
+ bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
+ }
+ }
+
+ // If we are using reference masking and the set mask flag is set then
+ // create the reference frame mask.
+ if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
+ cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame);
+
+ // Flag all modes that have a distortion that's > 2x the best we found at
+ // this level.
+ for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
+ if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV)
+ continue;
+
+ if (mode_distortions[mode_index] > 2 * *returndistortion) {
+ ctx->modes_with_high_error |= (1 << mode_index);
+ }
+ }
+
+ // Flag all ref frames that have a distortion that's > 2x the best we found at
+ // this level.
+ for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (frame_distortions[ref_frame] > 2 * *returndistortion) {
+ ctx->frames_with_high_error |= (1 << ref_frame);
+ }
+ }
+
+ assert((cm->mcomp_filter_type == SWITCHABLE) ||
+ (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
+ (best_mbmode.ref_frame[0] == INTRA_FRAME));
+
+ // Updating rd_thresh_freq_fact[] here means that the different
+ // partition/block sizes are handled independently based on the best
+ // choice for the current partition. It may well be better to keep a scaled
+ // best rd so far value and update rd_thresh_freq_fact based on the mode/size
+ // combination that wins out.
+ if (cpi->sf.adaptive_rd_thresh) {
+ for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
+ if (mode_index == best_mode_index) {
+ cpi->rd_thresh_freq_fact[bsize][mode_index] -=
+ (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3);
+ } else {
+ cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
+ if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
+ (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) {
+ cpi->rd_thresh_freq_fact[bsize][mode_index] =
+ cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
+ }
+ }
+ }
+ }
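
The adaptation above is a simple pursuit scheme: the winning mode's factor decays geometrically while every other mode's factor grows linearly up to a cap, roughly

    f[best]  <- f[best] - (f[best] >> 3)                        (decay by 1/8)
    f[other] <- MIN(f[other] + RD_THRESH_INC,
                    adaptive_rd_thresh * RD_THRESH_MAX_FACT)    (bounded growth)

so modes that keep losing become progressively harder to enter, and the recent winner becomes progressively easier.
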
+
+ // macroblock modes
+ *mbmi = best_mbmode;
+ x->skip |= best_skip2;
+
+ for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+ if (best_pred_rd[i] == INT64_MAX)
+ best_pred_diff[i] = INT_MIN;
+ else
+ best_pred_diff[i] = best_rd - best_pred_rd[i];
+ }
+
+ if (!x->skip) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ if (best_filter_rd[i] == INT64_MAX)
+ best_filter_diff[i] = 0;
+ else
+ best_filter_diff[i] = best_rd - best_filter_rd[i];
+ }
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
+ } else {
+ vp9_zero(best_filter_diff);
+ }
+
+ if (!x->skip) {
+ for (i = 0; i < TX_MODES; i++) {
+ if (best_tx_rd[i] == INT64_MAX)
+ best_tx_diff[i] = 0;
+ else
+ best_tx_diff[i] = best_rd - best_tx_rd[i];
+ }
+ } else {
+ vp9_zero(best_tx_diff);
+ }
+
+ set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
+ scale_factor);
+ store_coding_context(x, ctx, best_mode_index,
+ &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
+ &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
+ mbmi->ref_frame[1]][0],
+ best_pred_diff, best_tx_diff, best_filter_diff);
+
+ return best_rd;
+}
+
+
+int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
+ int mi_row, int mi_col,
+ int *returnrate,
+ int64_t *returndistortion,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+ const struct segmentation *seg = &cm->seg;
+ const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+ MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+ unsigned char segment_id = mbmi->segment_id;
+ int comp_pred, i;
+ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+ static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+ VP9_ALT_FLAG };
+ int idx_list[4] = {0,
+ cpi->lst_fb_idx,
+ cpi->gld_fb_idx,
+ cpi->alt_fb_idx};
+ int64_t best_rd = best_rd_so_far;
+ int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise
+ int64_t best_tx_rd[TX_MODES];
+ int64_t best_tx_diff[TX_MODES];
+ int64_t best_pred_diff[NB_PREDICTION_TYPES];
+ int64_t best_pred_rd[NB_PREDICTION_TYPES];
+ int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+ MB_MODE_INFO best_mbmode = { 0 };
+ int mode_index, best_mode_index = 0;
+ unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+ vp9_prob comp_mode_p;
+ int64_t best_inter_rd = INT64_MAX;
+ MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
+ INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE;
+ int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
+ int64_t dist_uv[TX_SIZES];
+ int skip_uv[TX_SIZES];
+ MB_PREDICTION_MODE mode_uv[TX_SIZES] = { 0 };
+ struct scale_factors scale_factor[4];
+ unsigned int ref_frame_mask = 0;
+ unsigned int mode_mask = 0;
+ int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
+ cpi->common.y_dc_delta_q);
+ int_mv seg_mvs[4][MAX_REF_FRAMES];
+ b_mode_info best_bmodes[4];
+ int best_skip2 = 0;
+
+ x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
+ vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
+
+ for (i = 0; i < 4; i++) {
+ int j;
+ for (j = 0; j < MAX_REF_FRAMES; j++)
+ seg_mvs[i][j].as_int = INVALID_MV;
+ }
+
+ estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
+ &comp_mode_p);
+
+ for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+ best_pred_rd[i] = INT64_MAX;
+ for (i = 0; i < TX_MODES; i++)
+ best_tx_rd[i] = INT64_MAX;
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ best_filter_rd[i] = INT64_MAX;
+ for (i = 0; i < TX_SIZES; i++)
+ rate_uv_intra[i] = INT_MAX;
+
+ *returnrate = INT_MAX;
+
+ // Create a mask set to 1 for each reference frame used by a smaller
+ // resolution.
+ if (cpi->sf.use_avoid_tested_higherror) {
+ ref_frame_mask = 0;
+ mode_mask = 0;
+ ref_frame_mask = ~ref_frame_mask;
+ mode_mask = ~mode_mask;
+ }
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+ setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame,
+ block_size, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV],
+ yv12_mb, scale_factor);
+ }
+ frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+ frame_mv[ZEROMV][ref_frame].as_int = 0;
+ }
+
+ for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
+ int mode_excluded = 0;
+ int64_t this_rd = INT64_MAX;
+ int disable_skip = 0;
+ int compmode_cost = 0;
+ int rate2 = 0, rate_y = 0, rate_uv = 0;
+ int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+ int skippable = 0;
+ int64_t tx_cache[TX_MODES];
+ int i;
+ int this_skip2 = 0;
+ int64_t total_sse = INT_MAX;
+ int early_term = 0;
+
+ for (i = 0; i < TX_MODES; ++i)
+ tx_cache[i] = INT64_MAX;
+
+ x->skip = 0;
+ ref_frame = vp9_ref_order[mode_index].ref_frame;
+ second_ref_frame = vp9_ref_order[mode_index].second_ref_frame;
+
+ // Look at the reference frame of the best mode so far and set the
+ // skip mask to look at a subset of the remaining modes.
+ if (mode_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) {
+ if (mode_index == 3) {
+ switch (vp9_ref_order[best_mode_index].ref_frame) {
+ case INTRA_FRAME:
+ cpi->mode_skip_mask = 0;
+ break;
+ case LAST_FRAME:
+ cpi->mode_skip_mask = 0x0010;
+ break;
+ case GOLDEN_FRAME:
+ cpi->mode_skip_mask = 0x0008;
+ break;
+ case ALTREF_FRAME:
+ cpi->mode_skip_mask = 0x0000;
+ break;
+ case NONE:
+ case MAX_REF_FRAMES:
+ assert(!"Invalid Reference frame");
+ }
+ }
+ if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
+ continue;
+ }
+
+ // Skip if the current reference frame has been masked off
+ if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
+ (cpi->ref_frame_mask & (1 << ref_frame)))
+ continue;
+
+ // Test best rd so far against threshold for trying this mode.
+ if ((best_rd <
+ ((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] *
+ cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 5)) ||
+ cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] == INT_MAX)
+ continue;
+
+ // Do not allow compound prediction if the segment level reference
+ // frame feature is in use as in this case there can only be one reference.
+ if ((second_ref_frame > INTRA_FRAME) &&
+ vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+ continue;
+
+ mbmi->ref_frame[0] = ref_frame;
+ mbmi->ref_frame[1] = second_ref_frame;
+
+ if (!(ref_frame == INTRA_FRAME
+ || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
+ continue;
+ }
+ if (!(second_ref_frame == NONE
+ || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) {
+ continue;
+ }
+
+ comp_pred = second_ref_frame > INTRA_FRAME;
+ if (comp_pred) {
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
+ if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME)
+ continue;
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
+ if (ref_frame != best_inter_ref_frame &&
+ second_ref_frame != best_inter_ref_frame)
+ continue;
+ }
+
+ // TODO(jingning, jkoleszar): scaling reference frame not supported for
+ // sub8x8 blocks.
+ if (ref_frame > 0 &&
+ vp9_is_scaled(scale_factor[ref_frame].sfc))
+ continue;
+
+ if (second_ref_frame > 0 &&
+ vp9_is_scaled(scale_factor[second_ref_frame].sfc))
+ continue;
+
+ set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
+ mbmi->uv_mode = DC_PRED;
+
+ // Evaluate all sub-pel filters irrespective of whether we can use
+ // them for this frame.
+ mbmi->interp_filter = cm->mcomp_filter_type;
+ vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+ if (comp_pred) {
+ if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
+ continue;
+ set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
+
+ mode_excluded = mode_excluded
+ ? mode_excluded
+ : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+ } else {
+ if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
+ mode_excluded =
+ mode_excluded ?
+ mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+ }
+ }
+
+ // Select prediction reference frames.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred)
+ xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+ // If the segment reference frame feature is enabled....
+ // then do nothing if the current ref frame is not allowed..
+ if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
+ (int)ref_frame) {
+ continue;
+ // If the segment skip feature is enabled....
+ // then do nothing if the current mode is not allowed..
+ } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
+ ref_frame != INTRA_FRAME) {
+ continue;
+ // Disable this drop out case if the ref frame
+ // segment level feature is enabled for this segment. This is to
+ // prevent the possibility that we end up unable to pick any mode.
+ } else if (!vp9_segfeature_active(seg, segment_id,
+ SEG_LVL_REF_FRAME)) {
+ // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative. We allow near/nearest as well
+ // because they may result in zero-zero MVs but be cheaper.
+ if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+ continue;
+ }
+
+#ifdef MODE_TEST_HIT_STATS
+ // TEST/DEBUG CODE
+ // Keep a record of the number of test hits at each size
+ cpi->mode_test_hits[bsize]++;
+#endif
+
+ if (ref_frame == INTRA_FRAME) {
+ int rate;
+ mbmi->tx_size = TX_4X4;
+ if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
+ &distortion_y, best_rd) >= best_rd)
+ continue;
+ rate2 += rate;
+ rate2 += intra_cost_penalty;
+ distortion2 += distortion_y;
+
+ if (rate_uv_intra[TX_4X4] == INT_MAX) {
+ choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4],
+ &rate_uv_tokenonly[TX_4X4],
+ &dist_uv[TX_4X4], &skip_uv[TX_4X4],
+ &mode_uv[TX_4X4]);
+ }
+ rate2 += rate_uv_intra[TX_4X4];
+ rate_uv = rate_uv_tokenonly[TX_4X4];
+ distortion2 += dist_uv[TX_4X4];
+ distortion_uv = dist_uv[TX_4X4];
+ mbmi->uv_mode = mode_uv[TX_4X4];
+ tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ for (i = 0; i < TX_MODES; ++i)
+ tx_cache[i] = tx_cache[ONLY_4X4];
+ } else {
int rate;
int64_t distortion;
int64_t this_rd_thresh;
@@ -3501,30 +4085,20 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
int tmp_best_skippable = 0;
int switchable_filter_index;
- int_mv *second_ref = is_comp_pred ?
- &mbmi->ref_mvs[second_ref_frame][0] : NULL;
- union b_mode_info tmp_best_bmodes[16];
+ int_mv *second_ref = comp_pred ?
+ &mbmi->ref_mvs[second_ref_frame][0] : NULL;
+ b_mode_info tmp_best_bmodes[16];
MB_MODE_INFO tmp_best_mbmode;
- PARTITION_INFO tmp_best_partition;
BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
int pred_exists = 0;
int uv_skippable;
- if (is_comp_pred) {
- if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
- if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME)
- continue;
- if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
- if (ref_frame != best_inter_ref_frame &&
- second_ref_frame != best_inter_ref_frame)
- continue;
- }
this_rd_thresh = (ref_frame == LAST_FRAME) ?
- cpi->rd_threshes[bsize][THR_NEWMV] :
- cpi->rd_threshes[bsize][THR_NEWA];
+ cpi->rd_thresh_sub8x8[segment_id][bsize][THR_LAST] :
+ cpi->rd_thresh_sub8x8[segment_id][bsize][THR_ALTR];
this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
- cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh;
- xd->this_mi->mbmi.tx_size = TX_4X4;
+ cpi->rd_thresh_sub8x8[segment_id][bsize][THR_GOLD] : this_rd_thresh;
+ xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX;
if (cm->mcomp_filter_type != BILINEAR) {
@@ -3542,7 +4116,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->interp_filter = switchable_filter_index;
vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
- tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+ tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
&mbmi->ref_mvs[ref_frame][0],
second_ref,
best_yrd,
@@ -3578,9 +4152,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
tmp_best_sse = total_sse;
tmp_best_skippable = skippable;
tmp_best_mbmode = *mbmi;
- tmp_best_partition = *x->partition_info;
- for (i = 0; i < 4; i++)
- tmp_best_bmodes[i] = xd->this_mi->bmi[i];
+ for (i = 0; i < 4; i++) {
+ tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
+ x->zcoeff_blk[TX_4X4][i] = !xd->plane[0].eobs[i];
+ }
pred_exists = 1;
if (switchable_filter_index == 0 &&
cpi->sf.use_rd_breakout &&
@@ -3607,7 +4182,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (!pred_exists) {
// Handles the special case when a filter that is not in the
// switchable list (bilinear, 6-tap) is indicated at the frame level
- tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+ tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
&mbmi->ref_mvs[ref_frame][0],
second_ref,
best_yrd,
@@ -3630,9 +4205,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
distortion = tmp_best_distortion;
skippable = tmp_best_skippable;
*mbmi = tmp_best_mbmode;
- *x->partition_info = tmp_best_partition;
for (i = 0; i < 4; i++)
- xd->this_mi->bmi[i] = tmp_best_bmodes[i];
+ xd->mi_8x8[0]->bmi[i] = tmp_best_bmodes[i];
}
rate2 += rate;
@@ -3642,12 +4216,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
rate2 += get_switchable_rate(x);
if (!mode_excluded) {
- if (is_comp_pred)
+ if (comp_pred)
mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
else
mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
}
- compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred);
+ compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
tmp_best_rdu = best_rd -
MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
@@ -3658,7 +4232,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// then don't bother looking at UV
vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
BLOCK_8X8);
- super_block_uvrd(cm, x, &rate_uv, &distortion_uv, &uv_skippable,
+ super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
&uv_sse, BLOCK_8X8, tmp_best_rdu);
if (rate_uv == INT_MAX)
continue;
@@ -3671,20 +4245,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < TX_MODES; ++i)
tx_cache[i] = tx_cache[ONLY_4X4];
}
- } else {
- mbmi->mode = rd_mode_to_mode(this_mode);
- compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
- this_rd = handle_inter_mode(cpi, x, bsize,
- tx_cache,
- &rate2, &distortion2, &skippable,
- &rate_y, &distortion_y,
- &rate_uv, &distortion_uv,
- &mode_excluded, &disable_skip,
- &tmp_best_filter, frame_mv,
- mi_row, mi_col,
- single_newmv, &total_sse, best_rd);
- if (this_rd == INT64_MAX)
- continue;
}
if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
@@ -3708,25 +4268,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
SEG_LVL_SKIP);
- if (skippable && bsize >= BLOCK_8X8) {
- // Back out the coefficient coding costs
- rate2 -= (rate_y + rate_uv);
- // for best yrd calculation
- rate_uv = 0;
-
- if (mb_skip_allowed) {
- int prob_skip_cost;
-
- // Cost the skip mb case
- vp9_prob skip_prob =
- vp9_get_pred_prob_mbskip(cm, xd);
-
- if (skip_prob) {
- prob_skip_cost = vp9_cost_bit(skip_prob, 1);
- rate2 += prob_skip_cost;
- }
- }
- } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
+ if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
// Add in the cost of the no skip flag.
@@ -3756,42 +4298,22 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
}
- // Keep record of best intra rd
- if (xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
- is_intra_mode(xd->this_mi->mbmi.mode) &&
- this_rd < best_intra_rd) {
- best_intra_rd = this_rd;
- best_intra_mode = xd->this_mi->mbmi.mode;
- }
// Keep record of best inter rd with single reference
- if (xd->this_mi->mbmi.ref_frame[0] > INTRA_FRAME &&
- xd->this_mi->mbmi.ref_frame[1] == NONE &&
+ if (xd->mi_8x8[0]->mbmi.ref_frame[0] > INTRA_FRAME &&
+ xd->mi_8x8[0]->mbmi.ref_frame[1] == NONE &&
!mode_excluded &&
this_rd < best_inter_rd) {
best_inter_rd = this_rd;
best_inter_ref_frame = ref_frame;
- // best_inter_mode = xd->this_mi->mbmi.mode;
}
if (!disable_skip && ref_frame == INTRA_FRAME) {
for (i = 0; i < NB_PREDICTION_TYPES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
}
- if (this_mode != RD_I4X4_PRED && this_mode != RD_SPLITMV) {
- // Store the respective mode distortions for later use.
- if (mode_distortions[this_mode] == -1
- || distortion2 < mode_distortions[this_mode]) {
- mode_distortions[this_mode] = distortion2;
- }
- if (frame_distortions[ref_frame] == -1
- || distortion2 < frame_distortions[ref_frame]) {
- frame_distortions[ref_frame] = distortion2;
- }
- }
-
// Did this mode help.. i.e. is it the new best mode
if (this_rd < best_rd || x->skip) {
if (!mode_excluded) {
@@ -3810,15 +4332,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
best_mbmode = *mbmi;
best_skip2 = this_skip2;
- best_partition = *x->partition_info;
+ vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
- if (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV)
- for (i = 0; i < 4; i++)
- best_bmodes[i] = xd->this_mi->bmi[i];
+ for (i = 0; i < 4; i++)
+ best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
// TODO(debargha): enhance this test with a better distortion prediction
// based on qp, activity mask and history
- if (cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) {
+ if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+ (mode_index > MIN_EARLY_TERM_INDEX)) {
const int qstep = xd->plane[0].dequant[1];
// TODO(debargha): Enhance this by specializing for each mode_index
int scale = 4;
@@ -3865,7 +4388,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
cm->mcomp_filter_type != BILINEAR) {
int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
SWITCHABLE_FILTERS : cm->mcomp_filter_type];
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
int64_t adj_rd;
// In cases of poor prediction, filter_cache[] can contain really big
// values, which actually are bigger than this_rd itself. This can
@@ -3882,8 +4405,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
/* keep record of best txfm size */
if (bsize < BLOCK_32X32) {
if (bsize < BLOCK_16X16) {
- if (this_mode == RD_SPLITMV || this_mode == RD_I4X4_PRED)
- tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
+ tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
}
tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
@@ -3891,11 +4413,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (!mode_excluded && this_rd != INT64_MAX) {
for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
int64_t adj_rd = INT64_MAX;
- if (this_mode != RD_I4X4_PRED) {
+ if (ref_frame > INTRA_FRAME)
adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
- } else {
+ else
adj_rd = this_rd;
- }
if (adj_rd < best_tx_rd[i])
best_tx_rd[i] = adj_rd;
@@ -3915,39 +4436,20 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// If we used an estimate for the uv intra rd in the loop above...
if (cpi->sf.use_uv_intra_rd_estimate) {
// Do Intra UV best rd mode selection if best mode choice above was intra.
- if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
+ if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
&rate_uv_tokenonly[uv_tx_size],
&dist_uv[uv_tx_size],
&skip_uv[uv_tx_size],
- bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
+ BLOCK_8X8);
}
}
// If we are using reference masking and the set mask flag is set then
// create the reference frame mask.
if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
- cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame);
-
- // Flag all modes that have a distortion thats > 2x the best we found at
- // this level.
- for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
- if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV)
- continue;
-
- if (mode_distortions[mode_index] > 2 * *returndistortion) {
- ctx->modes_with_high_error |= (1 << mode_index);
- }
- }
-
- // Flag all ref frames that have a distortion thats > 2x the best we found at
- // this level.
- for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
- if (frame_distortions[ref_frame] > 2 * *returndistortion) {
- ctx->frames_with_high_error |= (1 << ref_frame);
- }
- }
+ cpi->ref_frame_mask = ~(1 << vp9_ref_order[best_mode_index].ref_frame);
if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
*returnrate = INT_MAX;
@@ -3965,16 +4467,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// best rd so far value and update rd_thresh_freq_fact based on the mode/size
// combination that wins out.
if (cpi->sf.adaptive_rd_thresh) {
- for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
+ for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
if (mode_index == best_mode_index) {
- cpi->rd_thresh_freq_fact[bsize][mode_index] -=
- (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3);
+ cpi->rd_thresh_freq_sub8x8[bsize][mode_index] -=
+ (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 3);
} else {
- cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
- if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
- (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) {
- cpi->rd_thresh_freq_fact[bsize][mode_index] =
- cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT;
+ cpi->rd_thresh_freq_sub8x8[bsize][mode_index] += RD_THRESH_INC;
+ if (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >
+ (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) {
+ cpi->rd_thresh_freq_sub8x8[bsize][mode_index] =
+ cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
}
}
}
@@ -3983,27 +4485,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// macroblock modes
*mbmi = best_mbmode;
x->skip |= best_skip2;
- if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
- best_mbmode.sb_type < BLOCK_8X8) {
+ if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
for (i = 0; i < 4; i++)
- xd->this_mi->bmi[i].as_mode = best_bmodes[i].as_mode;
- }
-
- if (best_mbmode.ref_frame[0] != INTRA_FRAME &&
- best_mbmode.sb_type < BLOCK_8X8) {
- for (i = 0; i < 4; i++)
- xd->this_mi->bmi[i].as_mv[0].as_int =
- best_bmodes[i].as_mv[0].as_int;
-
- if (mbmi->ref_frame[1] > 0)
- for (i = 0; i < 4; i++)
- xd->this_mi->bmi[i].as_mv[1].as_int =
- best_bmodes[i].as_mv[1].as_int;
-
- *x->partition_info = best_partition;
+ xd->mi_8x8[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
+ } else {
+ for (i = 0; i < 4; ++i)
+ vpx_memcpy(&xd->mi_8x8[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
- mbmi->mv[0].as_int = xd->this_mi->bmi[3].as_mv[0].as_int;
- mbmi->mv[1].as_int = xd->this_mi->bmi[3].as_mv[1].as_int;
+ mbmi->mv[0].as_int = xd->mi_8x8[0]->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = xd->mi_8x8[0]->bmi[3].as_mv[1].as_int;
}
for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
@@ -4014,7 +4504,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
if (!x->skip) {
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
if (best_filter_rd[i] == INT64_MAX)
best_filter_diff[i] = 0;
else
@@ -4023,7 +4513,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (cm->mcomp_filter_type == SWITCHABLE)
assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
} else {
- vpx_memset(best_filter_diff, 0, sizeof(best_filter_diff));
+ vp9_zero(best_filter_diff);
}
if (!x->skip) {
@@ -4034,13 +4524,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_tx_diff[i] = best_rd - best_tx_rd[i];
}
} else {
- vpx_memset(best_tx_diff, 0, sizeof(best_tx_diff));
+ vp9_zero(best_tx_diff);
}
set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
scale_factor);
store_coding_context(x, ctx, best_mode_index,
- &best_partition,
&mbmi->ref_mvs[mbmi->ref_frame[0]][0],
&mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
mbmi->ref_frame[1]][0],
diff --git a/libvpx/vp9/encoder/vp9_rdopt.h b/libvpx/vp9/encoder/vp9_rdopt.h
index eba7df9..92fb235 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.h
+++ b/libvpx/vp9/encoder/vp9_rdopt.h
@@ -12,10 +12,17 @@
#ifndef VP9_ENCODER_VP9_RDOPT_H_
#define VP9_ENCODER_VP9_RDOPT_H_
-#define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
+#define RDDIV_BITS 7
+
+#define RDCOST(RM, DM, R, D) \
+ (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM))
#define QIDX_SKIP_THRESH 115
-void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
+struct TileInfo;
+
+int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex);
+
+void vp9_initialize_rd_consts(VP9_COMP *cpi);
void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
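
One reading of the new RDCOST macro (hedged, since RDDIV_BITS is defined here but the shift still arrives through the DM argument): it is the usual Lagrangian cost

    RDCOST(RM, DM, R, D) = ((R * RM + 128) >> 8) + (D << DM)
                         ~ lambda * R + D * 2^DM,  with lambda = RM / 256

so the rate term is weighted by rdmult in Q8 with rounding, and the distortion term is now pre-scaled by a power-of-two shift instead of the old DM * D multiply.
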
@@ -24,13 +31,31 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
PICK_MODE_CONTEXT *ctx, int64_t best_rd);
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+ const struct TileInfo *const tile,
int mi_row, int mi_col,
- int *r, int64_t *d, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+ int *returnrate,
+ int64_t *returndistortion,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
+
+int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+ const struct TileInfo *const tile,
+ int mi_row, int mi_col,
+ int *returnrate,
+ int64_t *returndistortion,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
void vp9_init_me_luts();
void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
MB_PREDICTION_MODE mb, int_mv *mv);
+void vp9_get_entropy_contexts(TX_SIZE tx_size,
+ ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16],
+ const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left,
+ int num_4x4_w, int num_4x4_h);
+
#endif // VP9_ENCODER_VP9_RDOPT_H_
diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c
index 10655e8..24f011f 100644
--- a/libvpx/vp9/encoder/vp9_segmentation.c
+++ b/libvpx/vp9/encoder/vp9_segmentation.c
@@ -117,7 +117,8 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) {
return cost;
}
-static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+static void count_segs(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
@@ -129,9 +130,10 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8,
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- segment_id = mi_8x8[0]->mbmi.segment_id;
+ xd->mi_8x8 = mi_8x8;
+ segment_id = xd->mi_8x8[0]->mbmi.segment_id;
- set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
// Count the number of hits on each segment with no prediction
no_pred_segcounts[segment_id]++;
@@ -156,7 +158,8 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8,
}
}
-static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
@@ -174,19 +177,20 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8,
bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type];
if (bw == bs && bh == bs) {
- count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts, bs, bs, mi_row, mi_col);
} else if (bw == bs && bh < bs) {
- count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
- count_segs(cpi, mi_8x8 + hbs * mis, no_pred_segcounts,
+ count_segs(cpi, tile, mi_8x8 + hbs * mis, no_pred_segcounts,
temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
mi_row + hbs, mi_col);
} else if (bw < bs && bh == bs) {
- count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
- count_segs(cpi, mi_8x8 + hbs, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs);
+ count_segs(cpi, tile, mi_8x8 + hbs,
+ no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts,
+ hbs, bs, mi_row, mi_col + hbs);
} else {
const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
int n;
@@ -197,7 +201,7 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8,
const int mi_dc = hbs * (n & 1);
const int mi_dr = hbs * (n >> 1);
- count_segs_sb(cpi, &mi_8x8[mi_dr * mis + mi_dc],
+ count_segs_sb(cpi, tile, &mi_8x8[mi_dr * mis + mi_dc],
no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts,
mi_row + mi_dr, mi_col + mi_dc, subsize);
@@ -233,15 +237,18 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
// First of all generate stats regarding how well the last segment map
// predicts this one
for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) {
- vp9_get_tile_col_offsets(cm, tile_col);
- mi_ptr = cm->mi_grid_visible + cm->cur_tile_mi_col_start;
+ TileInfo tile;
+
+ vp9_tile_init(&tile, cm, 0, tile_col);
+ mi_ptr = cm->mi_grid_visible + tile.mi_col_start;
for (mi_row = 0; mi_row < cm->mi_rows;
mi_row += 8, mi_ptr += 8 * mis) {
mi = mi_ptr;
- for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += 8, mi += 8)
- count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, mi_row, mi_col, BLOCK_64X64);
+ count_segs_sb(cpi, &tile, mi, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts,
+ mi_row, mi_col, BLOCK_64X64);
}
}
@@ -251,13 +258,13 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree);
// Key frames cannot use temporal prediction
- if (cm->frame_type != KEY_FRAME) {
+ if (!frame_is_intra_only(cm)) {
// Work out probability tree for coding those segments not
// predicted using the temporal method and the cost.
calc_segtree_probs(t_unpred_seg_counts, t_pred_tree);
t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree);
- // Add in the cost of the signalling for each prediction context
+ // Add in the cost of the signaling for each prediction context.
for (i = 0; i < PREDICTION_PROBS; i++) {
const int count0 = temporal_predictor_count[i][0];
const int count1 = temporal_predictor_count[i][1];
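
The tile state that used to live in VP9_COMMON (cur_tile_mi_col_start/end) is now a local TileInfo threaded through count_segs()/count_segs_sb(). A condensed sketch of the resulting traversal, with the per-superblock body elided:

TileInfo tile;
int tile_col, mi_row, mi_col;
for (tile_col = 0; tile_col < (1 << cm->log2_tile_cols); ++tile_col) {
  vp9_tile_init(&tile, cm, 0, tile_col);  // tile row 0, this column
  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8)
    for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += 8)
      ;  // count_segs_sb(cpi, &tile, ..., mi_row, mi_col, BLOCK_64X64)
}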
diff --git a/libvpx/vp9/encoder/vp9_ssim.c b/libvpx/vp9/encoder/vp9_ssim.c
index c155516..a5f18e6 100644
--- a/libvpx/vp9/encoder/vp9_ssim.c
+++ b/libvpx/vp9/encoder/vp9_ssim.c
@@ -42,8 +42,8 @@ void vp9_ssim_parms_8x8_c(uint8_t *s, int sp, uint8_t *r, int rp,
}
}
-const static int64_t cc1 = 26634; // (64^2*(.01*255)^2
-const static int64_t cc2 = 239708; // (64^2*(.03*255)^2
+static const int64_t cc1 = 26634;  // (64^2*(.01*255)^2)
+static const int64_t cc2 = 239708;  // (64^2*(.03*255)^2)
static double similarity(unsigned long sum_s, unsigned long sum_r,
unsigned long sum_sq_s, unsigned long sum_sq_r,
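
Checking the constants against the comments: 64^2 = 4096, (.01*255)^2 = 6.5025, and 4096 * 6.5025 = 26634.24, which truncates to cc1 = 26634; likewise 4096 * (.03*255)^2 = 4096 * 58.5225 = 239708.16, truncating to cc2 = 239708.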
diff --git a/libvpx/vp9/encoder/vp9_subexp.c b/libvpx/vp9/encoder/vp9_subexp.c
index 667b801..387fc90 100644
--- a/libvpx/vp9/encoder/vp9_subexp.c
+++ b/libvpx/vp9/encoder/vp9_subexp.c
@@ -221,7 +221,8 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
}
void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
- vp9_prob upd, unsigned int *ct) {
+ const unsigned int ct[2]) {
+ const vp9_prob upd = DIFF_UPDATE_PROB;
vp9_prob newp = get_binary_prob(ct[0], ct[1]);
const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp,
upd);
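
With upd folded in as DIFF_UPDATE_PROB, the rest of the function decides between a differential update and keeping the old probability. A sketch of that decision, reconstructed from the signature and the vp9_write_prob_diff_update() declaration below (not verbatim from the patch):

if (savings > 0) {
  vp9_write(w, 1, upd);                        // signal "update follows"
  vp9_write_prob_diff_update(w, newp, *oldp);
  *oldp = newp;
} else {
  vp9_write(w, 0, upd);                        // keep the old probability
}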
diff --git a/libvpx/vp9/encoder/vp9_subexp.h b/libvpx/vp9/encoder/vp9_subexp.h
index 7acdaf6..521c777 100644
--- a/libvpx/vp9/encoder/vp9_subexp.h
+++ b/libvpx/vp9/encoder/vp9_subexp.h
@@ -19,7 +19,7 @@ void vp9_write_prob_diff_update(vp9_writer *w,
vp9_prob newp, vp9_prob oldp);
void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
- vp9_prob upd, unsigned int *ct);
+ unsigned int *ct);
int vp9_prob_diff_update_savings_search(const unsigned int *ct,
vp9_prob oldp, vp9_prob *bestp,
diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c
index 63826ee..2cace03 100644
--- a/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -29,7 +29,7 @@
#include "vpx_ports/vpx_timer.h"
#define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering
-#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
+#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
uint8_t *y_mb_ptr,
@@ -38,14 +38,15 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
int stride,
int mv_row,
int mv_col,
- uint8_t *pred) {
+ uint8_t *pred,
+ struct scale_factors *scale) {
const int which_mv = 0;
MV mv = { mv_row, mv_col };
vp9_build_inter_predictor(y_mb_ptr, stride,
&pred[0], 16,
&mv,
- &xd->scale_factor[which_mv],
+ scale,
16, 16,
which_mv,
&xd->subpix, MV_PRECISION_Q3);
@@ -55,7 +56,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
vp9_build_inter_predictor(u_mb_ptr, stride,
&pred[256], 8,
&mv,
- &xd->scale_factor[which_mv],
+ scale,
8, 8,
which_mv,
&xd->subpix, MV_PRECISION_Q4);
@@ -63,7 +64,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
vp9_build_inter_predictor(v_mb_ptr, stride,
&pred[320], 8,
&mv,
- &xd->scale_factor[which_mv],
+ scale,
8, 8,
which_mv,
&xd->subpix, MV_PRECISION_Q4);
@@ -83,7 +84,6 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
for (i = 0, k = 0; i < block_size; i++) {
for (j = 0; j < block_size; j++, k++) {
-
int src_byte = frame1[byte];
int pixel_value = *frame2++;
@@ -151,13 +151,12 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2));
/*cpi->sf.search_method == HEX*/
- // TODO Check that the 16x16 vf & sdf are selected here
// Ignore mv costing by sending NULL pointer instead of cost arrays
ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0];
- bestsme = vp9_hex_search(x, &best_ref_mv1_full,
+ bestsme = vp9_hex_search(x, &best_ref_mv1_full.as_mv,
step_param, sadpb, 1,
&cpi->fn_ptr[BLOCK_16X16],
- 0, &best_ref_mv1, ref_mv);
+ 0, &best_ref_mv1.as_mv, &ref_mv->as_mv);
#if ALT_REF_SUBPEL_ENABLED
// Try sub-pixel MC?
@@ -166,8 +165,9 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
int distortion;
unsigned int sse;
// Ignore mv costing by sending NULL pointer instead of cost array
- bestsme = cpi->find_fractional_mv_step(x, ref_mv,
- &best_ref_mv1,
+ bestsme = cpi->find_fractional_mv_step(x, &ref_mv->as_mv,
+ &best_ref_mv1.as_mv,
+ cpi->common.allow_high_precision_mv,
x->errorperbit,
&cpi->fn_ptr[BLOCK_16X16],
0, cpi->sf.subpel_iters_per_step,
@@ -187,7 +187,8 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
static void temporal_filter_iterate_c(VP9_COMP *cpi,
int frame_count,
int alt_ref_index,
- int strength) {
+ int strength,
+ struct scale_factors *scale) {
int byte;
int frame;
int mb_col, mb_row;
@@ -281,7 +282,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
cpi->frames[frame]->y_stride,
mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row,
mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col,
- predictor);
+ predictor, scale);
// Apply the filter (YUV)
vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
@@ -375,6 +376,9 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
- (num_frames_backward + 1);
+ struct scale_factors scale;
+ struct scale_factors_common scale_comm;
+
switch (blur_type) {
case 1:
// Backward Blur
@@ -424,26 +428,22 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
#ifdef DEBUGFWG
// DEBUG FWG
- printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d"
-, max_frames
-, num_frames_backward
-, num_frames_forward
-, frames_to_blur
-, frames_to_blur_backward
-, frames_to_blur_forward
-, cpi->source_encode_index
-, cpi->last_alt_ref_sei
-, start_frame);
+ printf(
+ "max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d "
+ "start:%d",
+ max_frames, num_frames_backward, num_frames_forward, frames_to_blur,
+ frames_to_blur_backward, frames_to_blur_forward, cpi->source_encode_index,
+ cpi->last_alt_ref_sei, start_frame);
#endif
// Set up scaling factors. Scaling on each of the arnr frames is not supported.
- vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0],
- cm->yv12_fb[cm->new_fb_idx].y_crop_width,
- cm->yv12_fb[cm->new_fb_idx].y_crop_height,
+ vp9_setup_scale_factors_for_frame(&scale, &scale_comm,
+ get_frame_new_buffer(cm)->y_crop_width,
+ get_frame_new_buffer(cm)->y_crop_height,
cm->width, cm->height);
// Set up frame pointers; NULL indicates a frame not included in the filter.
- vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
+ vp9_zero(cpi->frames);
for (frame = 0; frame < frames_to_blur; frame++) {
int which_buffer = start_frame - frame;
struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
@@ -452,7 +452,7 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
}
temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
- strength);
+ strength, &scale);
}
void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
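
The net effect of this plumbing: the ARNR path no longer borrows xd->scale_factor[0] but builds its own scale factors on the stack and hands them down the call chain. Condensed (names from the diff; src_crop_width/height stand in for the get_frame_new_buffer() crop dimensions):

struct scale_factors scale;
struct scale_factors_common scale_comm;
vp9_setup_scale_factors_for_frame(&scale, &scale_comm,
                                  src_crop_width, src_crop_height,
                                  cm->width, cm->height);
temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
                          strength, &scale);
// ...which forwards &scale to temporal_filter_predictors_mb_c() in place
// of the old xd->scale_factor[which_mv].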
diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c
index 0c9bf9d..7d4676e 100644
--- a/libvpx/vp9/encoder/vp9_tokenize.c
+++ b/libvpx/vp9/encoder/vp9_tokenize.c
@@ -21,24 +21,12 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_entropy.h"
-/* Global event counters used for accumulating statistics across several
- compressions, then generating vp9_context.c = initial stats. */
-
-#ifdef ENTROPY_STATS
-vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES];
-extern vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES];
-#endif /* ENTROPY_STATS */
-
-DECLARE_ALIGNED(16, extern const uint8_t,
- vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-
static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
const TOKENVALUE *vp9_dct_value_tokens_ptr;
static int dct_value_cost[DCT_MAX_VALUE * 2];
const int *vp9_dct_value_cost_ptr;
static void fill_value_tokens() {
-
TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
const vp9_extra_bit *const e = vp9_extra_bits;
@@ -60,9 +48,9 @@ static void fill_value_tokens() {
t[i].token = --j;
eb |= (a - e[j].base_val) << 1;
- } else
+ } else {
t[i].token = a;
-
+ }
t[i].extra = eb;
}
@@ -81,9 +69,7 @@ static void fill_value_tokens() {
cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
dct_value_cost[i + DCT_MAX_VALUE] = cost;
}
-
}
-
} while (++i < DCT_MAX_VALUE);
vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
@@ -95,6 +81,7 @@ struct tokenize_b_args {
MACROBLOCKD *xd;
TOKENEXTRA **tp;
TX_SIZE tx_size;
+ uint8_t *token_cache;
};
static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -113,8 +100,9 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
VP9_COMP *cpi = args->cpi;
MACROBLOCKD *xd = args->xd;
TOKENEXTRA **tp = args->tp;
+ uint8_t *token_cache = args->token_cache;
struct macroblockd_plane *pd = &xd->plane[plane];
- MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
int pt; /* near block/prev token context index */
int c = 0, rc = 0;
TOKENEXTRA *t = *tp; /* store tokens starting here */
@@ -127,21 +115,16 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
const int ref = is_inter_block(mbmi);
- uint8_t token_cache[1024];
- const uint8_t *band_translate;
- ENTROPY_CONTEXT *A, *L;
+ const uint8_t *const band_translate = get_band_translate(tx_size);
const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
int aoff, loff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
- A = pd->above_context + aoff;
- L = pd->left_context + loff;
-
assert((!type && !plane) || (type && plane));
- pt = get_entropy_context(xd, tx_size, type, block, A, L,
- &scan, &band_translate);
- nb = vp9_get_coef_neighbors_handle(scan);
+ pt = get_entropy_context(tx_size, pd->above_context + aoff,
+ pd->left_context + loff);
+ get_scan(xd, tx_size, type, block, &scan, &nb);
c = 0;
do {
const int band = get_coef_band(band_translate, c);
@@ -210,12 +193,12 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
TOKENEXTRA *t_backup = *t;
const int mb_skip_context = vp9_get_pred_context_mbskip(xd);
const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_SKIP);
- struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size};
+ struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size, cpi->mb.token_cache};
mbmi->skip_coeff = vp9_sb_is_skippable(xd, bsize);
if (mbmi->skip_coeff) {
@@ -236,149 +219,6 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
}
}
-#ifdef ENTROPY_STATS
-void init_context_counters(void) {
- FILE *f = fopen("context.bin", "rb");
- if (!f) {
- vp9_zero(context_counters);
- } else {
- fread(context_counters, sizeof(context_counters), 1, f);
- fclose(f);
- }
-
- f = fopen("treeupdate.bin", "rb");
- if (!f) {
- vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
- } else {
- fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
- fclose(f);
- }
-}
-
-static void print_counter(FILE *f, vp9_coeff_accum *context_counters,
- int block_types, const char *header) {
- int type, ref, band, pt, t;
-
- fprintf(f, "static const vp9_coeff_count %s = {\n", header);
-
-#define Comma(X) (X ? "," : "")
- type = 0;
- do {
- ref = 0;
- fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
- do {
- fprintf(f, "%s\n { /* %s */", Comma(type), ref ? "Inter" : "Intra");
- band = 0;
- do {
- fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
- pt = 0;
- do {
- fprintf(f, "%s\n {", Comma(pt));
-
- t = 0;
- do {
- const int64_t x = context_counters[type][ref][band][pt][t];
- const int y = (int) x;
-
- assert(x == (int64_t) y); /* no overflow handling yet */
- fprintf(f, "%s %d", Comma(t), y);
- } while (++t < 1 + MAX_ENTROPY_TOKENS);
- fprintf(f, "}");
- } while (++pt < PREV_COEF_CONTEXTS);
- fprintf(f, "\n }");
- } while (++band < COEF_BANDS);
- fprintf(f, "\n }");
- } while (++ref < REF_TYPES);
- fprintf(f, "\n }");
- } while (++type < block_types);
- fprintf(f, "\n};\n");
-}
-
-static void print_probs(FILE *f, vp9_coeff_accum *context_counters,
- int block_types, const char *header) {
- int type, ref, band, pt, t;
-
- fprintf(f, "static const vp9_coeff_probs %s = {", header);
-
- type = 0;
-#define Newline(x, spaces) (x ? " " : "\n" spaces)
- do {
- fprintf(f, "%s%s{ /* block Type %d */",
- Comma(type), Newline(type, " "), type);
- ref = 0;
- do {
- fprintf(f, "%s%s{ /* %s */",
- Comma(band), Newline(band, " "), ref ? "Inter" : "Intra");
- band = 0;
- do {
- fprintf(f, "%s%s{ /* Coeff Band %d */",
- Comma(band), Newline(band, " "), band);
- pt = 0;
- do {
- unsigned int branch_ct[ENTROPY_NODES][2];
- unsigned int coef_counts[MAX_ENTROPY_TOKENS + 1];
- vp9_prob coef_probs[ENTROPY_NODES];
-
- if (pt >= 3 && band == 0)
- break;
- for (t = 0; t < MAX_ENTROPY_TOKENS + 1; ++t)
- coef_counts[t] = context_counters[type][ref][band][pt][t];
- vp9_tree_probs_from_distribution(vp9_coef_tree, coef_probs,
- branch_ct, coef_counts, 0);
- branch_ct[0][1] = coef_counts[MAX_ENTROPY_TOKENS] - branch_ct[0][0];
- coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
- fprintf(f, "%s\n {", Comma(pt));
-
- t = 0;
- do {
- fprintf(f, "%s %3d", Comma(t), coef_probs[t]);
- } while (++t < ENTROPY_NODES);
-
- fprintf(f, " }");
- } while (++pt < PREV_COEF_CONTEXTS);
- fprintf(f, "\n }");
- } while (++band < COEF_BANDS);
- fprintf(f, "\n }");
- } while (++ref < REF_TYPES);
- fprintf(f, "\n }");
- } while (++type < block_types);
- fprintf(f, "\n};\n");
-}
-
-void print_context_counters() {
- FILE *f = fopen("vp9_context.c", "w");
-
- fprintf(f, "#include \"vp9_entropy.h\"\n");
- fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
-
- /* print counts */
- print_counter(f, context_counters[TX_4X4], BLOCK_TYPES,
- "vp9_default_coef_counts_4x4[BLOCK_TYPES]");
- print_counter(f, context_counters[TX_8X8], BLOCK_TYPES,
- "vp9_default_coef_counts_8x8[BLOCK_TYPES]");
- print_counter(f, context_counters[TX_16X16], BLOCK_TYPES,
- "vp9_default_coef_counts_16x16[BLOCK_TYPES]");
- print_counter(f, context_counters[TX_32X32], BLOCK_TYPES,
- "vp9_default_coef_counts_32x32[BLOCK_TYPES]");
-
- /* print coefficient probabilities */
- print_probs(f, context_counters[TX_4X4], BLOCK_TYPES,
- "default_coef_probs_4x4[BLOCK_TYPES]");
- print_probs(f, context_counters[TX_8X8], BLOCK_TYPES,
- "default_coef_probs_8x8[BLOCK_TYPES]");
- print_probs(f, context_counters[TX_16X16], BLOCK_TYPES,
- "default_coef_probs_16x16[BLOCK_TYPES]");
- print_probs(f, context_counters[TX_32X32], BLOCK_TYPES,
- "default_coef_probs_32x32[BLOCK_TYPES]");
-
- fclose(f);
-
- f = fopen("context.bin", "wb");
- fwrite(context_counters, sizeof(context_counters), 1, f);
- fclose(f);
-}
-#endif
-
void vp9_tokenize_initialize() {
fill_value_tokens();
}
diff --git a/libvpx/vp9/encoder/vp9_tokenize.h b/libvpx/vp9/encoder/vp9_tokenize.h
index b78e100..e24e31b 100644
--- a/libvpx/vp9/encoder/vp9_tokenize.h
+++ b/libvpx/vp9/encoder/vp9_tokenize.h
@@ -28,9 +28,6 @@ typedef struct {
uint8_t skip_eob_node;
} TOKENEXTRA;
-typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
- [MAX_ENTROPY_TOKENS + 1];
-
int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize);
int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
int plane);
@@ -39,13 +36,6 @@ struct VP9_COMP;
void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
BLOCK_SIZE bsize);
-#ifdef ENTROPY_STATS
-void init_context_counters();
-void print_context_counters();
-
-extern vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES];
-#endif
-
extern const int *vp9_dct_value_cost_ptr;
/* TODO: The Token field should be broken out into a separate char array to
* improve cache locality, since it's needed for costing when the rest of the
diff --git a/libvpx/vp9/encoder/vp9_vaq.c b/libvpx/vp9/encoder/vp9_vaq.c
new file mode 100644
index 0000000..1f9cb87
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_vaq.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vp9/encoder/vp9_vaq.h"
+
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/common/vp9_systemdependent.h"
+
+#define ENERGY_MIN (-3)
+#define ENERGY_MAX (3)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
+#define ENERGY_IN_BOUNDS(energy)\
+ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+static double q_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 };
+static double rdmult_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 };
+static int segment_id[MAX_SEGMENTS] = { 5, 3, 1, 0, 2, 4, 6, 7 };
+
+#define Q_RATIO(i) q_ratio[(i) - ENERGY_MIN]
+#define RDMULT_RATIO(i) rdmult_ratio[(i) - ENERGY_MIN]
+#define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]
+
+DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = {0};
+
+unsigned int vp9_vaq_segment_id(int energy) {
+ ENERGY_IN_BOUNDS(energy);
+
+ return SEGMENT_ID(energy);
+}
+
+double vp9_vaq_rdmult_ratio(int energy) {
+ ENERGY_IN_BOUNDS(energy);
+
+ vp9_clear_system_state(); // __asm emms;
+
+ return RDMULT_RATIO(energy);
+}
+
+double vp9_vaq_inv_q_ratio(int energy) {
+ ENERGY_IN_BOUNDS(energy);
+
+ vp9_clear_system_state(); // __asm emms;
+
+ return Q_RATIO(-energy);
+}
+
+void vp9_vaq_init() {
+ int i;
+ double base_ratio;
+
+ assert(ENERGY_SPAN <= MAX_SEGMENTS);
+
+ vp9_clear_system_state(); // __asm emms;
+
+ base_ratio = 1.8;
+
+ for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
+ Q_RATIO(i) = pow(base_ratio, i/3.0);
+ }
+}
+
+void vp9_vaq_frame_setup(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ struct segmentation *seg = &cm->seg;
+ int base_q = vp9_convert_qindex_to_q(cm->base_qindex);
+ int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex +
+ cm->y_dc_delta_q);
+ int i;
+
+ vp9_enable_segmentation((VP9_PTR)cpi);
+ vp9_clearall_segfeatures(seg);
+
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ vp9_clear_system_state(); // __asm emms;
+
+ for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
+ int qindex_delta, segment_rdmult;
+
+ if (Q_RATIO(i) == 1) {
+ // No need to enable SEG_LVL_ALT_Q for this segment
+ RDMULT_RATIO(i) = 1;
+ continue;
+ }
+
+ qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i));
+ vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta);
+ vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q);
+
+ segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta +
+ cm->y_dc_delta_q);
+
+ RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult;
+ }
+}
+
+
+static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int var, sse;
+ int right_overflow = (xd->mb_to_right_edge < 0) ?
+ ((-xd->mb_to_right_edge) >> 3) : 0;
+ int bottom_overflow = (xd->mb_to_bottom_edge < 0) ?
+ ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+ if (right_overflow || bottom_overflow) {
+ const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;
+ const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;
+ int avg;
+ variance(x->plane[0].src.buf, x->plane[0].src.stride,
+ vp9_64_zeros, 0, bw, bh, &sse, &avg);
+ var = sse - (((int64_t)avg * avg) / (bw * bh));
+ return (256 * var) / (bw * bh);
+ } else {
+ var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
+ x->plane[0].src.stride,
+ vp9_64_zeros, 0, &sse);
+ return (256 * var) >> num_pels_log2_lookup[bs];
+ }
+}
+
+int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+ double energy;
+ unsigned int var = block_variance(cpi, x, bs);
+
+ vp9_clear_system_state(); // __asm emms;
+
+ // if (var <= 1000)
+ // return 0;
+
+  energy = 0.9 * (logf(var + 1) - 10.0);
+ return clamp(round(energy), ENERGY_MIN, ENERGY_MAX);
+}
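
For reference, the segment q ratios this file sets up follow Q_RATIO(i) = 1.8^(i/3) for energies -3..3, so the table runs from about 0.56 up to 1.8, with energy 0 mapping to ratio 1 (the segment left with SEG_LVL_ALT_Q disabled). A standalone sketch reproducing the table:

#include <math.h>
#include <stdio.h>

// Reproduces the vp9_vaq_init() table above with base_ratio = 1.8.
int main(void) {
  int i;
  for (i = -3; i <= 3; i++)
    printf("energy %+d -> q ratio %.3f\n", i, pow(1.8, i / 3.0));
  return 0;
}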
diff --git a/libvpx/vp9/encoder/vp9_vaq.h b/libvpx/vp9/encoder/vp9_vaq.h
new file mode 100644
index 0000000..dc18b22
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_vaq.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_CONFIG_VAQ_H_
+#define VP9_ENCODER_VP9_CONFIG_VAQ_H_
+
+#include "vp9/encoder/vp9_onyx_int.h"
+
+unsigned int vp9_vaq_segment_id(int energy);
+double vp9_vaq_rdmult_ratio(int energy);
+double vp9_vaq_inv_q_ratio(int energy);
+
+void vp9_vaq_init();
+void vp9_vaq_frame_setup(VP9_COMP *cpi);
+
+int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+
+#endif // VP9_ENCODER_VP9_CONFIG_VAQ_H_
diff --git a/libvpx/vp9/encoder/vp9_variance.h b/libvpx/vp9/encoder/vp9_variance.h
index 6e686d6..2ded97c 100644
--- a/libvpx/vp9/encoder/vp9_variance.h
+++ b/libvpx/vp9/encoder/vp9_variance.h
@@ -14,6 +14,15 @@
#include "vpx/vpx_integer.h"
// #include "./vpx_config.h"
+void variance(const uint8_t *src_ptr,
+ int source_stride,
+ const uint8_t *ref_ptr,
+ int recon_stride,
+ int w,
+ int h,
+ unsigned int *sse,
+ int *sum);
+
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -67,12 +76,6 @@ typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
unsigned int *sse,
const uint8_t *second_pred);
-typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
- int rp, unsigned long *sum_s,
- unsigned long *sum_r, unsigned long *sum_sq_s,
- unsigned long *sum_sq_r,
- unsigned long *sum_sxr);
-
typedef unsigned int (*vp9_getmbss_fn_t)(const short *);
typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
diff --git a/libvpx/vp9/encoder/vp9_variance_c.c b/libvpx/vp9/encoder/vp9_variance_c.c
index 155ba8a..8bc3850 100644
--- a/libvpx/vp9/encoder/vp9_variance_c.c
+++ b/libvpx/vp9/encoder/vp9_variance_c.c
@@ -8,13 +8,150 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "./vp9_rtcd.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/common/vp9_filter.h"
-#include "vp9/common/vp9_subpelvar.h"
-#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
-#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_filter.h"
+
+#include "vp9/encoder/vp9_variance.h"
+
+void variance(const uint8_t *src_ptr,
+ int source_stride,
+ const uint8_t *ref_ptr,
+ int recon_stride,
+ int w,
+ int h,
+ unsigned int *sse,
+ int *sum) {
+ int i, j;
+ int diff;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ diff = src_ptr[j] - ref_ptr[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ src_ptr += source_stride;
+ ref_ptr += recon_stride;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_first_pass
+ *
+ * INPUTS : uint8_t *src_ptr : Pointer to source block.
+ * uint32_t src_pixels_per_line : Stride of input block.
+ * uint32_t pixel_step : Offset between filter input
+ * samples (see notes).
+ * uint32_t output_height : Input block height.
+ * uint32_t output_width : Input block width.
+ * int32_t *vp9_filter : Array of 2 bi-linear filter
+ * taps.
+ *
+ * OUTPUTS : int32_t *output_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
+ * either horizontal or vertical direction to produce the
+ * filtered output block. Used to implement first-pass
+ * of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Produces int32_t output to retain precision for next pass.
+ * Two filter taps should sum to VP9_FILTER_WEIGHT.
+ * pixel_step defines whether the filter is applied
+ * horizontally (pixel_step=1) or vertically (pixel_step=
+ * stride).
+ * It defines the offset required to move from one input
+ * to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
+ uint16_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int16_t *vp9_filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+ (int)src_ptr[pixel_step] * vp9_filter[1],
+ FILTER_BITS);
+
+ src_ptr++;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_second_pass
+ *
+ * INPUTS : int32_t *src_ptr : Pointer to source block.
+ * uint32_t src_pixels_per_line : Stride of input block.
+ * uint32_t pixel_step : Offset between filter input
+ * samples (see notes).
+ * uint32_t output_height : Input block height.
+ * uint32_t output_width : Input block width.
+ * int32_t *vp9_filter : Array of 2 bi-linear filter
+ * taps.
+ *
+ * OUTPUTS : uint16_t *output_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
+ * either horizontal or vertical direction to produce the
+ * filtered output block. Used to implement second-pass
+ * of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Requires 32-bit input as produced by
+ * filter_block2d_bil_first_pass.
+ * Two filter taps should sum to VP9_FILTER_WEIGHT.
+ * pixel_step defines whether the filter is applied
+ * horizontally (pixel_step=1) or vertically (pixel_step=
+ * stride).
+ * It defines the offset required to move from one input
+ * to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int16_t *vp9_filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+ (int)src_ptr[pixel_step] * vp9_filter[1],
+ FILTER_BITS);
+ src_ptr++;
+ }
+
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
unsigned int i, sum = 0;
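
The two passes compose into the usual separable subpel variance: filter horizontally into a 16-bit intermediate with one extra row, filter that vertically back to 8 bits, then run variance() against the reference. A hypothetical 8x8 composition (the bilinear filter table name is an assumption, not part of this patch; everything else is declared in this file):

static unsigned int subpel_variance8x8_sketch(const uint8_t *src,
                                              int src_stride,
                                              int xoffset, int yoffset,
                                              const uint8_t *ref,
                                              int ref_stride,
                                              unsigned int *sse) {
  uint16_t fdata3[9 * 8];  // first pass keeps h + 1 rows for the vertical tap
  uint8_t temp2[8 * 8];
  int sum;
  var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 9, 8,
                                    bilinear_filters[xoffset]);   // assumed table
  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8,
                                     bilinear_filters[yoffset]);  // assumed table
  variance(temp2, 8, ref, ref_stride, 8, 8, sse, &sum);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 6);  // 64 pixels
}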
diff --git a/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
index 95ae266..2d59775 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -27,32 +27,14 @@ static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
__m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
return _mm_unpacklo_epi64(buf0, buf1);
}
-
-static INLINE __m128i k_cvtlo_epi16(__m128i a, __m128i mask16, __m128i kZero) {
- // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers
- __m128i sign_bit = _mm_and_si128(a, mask16);
- __m128i b = _mm_unpacklo_epi16(a, kZero);
- sign_bit = _mm_cmplt_epi16(sign_bit, kZero);
- sign_bit = _mm_unpacklo_epi16(kZero, sign_bit);
- return _mm_or_si128(sign_bit, b);
-}
-
-static INLINE __m128i k_cvthi_epi16(__m128i a, __m128i mask16, __m128i kZero) {
- // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers
- __m128i sign_bit = _mm_and_si128(a, mask16);
- __m128i b = _mm_unpackhi_epi16(a, kZero);
- sign_bit = _mm_cmplt_epi16(sign_bit, kZero);
- sign_bit = _mm_unpackhi_epi16(kZero, sign_bit);
- return _mm_or_si128(sign_bit, b);
-}
#endif
-void FDCT32x32_2D(int16_t *input,
- int16_t *output_org, int pitch) {
+void FDCT32x32_2D(const int16_t *input,
+ int16_t *output_org, int stride) {
// Calculate pre-multiplied strides
- const int str1 = pitch >> 1;
- const int str2 = pitch;
- const int str3 = pitch + str1;
+ const int str1 = stride;
+ const int str2 = 2 * stride;
+ const int str3 = 2 * stride + str1;
// We need an intermediate buffer between passes.
DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
// Constants
@@ -111,13 +93,13 @@ void FDCT32x32_2D(int16_t *input,
// Note: even though all the loads below are aligned, using the aligned
// intrinsic makes the code slightly slower.
if (0 == pass) {
- int16_t *in = &input[column_start];
+ const int16_t *in = &input[column_start];
// step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
// Note: the next four blocks could be in a loop. That would help the
// instruction cache but is actually slower.
{
- int16_t *ina = in + 0 * str1;
- int16_t *inb = in + 31 * str1;
+ const int16_t *ina = in + 0 * str1;
+ const int16_t *inb = in + 31 * str1;
__m128i *step1a = &step1[ 0];
__m128i *step1b = &step1[31];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -146,8 +128,8 @@ void FDCT32x32_2D(int16_t *input,
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
{
- int16_t *ina = in + 4 * str1;
- int16_t *inb = in + 27 * str1;
+ const int16_t *ina = in + 4 * str1;
+ const int16_t *inb = in + 27 * str1;
__m128i *step1a = &step1[ 4];
__m128i *step1b = &step1[27];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -176,8 +158,8 @@ void FDCT32x32_2D(int16_t *input,
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
{
- int16_t *ina = in + 8 * str1;
- int16_t *inb = in + 23 * str1;
+ const int16_t *ina = in + 8 * str1;
+ const int16_t *inb = in + 23 * str1;
__m128i *step1a = &step1[ 8];
__m128i *step1b = &step1[23];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -206,8 +188,8 @@ void FDCT32x32_2D(int16_t *input,
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
{
- int16_t *ina = in + 12 * str1;
- int16_t *inb = in + 19 * str1;
+ const int16_t *ina = in + 12 * str1;
+ const int16_t *inb = in + 19 * str1;
__m128i *step1a = &step1[12];
__m128i *step1b = &step1[19];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -1159,28 +1141,43 @@ void FDCT32x32_2D(int16_t *input,
} else {
__m128i lstep1[64], lstep2[64], lstep3[64];
__m128i u[32], v[32], sign[16];
- const __m128i mask16 = _mm_set1_epi32(0x80008000);
const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
// start using 32-bit operations
// stage 3
{
// expanding to 32-bit length prior to addition operations
- lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero);
- lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero);
- lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero);
- lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero);
- lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero);
- lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero);
- lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero);
- lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero);
- lstep2[ 8] = k_cvtlo_epi16(step2[ 4], mask16, kZero);
- lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero);
- lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero);
- lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero);
- lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero);
- lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero);
- lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero);
- lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero);
+ lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero);
+ lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero);
+ lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero);
+ lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero);
+ lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero);
+ lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero);
+ lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero);
+ lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero);
+ lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero);
+ lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero);
+ lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero);
+ lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero);
+ lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero);
+ lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero);
+ lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero);
+ lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero);
+ lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne);
+ lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne);
+ lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne);
+ lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne);
+ lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne);
+ lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne);
+ lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne);
+ lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne);
+ lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne);
+ lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne);
+ lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
+ lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
+ lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
+ lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
+ lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
+ lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
@@ -1231,42 +1228,75 @@ void FDCT32x32_2D(int16_t *input,
lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
}
{
- lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero);
- lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero);
- lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero);
- lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero);
- lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero);
- lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero);
- lstep2[46] = k_cvtlo_epi16(step2[23], mask16, kZero);
- lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero);
- lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero);
- lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero);
- lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero);
- lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero);
- lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero);
- lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero);
- lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero);
- lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero);
-
- lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero);
- lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero);
- lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero);
- lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero);
- lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero);
- lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero);
- lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero);
- lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero);
- lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero);
- lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero);
- lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero);
- lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero);
- lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero);
- lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero);
- lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero);
- lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero);
+ lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
+ lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
+ lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
+ lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
+ lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
+ lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
+ lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
+ lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
+ lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
+ lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
+ lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
+ lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
+ lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
+ lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
+ lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
+ lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
+ lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
+ lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
+ lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
+ lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
+ lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
+ lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
+ lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
+ lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
+ lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
+ lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
+ lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
+ lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
+ lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
+ lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
+ lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
+ lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
+
+ lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
+ lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
+ lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
+ lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
+ lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
+ lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
+ lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
+ lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
+ lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
+ lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
+ lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
+ lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
+ lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
+ lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
+ lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
+ lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
+ lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
+ lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
+ lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
+ lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
+ lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
+ lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
+ lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
+ lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
+ lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
+ lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
+ lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
+ lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
+ lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
+ lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
+ lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
+ lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
+
lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
@@ -1302,14 +1332,22 @@ void FDCT32x32_2D(int16_t *input,
// stage 4
{
// expanding to 32-bit length prior to addition operations
- lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero);
- lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero);
- lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero);
- lstep2[19] = k_cvthi_epi16(step2[ 9], mask16, kZero);
- lstep2[28] = k_cvtlo_epi16(step2[14], mask16, kZero);
- lstep2[29] = k_cvthi_epi16(step2[14], mask16, kZero);
- lstep2[30] = k_cvtlo_epi16(step2[15], mask16, kZero);
- lstep2[31] = k_cvthi_epi16(step2[15], mask16, kZero);
+ lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero);
+ lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero);
+ lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero);
+ lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero);
+ lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
+ lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
+ lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
+ lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
+ lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
+ lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
+ lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
+ lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
+ lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
+ lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
+ lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
+ lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]);
lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]);
@@ -1337,41 +1375,41 @@ void FDCT32x32_2D(int16_t *input,
lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
}
{
- // to be continued...
- //
- const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
- const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-
- u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
- u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
- u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
- u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
-
- // TODO(jingning): manually inline k_madd_epi32_ to further hide
- // instruction latency.
- v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
- v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
- v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
- v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
- v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
- v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
- v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
- v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
-
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-
- lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ // to be continued...
+ //
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+ // TODO(jingning): manually inline k_madd_epi32_ to further hide
+ // instruction latency.
+ v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
+ v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
+ v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
+ v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
+ v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
+ v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
+ v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
+ v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
+
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
}
{
const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
@@ -2647,4 +2685,4 @@ void FDCT32x32_2D(int16_t *input,
}
}
}
-}
+} // NOLINT
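
The recurring unpack-then-madd pairs above replace the deleted k_cvtlo/k_cvthi helpers: interleaving with kZero puts each int16 in the low half of a 32-bit lane as (value, 0), and _mm_madd_epi16 against kOne = _mm_set1_epi16(1) computes value*1 + 0*1 with signed 16-bit multiplies, which sign-extends correctly in two instructions instead of the old mask/compare/or chain. A self-contained sketch of the idiom:

#include <emmintrin.h>  // SSE2

// Widen eight signed 16-bit lanes to eight signed 32-bit lanes.
static void widen_epi16(__m128i x, __m128i *lo, __m128i *hi) {
  const __m128i kZero = _mm_setzero_si128();
  const __m128i kOne = _mm_set1_epi16(1);
  *lo = _mm_madd_epi16(_mm_unpacklo_epi16(x, kZero), kOne);  // lanes 0..3
  *hi = _mm_madd_epi16(_mm_unpackhi_epi16(x, kZero), kOne);  // lanes 4..7
}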
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
index eb271fe..dc11501 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -12,14 +12,13 @@
#include "vp9/common/vp9_idct.h" // for cospi constants
#include "vpx_ports/mem.h"
-void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
- const int stride = pitch >> 1;
int pass;
// Constants
// When we use them, in one case, they are all the same. In all others
@@ -112,12 +111,8 @@ void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
}
}
-void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) {
- vp9_short_fdct4x4_sse2(input, output, pitch);
- vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch);
-}
-
-static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+ int stride) {
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
__m128i mask;
@@ -171,22 +166,21 @@ static INLINE void transpose_4x4(__m128i *res) {
void fdct4_1d_sse2(__m128i *in) {
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[4], v[4];
- u[0] = _mm_add_epi16(in[0], in[3]);
- u[1] = _mm_add_epi16(in[1], in[2]);
- u[2] = _mm_sub_epi16(in[1], in[2]);
- u[3] = _mm_sub_epi16(in[0], in[3]);
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpacklo_epi16(in[3], in[2]);
+
+ v[0] = _mm_add_epi16(u[0], u[1]);
+ v[1] = _mm_sub_epi16(u[0], u[1]);
- v[0] = _mm_unpacklo_epi16(u[0], u[1]);
- v[1] = _mm_unpacklo_epi16(u[2], u[3]);
u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
- u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08); // 1
- u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24); // 3
+ u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3
v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
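
The swapped constants follow from interleaving before the butterfly: with per-lane u[0] = (in0, in1) and u[1] = (in3, in2), the subtraction gives v[1] = (in0 - in3, in1 - in2), so madd against (cospi_8, cospi_24) yields cospi_8*(in0 - in3) + cospi_24*(in1 - in2) -- the same value the old code obtained from madd of (in1 - in2, in0 - in3) against (cospi_24, cospi_8), but with two fewer packed add/sub operations since the interleave now feeds both the even and odd halves.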
@@ -249,7 +243,7 @@ void fadst4_1d_sse2(__m128i *in) {
transpose_4x4(in);
}
-void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in[4];
load_buffer_4x4(input, in, stride);
@@ -277,8 +271,7 @@ void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
write_buffer_4x4(output, in);
}
-void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
- const int stride = pitch >> 1;
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
int pass;
// Constants
// When we use them, in one case, they are all the same. In all others
@@ -535,15 +528,16 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
}
// load 8x8 array
-static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
- in[0] = _mm_load_si128((__m128i *)(input + 0 * stride));
- in[1] = _mm_load_si128((__m128i *)(input + 1 * stride));
- in[2] = _mm_load_si128((__m128i *)(input + 2 * stride));
- in[3] = _mm_load_si128((__m128i *)(input + 3 * stride));
- in[4] = _mm_load_si128((__m128i *)(input + 4 * stride));
- in[5] = _mm_load_si128((__m128i *)(input + 5 * stride));
- in[6] = _mm_load_si128((__m128i *)(input + 6 * stride));
- in[7] = _mm_load_si128((__m128i *)(input + 7 * stride));
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+ int stride) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
in[0] = _mm_slli_epi16(in[0], 2);
in[1] = _mm_slli_epi16(in[1], 2);
@@ -1033,7 +1027,7 @@ void fadst8_1d_sse2(__m128i *in) {
array_transpose_8x8(in, in);
}
-void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in[8];
load_buffer_8x8(input, in, stride);
@@ -1062,18 +1056,17 @@ void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
write_buffer_8x8(output, in, 8);
}
-void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
- const int stride = pitch >> 1;
int pass;
// We need an intermediate buffer between passes.
DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
- int16_t *in = input;
+ const int16_t *in = input;
int16_t *out = intermediate;
// Constants
// When we use them, in one case, they are all the same. In all others
@@ -1688,7 +1681,7 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
}
}
-static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0,
+static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
__m128i *in1, int stride) {
// load first 8 columns
load_buffer_8x8(input, in0, stride);
@@ -2540,7 +2533,7 @@ void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
}
-void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in0[16], in1[16];
load_buffer_16x16(input, in0, in1, stride);
@@ -2572,13 +2565,13 @@ void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
write_buffer_16x16(output, in0, in1, 16);
}
-#define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2
+#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
-#define FDCT32x32_2D vp9_short_fdct32x32_sse2
+#define FDCT32x32_2D vp9_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
#undef FDCT32x32_2D
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
index d141560..a3d0114 100644
--- a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
+++ b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
@@ -8,12 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"
-extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
+extern unsigned int vp9_get_mb_ss_mmx(const int16_t *src_ptr);
extern unsigned int vp9_get8x8var_mmx
(
const unsigned char *src_ptr,
@@ -45,7 +45,6 @@ unsigned int vp9_variance4x4_mmx(
vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 4));
-
}
unsigned int vp9_variance8x8_mmx(
@@ -61,7 +60,6 @@ unsigned int vp9_variance8x8_mmx(
*sse = var;
return (var - (((unsigned int)avg * avg) >> 6));
-
}
unsigned int vp9_mse16x16_mmx(
@@ -74,10 +72,14 @@ unsigned int vp9_mse16x16_mmx(
int sum0, sum1, sum2, sum3;
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+ &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+ &sse1, &sum1);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+ ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
+ ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
*sse = var;
@@ -94,11 +96,14 @@ unsigned int vp9_variance16x16_mmx(
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3, avg;
-
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+ &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+ &sse1, &sum1);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+ ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
+ ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
avg = sum0 + sum1 + sum2 + sum3;
@@ -115,14 +120,15 @@ unsigned int vp9_variance16x8_mmx(
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+ &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+ &sse1, &sum1);
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
-
}
@@ -135,13 +141,14 @@ unsigned int vp9_variance8x16_mmx(
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+ &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+ ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
-
}
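All of the reflowed calls in this file implement one identity: each 8x8 sub-block contributes an SSE and a sum, and variance over N pixels is SSE - sum^2/N, with the division written as a shift (>> 6 for 64 pixels, >> 7 for 128, >> 8 for 256). A plain-C sketch of how vp9_variance16x16_mmx composes four 8x8 results; get8x8var here is a hypothetical scalar stand-in for vp9_get8x8var_mmx:

#include <stdint.h>

/* Scalar stand-in for vp9_get8x8var_mmx: SSE and sum over one 8x8 block. */
static void get8x8var(const uint8_t *src, int src_stride,
                      const uint8_t *ref, int ref_stride,
                      unsigned int *sse, int *sum) {
  int r, c;
  *sse = 0;
  *sum = 0;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      *sse += d * d;
      *sum += d;
    }
  }
}

unsigned int variance16x16(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           unsigned int *sse) {
  unsigned int sse0, sse1, sse2, sse3;
  int sum0, sum1, sum2, sum3, avg;
  get8x8var(src, src_stride, ref, ref_stride, &sse0, &sum0);
  get8x8var(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
  get8x8var(src + 8 * src_stride, src_stride,
            ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
  get8x8var(src + 8 * src_stride + 8, src_stride,
            ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);
  *sse = sse0 + sse1 + sse2 + sse3;
  avg = sum0 + sum1 + sum2 + sum3;
  /* var = SSE - sum^2 / 256; 256 pixels, hence the >> 8. */
  return *sse - (((unsigned int)avg * avg) >> 8);
}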
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
index cea934d..79e42c4 100644
--- a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
@@ -26,7 +26,7 @@ extern unsigned int vp9_get4x4var_mmx
unsigned int vp9_get_mb_ss_sse2
(
- const short *src_ptr
+ const int16_t *src_ptr
);
unsigned int vp9_get16x16var_sse2
(
@@ -250,7 +250,6 @@ unsigned int vp9_mse16x16_sse2(
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
-
unsigned int sse0;
int sum0;
vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
@@ -407,12 +406,12 @@ FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16, 8, 16, 4, 3, opt1,); \
-FN(8, 16, 8, 3, 4, opt1,); \
-FN(8, 8, 8, 3, 3, opt1,); \
-FN(8, 4, 8, 3, 2, opt1,); \
-FN(4, 8, 4, 2, 3, opt2,); \
-FN(4, 4, 4, 2, 2, opt2,)
+FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
+FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
+FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
+FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
+FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
+FN(4, 4, 4, 2, 2, opt2, (unsigned int))
FNS(sse2, sse);
FNS(ssse3, ssse3);
@@ -487,12 +486,12 @@ FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16, 8, 16, 4, 3, opt1,); \
-FN(8, 16, 8, 3, 4, opt1,); \
-FN(8, 8, 8, 3, 3, opt1,); \
-FN(8, 4, 8, 3, 2, opt1,); \
-FN(4, 8, 4, 2, 3, opt2,); \
-FN(4, 4, 4, 2, 2, opt2,)
+FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
+FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
+FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
+FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
+FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
+FN(4, 4, 4, 2, 2, opt2, (unsigned int))
FNS(sse2, sse);
FNS(ssse3, ssse3);
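The only functional change in the two macro tables is the cast argument that was previously left empty for the sub-16x16 sizes. The cast guards the sum * sum product: for a 16x16 block the difference sum can reach 256 * 255 = 65280, whose square (about 4.26e9) overflows a signed 32-bit int but still fits in an unsigned int, while a 32x32 sum of up to 261120 squares to roughly 6.8e10 and needs int64_t. A small standalone illustration of the failure mode the casts close off:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int sum16 = 256 * 255;    /* worst-case |sum| for a 16x16 block */
  const int sum32 = 1024 * 255;   /* worst-case |sum| for a 32x32 block */

  /* sum16 * sum16 = 4,261,478,400 overflows int32_t (undefined behavior)
   * but is exact as unsigned int, so the 16x16-and-smaller rows now pass
   * (unsigned int) instead of an empty cast. */
  unsigned int ok16 = (unsigned int)sum16 * sum16;

  /* sum32 * sum32 = 68,183,654,400 does not fit in 32 bits at all,
   * so the 32x32-and-larger rows keep the (int64_t) cast. */
  int64_t ok32 = (int64_t)sum32 * sum32;

  printf("%u %lld\n", ok16, (long long)ok32);
  return 0;
}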
diff --git a/libvpx/vp9/vp9_common.mk b/libvpx/vp9/vp9_common.mk
index 687fb48..0badb08 100644
--- a/libvpx/vp9/vp9_common.mk
+++ b/libvpx/vp9/vp9_common.mk
@@ -48,7 +48,6 @@ VP9_COMMON_SRCS-yes += common/vp9_reconintra.h
VP9_COMMON_SRCS-yes += common/vp9_rtcd.c
VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.sh
VP9_COMMON_SRCS-yes += common/vp9_sadmxn.h
-VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h
VP9_COMMON_SRCS-yes += common/vp9_scale.h
VP9_COMMON_SRCS-yes += common/vp9_scale.c
VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
@@ -69,13 +68,17 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c
VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
VP9_COMMON_SRCS-yes += common/vp9_common_data.c
VP9_COMMON_SRCS-yes += common/vp9_common_data.h
+VP9_COMMON_SRCS-yes += common/vp9_scan.c
+VP9_COMMON_SRCS-yes += common/vp9_scan.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
+VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
@@ -88,11 +91,28 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm
endif
+# common (c)
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_common_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_vert_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_cols_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_dspr2.c
+
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
@@ -109,5 +129,6 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht8x8_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_save_reg_neon$(ASM)
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh))
diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c
index 48866d2..1942039 100644
--- a/libvpx/vp9/vp9_cx_iface.c
+++ b/libvpx/vp9/vp9_cx_iface.c
@@ -8,30 +8,30 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <stdlib.h>
+#include <string.h>
#include "vpx/vpx_codec.h"
#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
+#include "./vpx_version.h"
#include "vp9/encoder/vp9_onyx_int.h"
#include "vpx/vp8cx.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/common/vp9_onyx.h"
#include "vp9/vp9_iface_common.h"
-#include <stdlib.h>
-#include <string.h>
struct vp9_extracfg {
struct vpx_codec_pkt_list *pkt_list;
- int cpu_used; /** available cpu percentage in 1/16*/
- unsigned int enable_auto_alt_ref; /** if encoder decides to uses alternate reference frame */
+ int cpu_used; /* available cpu percentage in 1/16 */
+ unsigned int enable_auto_alt_ref;
unsigned int noise_sensitivity;
unsigned int Sharpness;
unsigned int static_thresh;
unsigned int tile_columns;
unsigned int tile_rows;
- unsigned int arnr_max_frames; /* alt_ref Noise Reduction Max Frame Count */
- unsigned int arnr_strength; /* alt_ref Noise Reduction Strength */
- unsigned int arnr_type; /* alt_ref filter type */
+ unsigned int arnr_max_frames;
+ unsigned int arnr_strength;
+ unsigned int arnr_type;
unsigned int experimental;
vp8e_tuning tuning;
unsigned int cq_level; /* constrained quality level */
@@ -48,7 +48,7 @@ struct extraconfig_map {
static const struct extraconfig_map extracfg_map[] = {
{
0,
- {
+ { // NOLINT
NULL,
0, /* cpu_used */
1, /* enable_auto_alt_ref */
@@ -85,11 +85,11 @@ struct vpx_codec_alg_priv {
uint32_t pending_frame_magnitude;
vpx_image_t preview_img;
vp8_postproc_cfg_t preview_ppcfg;
- vpx_codec_pkt_list_decl(64) pkt_list; // changed to accomendate the maximum number of lagged frames allowed
+ vpx_codec_pkt_list_decl(64) pkt_list;
unsigned int fixed_kf_cntr;
};
-static const VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
+static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
switch (frame) {
case VP8_LAST_FRAME:
return VP9_LAST_FLAG;
@@ -120,26 +120,26 @@ update_error_state(vpx_codec_alg_priv_t *ctx,
#define ERROR(str) do {\
ctx->base.err_detail = str;\
return VPX_CODEC_INVALID_PARAM;\
- } while(0)
+ } while (0)
-#define RANGE_CHECK(p,memb,lo,hi) do {\
- if(!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
+#define RANGE_CHECK(p, memb, lo, hi) do {\
+ if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
ERROR(#memb " out of range ["#lo".."#hi"]");\
- } while(0)
+ } while (0)
-#define RANGE_CHECK_HI(p,memb,hi) do {\
- if(!((p)->memb <= (hi))) \
+#define RANGE_CHECK_HI(p, memb, hi) do {\
+ if (!((p)->memb <= (hi))) \
ERROR(#memb " out of range [.."#hi"]");\
- } while(0)
+ } while (0)
-#define RANGE_CHECK_LO(p,memb,lo) do {\
- if(!((p)->memb >= (lo))) \
+#define RANGE_CHECK_LO(p, memb, lo) do {\
+ if (!((p)->memb >= (lo))) \
ERROR(#memb " out of range ["#lo"..]");\
- } while(0)
+ } while (0)
-#define RANGE_CHECK_BOOL(p,memb) do {\
- if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
- } while(0)
+#define RANGE_CHECK_BOOL(p, memb) do {\
+ if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
+ } while (0)
static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
const vpx_codec_enc_cfg_t *cfg,
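The whitespace-only churn in the hunk above keeps the do { ... } while (0) wrapper intact; that wrapper is what lets each RANGE_CHECK* macro expand to a single statement that stays safe under a braceless if/else. A small self-contained illustration (CHECK_RANGE is a made-up stand-in):

#include <stdio.h>

/* Without do/while (0), a multi-statement macro breaks under a braceless
 * if/else; wrapped, it expands to exactly one statement. */
#define CHECK_RANGE(v, lo, hi) do {                       \
    if ((v) < (lo) || (v) > (hi))                         \
      printf("%s out of range [%d..%d]\n", #v, lo, hi);   \
  } while (0)

int main(void) {
  int q = 64;
  if (q != 0)
    CHECK_RANGE(q, 0, 63);   /* expands safely as a single statement */
  else
    printf("zero\n");
  return 0;
}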
@@ -247,7 +247,8 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
oxcf->width = cfg.g_w;
oxcf->height = cfg.g_h;
/* guess a frame rate if out of whack, use 30 */
- oxcf->framerate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
+ oxcf->framerate = (double)(cfg.g_timebase.den)
+ / (double)(cfg.g_timebase.num);
if (oxcf->framerate > 180) {
oxcf->framerate = 30;
@@ -255,7 +256,7 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
switch (cfg.g_pass) {
case VPX_RC_ONE_PASS:
- oxcf->Mode = MODE_BESTQUALITY;
+ oxcf->Mode = MODE_GOODQUALITY;
break;
case VPX_RC_FIRST_PASS:
oxcf->Mode = MODE_FIRSTPASS;
@@ -266,25 +267,25 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
}
if (cfg.g_pass == VPX_RC_FIRST_PASS) {
- oxcf->allow_lag = 0;
- oxcf->lag_in_frames = 0;
+ oxcf->allow_lag = 0;
+ oxcf->lag_in_frames = 0;
} else {
- oxcf->allow_lag = (cfg.g_lag_in_frames) > 0;
- oxcf->lag_in_frames = cfg.g_lag_in_frames;
+ oxcf->allow_lag = (cfg.g_lag_in_frames) > 0;
+ oxcf->lag_in_frames = cfg.g_lag_in_frames;
}
// VBR only supported for now.
  // CBR code has been deprecated for the experimental phase.
// CQ mode not yet tested
oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
- /*
if (cfg.rc_end_usage == VPX_CQ)
oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
- */
- if (cfg.rc_end_usage == VPX_Q)
+ else if (cfg.rc_end_usage == VPX_Q)
oxcf->end_usage = USAGE_CONSTANT_QUALITY;
+ else if (cfg.rc_end_usage == VPX_CBR)
+ oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
- oxcf->target_bandwidth = cfg.rc_target_bitrate;
+ oxcf->target_bandwidth = cfg.rc_target_bitrate;
oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
oxcf->best_allowed_q = cfg.rc_min_quantizer;
@@ -299,7 +300,7 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
oxcf->starting_buffer_level = cfg.rc_buf_initial_sz;
oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz;
- oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct;
+ oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct;
oxcf->two_pass_vbrmin_section = cfg.rc_2pass_vbr_minsection_pct;
oxcf->two_pass_vbrmax_section = cfg.rc_2pass_vbr_maxsection_pct;
@@ -315,23 +316,23 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
oxcf->encode_breakout = vp8_cfg.static_thresh;
oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref;
oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity;
- oxcf->Sharpness = vp8_cfg.Sharpness;
+ oxcf->Sharpness = vp8_cfg.Sharpness;
- oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in;
- oxcf->output_pkt_list = vp8_cfg.pkt_list;
+ oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in;
+ oxcf->output_pkt_list = vp8_cfg.pkt_list;
oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
- oxcf->arnr_strength = vp8_cfg.arnr_strength;
- oxcf->arnr_type = vp8_cfg.arnr_type;
+ oxcf->arnr_strength = vp8_cfg.arnr_strength;
+ oxcf->arnr_type = vp8_cfg.arnr_type;
oxcf->tuning = vp8_cfg.tuning;
oxcf->tile_columns = vp8_cfg.tile_columns;
- oxcf->tile_rows = vp8_cfg.tile_rows;
+ oxcf->tile_rows = vp8_cfg.tile_rows;
oxcf->lossless = vp8_cfg.lossless;
- oxcf->error_resilient_mode = cfg.g_error_resilient;
+ oxcf->error_resilient_mode = cfg.g_error_resilient;
oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode;
oxcf->ss_number_layers = cfg.ss_number_layers;
@@ -441,8 +442,6 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
MAP(VP8E_SET_ARNR_TYPE, xcfg.arnr_type);
MAP(VP8E_SET_TUNING, xcfg.tuning);
MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level);
- MAP(VP9E_SET_MAX_Q, ctx->cfg.rc_max_quantizer);
- MAP(VP9E_SET_MIN_Q, ctx->cfg.rc_min_quantizer);
MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);
MAP(VP9E_SET_LOSSLESS, xcfg.lossless);
MAP(VP9E_SET_FRAME_PARALLEL_DECODING, xcfg.frame_parallel_decoding_mode);
@@ -500,7 +499,7 @@ static vpx_codec_err_t vp9e_common_init(vpx_codec_ctx_t *ctx,
*/
for (i = 0;
extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
- i++);
+ i++) {}
priv->vp8_cfg = extracfg_map[i].cfg;
priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
@@ -555,7 +554,6 @@ static vpx_codec_err_t vp9e_exp_init(vpx_codec_ctx_t *ctx,
static vpx_codec_err_t vp9e_destroy(vpx_codec_alg_priv_t *ctx) {
-
free(ctx->cx_data);
vp9_remove_compressor(&ctx->cpi);
free(ctx);
@@ -589,7 +587,8 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
static int write_superframe_index(vpx_codec_alg_priv_t *ctx) {
uint8_t marker = 0xc0;
- int mag, mask, index_sz;
+ unsigned int mask;
+ int mag, index_sz;
assert(ctx->pending_frame_count);
assert(ctx->pending_frame_count <= 8);
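The 0xc0 seed and the widened mask implement the VP9 superframe index marker: the top three bits are 0b110, bits 3-4 hold the per-frame size width in bytes minus one, and bits 0-2 the frame count minus one. A hedged sketch of assembling that byte (helper name hypothetical; field layout per the VP9 superframe format):

#include <stdint.h>

/* Assemble a VP9 superframe marker byte. mag ends up as the number of
 * bytes needed for the largest frame size minus one, probed the same way
 * the function above probes pending_frame_magnitude. */
static uint8_t superframe_marker(unsigned int frames, unsigned int max_size) {
  uint8_t marker = 0xc0;             /* 0b110 in the top three bits */
  unsigned int mag = 0, mask = 0xff;
  while (max_size > mask) {          /* 1..4 size bytes */
    mag++;
    mask = (mask << 8) | 0xff;
  }
  marker |= (uint8_t)(frames - 1);   /* bits 0-2: frame count - 1 */
  marker |= (uint8_t)(mag << 3);     /* bits 3-4: size bytes - 1 */
  return marker;
}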
@@ -713,8 +712,10 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx,
lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
/* vp8 uses 10,000,000 ticks/second as its time stamp */
- dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
- dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+ dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num
+ / ctx->cfg.g_timebase.den;
+ dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num /
+ ctx->cfg.g_timebase.den;
if (img != NULL) {
res = image2yuvconfig(img, &sd);
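The reflowed arithmetic above converts a pts expressed in the stream's timebase (num/den) into the encoder's fixed 10 MHz tick clock, and the later rounding fix widens a matching computation to vpx_codec_pts_t before it can overflow int. A small worked example under an assumed 1/30 timebase:

#include <stdint.h>
#include <stdio.h>

typedef int64_t vpx_codec_pts_t;  /* matches the libvpx typedef */

int main(void) {
  /* Assumed timebase of 1/30: one pts unit per frame at 30 fps. */
  const int num = 1, den = 30;
  vpx_codec_pts_t pts = 90;  /* 90 frames = 3 seconds */

  /* 10,000,000 ticks per second, as the encoder-side clock expects. */
  vpx_codec_pts_t ticks = pts * 10000000 * num / den;
  printf("%lld ticks\n", (long long)ticks);  /* 30,000,000 */

  /* The rounding constant must be computed in 64 bits: with a large
   * timebase numerator, 1000000 * num / 2 - 1 can overflow plain int,
   * which is why the patch casts to vpx_codec_pts_t first. */
  vpx_codec_pts_t round = (vpx_codec_pts_t)1000000 * num / 2 - 1;
  printf("%lld\n", (long long)round);
  return 0;
}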
@@ -768,7 +769,7 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx,
}
/* Add the frame packet to the list of returned packets. */
- round = 1000000 * ctx->cfg.g_timebase.num / 2 - 1;
+ round = (vpx_codec_pts_t)1000000 * ctx->cfg.g_timebase.num / 2 - 1;
delta = (dst_end_time_stamp - dst_time_stamp);
pkt.kind = VPX_CODEC_CX_FRAME_PKT;
pkt.data.frame.pts =
@@ -840,8 +841,6 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx,
cx_data += size;
cx_data_sz -= size;
}
-
- // printf("timestamp: %lld, duration: %d\n", pkt->data.frame.pts, pkt->data.frame.duration);
}
}
}
@@ -868,15 +867,14 @@ static vpx_codec_err_t vp9e_set_reference(vpx_codec_alg_priv_t *ctx,
vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type),
&sd);
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
-
+ }
}
static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
-
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
if (data) {
@@ -887,8 +885,9 @@ static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx,
vp9_copy_reference_enc(ctx->cpi,
ref_frame_to_vp9_reframe(frame->frame_type), &sd);
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
+ }
}
static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
@@ -917,8 +916,9 @@ static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx,
if (data) {
ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data);
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
+ }
#else
(void)ctx;
(void)ctr_id;
@@ -929,7 +929,6 @@ static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx,
static vpx_image_t *vp9e_get_preview(vpx_codec_alg_priv_t *ctx) {
-
YV12_BUFFER_CONFIG sd;
vp9_ppflags_t flags = {0};
@@ -942,8 +941,9 @@ static vpx_image_t *vp9e_get_preview(vpx_codec_alg_priv_t *ctx) {
if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
yuvconfig2image(&ctx->preview_img, &sd, NULL);
return &ctx->preview_img;
- } else
+ } else {
return NULL;
+ }
}
static vpx_codec_err_t vp9e_update_entropy(vpx_codec_alg_priv_t *ctx,
@@ -952,7 +952,6 @@ static vpx_codec_err_t vp9e_update_entropy(vpx_codec_alg_priv_t *ctx,
int update = va_arg(args, int);
vp9_update_entropy(ctx->cpi, update);
return VPX_CODEC_OK;
-
}
static vpx_codec_err_t vp9e_update_reference(vpx_codec_alg_priv_t *ctx,
@@ -974,64 +973,30 @@ static vpx_codec_err_t vp9e_use_reference(vpx_codec_alg_priv_t *ctx,
static vpx_codec_err_t vp9e_set_roi_map(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
- vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
-
- if (data) {
- vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
-
- if (!vp9_set_roimap(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
- roi->delta_q, roi->delta_lf, roi->static_threshold))
- return VPX_CODEC_OK;
- else
- return VPX_CODEC_INVALID_PARAM;
- } else
- return VPX_CODEC_INVALID_PARAM;
+ // TODO(yaowu): Need to re-implement and test for VP9.
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t vp9e_set_activemap(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
- vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
-
- if (data) {
-
- vpx_active_map_t *map = (vpx_active_map_t *)data;
-
- if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
- return VPX_CODEC_OK;
- else
- return VPX_CODEC_INVALID_PARAM;
- } else
- return VPX_CODEC_INVALID_PARAM;
+ // TODO(yaowu): Need to re-implement and test for VP9.
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
-
vpx_scaling_mode_t *data = va_arg(args, vpx_scaling_mode_t *);
if (data) {
int res;
vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
- res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
- scalemode.v_scaling_mode);
-
- if (!res) {
- return VPX_CODEC_OK;
- } else
- return VPX_CODEC_INVALID_PARAM;
- } else
- return VPX_CODEC_INVALID_PARAM;
-}
+ res = vp9_set_internal_size(ctx->cpi,
+ (VPX_SCALING)scalemode.h_scaling_mode,
+ (VPX_SCALING)scalemode.v_scaling_mode);
-static vpx_codec_err_t vp9e_set_width(vpx_codec_alg_priv_t *ctx, int ctr_id,
- va_list args) {
- unsigned int *data = va_arg(args, unsigned int *);
- if (data) {
- int res;
- res = vp9_set_size_literal(ctx->cpi, *data, 0);
if (!res) {
return VPX_CODEC_OK;
} else {
@@ -1042,50 +1007,40 @@ static vpx_codec_err_t vp9e_set_width(vpx_codec_alg_priv_t *ctx, int ctr_id,
}
}
-static vpx_codec_err_t vp9e_set_height(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
- unsigned int *data = va_arg(args, unsigned int *);
+static vpx_codec_err_t vp9e_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id,
+ va_list args) {
+ int data = va_arg(args, int);
+ vp9_set_svc(ctx->cpi, data);
+ return VPX_CODEC_OK;
+}
- if (data) {
- int res;
- res = vp9_set_size_literal(ctx->cpi, 0, *data);
+static vpx_codec_err_t vp9e_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
+ int ctr_id, va_list args) {
+ vpx_svc_parameters_t *data = va_arg(args, vpx_svc_parameters_t *);
+ VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
+ vpx_svc_parameters_t params;
- if (!res) {
- return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
- }
- } else {
+ if (data == NULL) {
return VPX_CODEC_INVALID_PARAM;
}
-}
-
-static vpx_codec_err_t vp9e_set_layer(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
- unsigned int *data = va_arg(args, unsigned int *);
- if (data) {
- int res;
- res = 0;
+ params = *(vpx_svc_parameters_t *)data;
- res = vp9_switch_layer(ctx->cpi, *data);
+ cpi->current_layer = params.layer;
+ cpi->lst_fb_idx = params.lst_fb_idx;
+ cpi->gld_fb_idx = params.gld_fb_idx;
+ cpi->alt_fb_idx = params.alt_fb_idx;
- if (!res) {
- return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
- }
- } else {
+ if (vp9_set_size_literal(ctx->cpi, params.width, params.height) != 0) {
return VPX_CODEC_INVALID_PARAM;
}
-}
-static vpx_codec_err_t vp9e_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id,
- va_list args) {
- int data = va_arg(args, int);
- vp9_set_svc(ctx->cpi, data);
+ ctx->cfg.rc_max_quantizer = params.max_quantizer;
+ ctx->cfg.rc_min_quantizer = params.min_quantizer;
+
+ set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+ vp9_change_config(ctx->cpi, &ctx->oxcf);
+
return VPX_CODEC_OK;
}
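With VP9E_SET_WIDTH, VP9E_SET_HEIGHT, and VP9E_SET_LAYER gone, per-layer state now travels in one vpx_svc_parameters_t and is applied through the new control. A hedged sketch of how a caller might drive it, with made-up layer values; only the fields the handler above actually reads are set:

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Configure the encoder for one spatial layer before encoding its frame.
 * codec is an initialized vpx_codec_ctx_t for vpx_codec_vp9_cx(). */
static int use_layer(vpx_codec_ctx_t *codec) {
  vpx_svc_parameters_t params = {0};
  params.width = 640;          /* this layer's coded size */
  params.height = 360;
  params.layer = 0;            /* current spatial layer index */
  params.lst_fb_idx = 0;       /* reference buffer assignments */
  params.gld_fb_idx = 1;
  params.alt_fb_idx = 2;
  params.min_quantizer = 8;    /* folded into rc_min/max_quantizer */
  params.max_quantizer = 48;
  return vpx_codec_control(codec, VP9E_SET_SVC_PARAMETERS, &params) ==
         VPX_CODEC_OK;
}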
@@ -1113,23 +1068,19 @@ static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = {
{VP8E_SET_ARNR_TYPE, set_param},
{VP8E_SET_TUNING, set_param},
{VP8E_SET_CQ_LEVEL, set_param},
- {VP9E_SET_MAX_Q, set_param},
- {VP9E_SET_MIN_Q, set_param},
{VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param},
{VP9E_SET_LOSSLESS, set_param},
{VP9E_SET_FRAME_PARALLEL_DECODING, set_param},
{VP9_GET_REFERENCE, get_reference},
- {VP9E_SET_WIDTH, vp9e_set_width},
- {VP9E_SET_HEIGHT, vp9e_set_height},
- {VP9E_SET_LAYER, vp9e_set_layer},
{VP9E_SET_SVC, vp9e_set_svc},
+ {VP9E_SET_SVC_PARAMETERS, vp9e_set_svc_parameters},
{ -1, NULL},
};
static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = {
{
0,
- {
+ { // NOLINT
0, /* g_usage */
0, /* g_threads */
0, /* g_profile */
@@ -1198,13 +1149,13 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = {
vp9e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
- {
+ { // NOLINT
NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */
NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */
NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */
NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
},
- {
+ { // NOLINT
vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */
vp9e_encode, /* vpx_codec_encode_fn_t encode; */
vp9e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */
@@ -1227,13 +1178,13 @@ CODEC_INTERFACE(vpx_codec_vp9x_cx) = {
vp9e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
- {
+ { // NOLINT
NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */
NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */
NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */
NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
},
- {
+ { // NOLINT
vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */
vp9e_encode, /* vpx_codec_encode_fn_t encode; */
vp9e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */
diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c
index 10b3238..5dacab4 100644
--- a/libvpx/vp9/vp9_dx_iface.c
+++ b/libvpx/vp9/vp9_dx_iface.c
@@ -14,7 +14,7 @@
#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"
#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
+#include "./vpx_version.h"
#include "vp9/decoder/vp9_onyxd.h"
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_read_bit_buffer.h"
@@ -172,9 +172,9 @@ static vpx_codec_err_t vp9_peek_si(const uint8_t *data,
rb.bit_offset += 1; // show frame
rb.bit_offset += 1; // error resilient
- if (vp9_rb_read_literal(&rb, 8) != SYNC_CODE_0 ||
- vp9_rb_read_literal(&rb, 8) != SYNC_CODE_1 ||
- vp9_rb_read_literal(&rb, 8) != SYNC_CODE_2) {
+ if (vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_0 ||
+ vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_1 ||
+ vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_2) {
return VPX_CODEC_UNSUP_BITSTREAM;
}
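The renamed constants are the three-byte VP9 sync code (0x49, 0x83, 0x42) that every keyframe header carries after the frame-marker, version, show-frame, and error-resilient bits. A hedged, bit-buffer-free sketch of the same probe on a byte-aligned view; real parsing must go through the bit reader as vp9_peek_si does, since the code sits at a bit offset in the stream:

#include <stdint.h>

#define VP9_SYNC_CODE_0 0x49
#define VP9_SYNC_CODE_1 0x83
#define VP9_SYNC_CODE_2 0x42

/* Returns 1 if the three bytes at p match the VP9 sync code. */
static int is_vp9_sync_code(const uint8_t *p) {
  return p[0] == VP9_SYNC_CODE_0 &&
         p[1] == VP9_SYNC_CODE_1 &&
         p[2] == VP9_SYNC_CODE_2;
}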
@@ -205,7 +205,6 @@ static vpx_codec_err_t vp9_peek_si(const uint8_t *data,
static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t *ctx,
vpx_codec_stream_info_t *si) {
-
unsigned int sz;
if (si->sz >= sizeof(vp9_stream_info_t))
@@ -323,15 +322,20 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
vp9_ppflags_t flags = {0};
if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
- flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag
+ flags.post_proc_flag =
#if CONFIG_POSTPROC_VISUALIZER
-
- | ((ctx->dbg_color_ref_frame_flag != 0) ? VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
- | ((ctx->dbg_color_mb_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
- | ((ctx->dbg_color_b_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
- | ((ctx->dbg_display_mv_flag != 0) ? VP9D_DEBUG_DRAW_MV : 0)
+ ((ctx->dbg_color_ref_frame_flag != 0) ?
+ VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
+ | ((ctx->dbg_color_mb_modes_flag != 0) ?
+ VP9D_DEBUG_CLR_BLK_MODES : 0)
+ | ((ctx->dbg_color_b_modes_flag != 0) ?
+ VP9D_DEBUG_CLR_BLK_MODES : 0)
+ | ((ctx->dbg_display_mv_flag != 0) ?
+ VP9D_DEBUG_DRAW_MV : 0)
+ |
#endif
-;
+ ctx->postproc_cfg.post_proc_flag;
+
flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
flags.noise_level = ctx->postproc_cfg.noise_level;
#if CONFIG_POSTPROC_VISUALIZER
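The rewrite in the hunk above fixes how the flag word is assembled: the old code started with the unconditional term and relied on a #if-guarded OR-chain closed by a bare ';' line, while the new order makes ctx->postproc_cfg.post_proc_flag the final operand so each guarded term can safely end in '|'. A tiny standalone version of that ordering trick (DEBUG_FLAGS and build_flags are made up):

#define DEBUG_FLAGS 1

/* Put the always-present operand last so the conditional block can
 * contribute "term |" pieces without leaving a dangling operator. */
static int build_flags(int base, int dbg) {
  return
#if DEBUG_FLAGS
      (dbg ? 0x2 : 0) |
#endif
      base;
}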
@@ -496,8 +500,9 @@ static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx,
mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags);
res = VPX_CODEC_OK;
- } else
+ } else {
res = VPX_CODEC_LIST_END;
+ }
} while (!mmap->sz && res != VPX_CODEC_LIST_END);
return res;
@@ -542,7 +547,6 @@ static vpx_codec_err_t vp9_xma_set_mmap(vpx_codec_ctx_t *ctx,
static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
-
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
if (data) {
@@ -553,15 +557,14 @@ static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
return vp9_set_reference_dec(ctx->pbi,
(VP9_REFFRAME)frame->frame_type, &sd);
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
-
+ }
}
static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
-
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
if (data) {
@@ -572,9 +575,9 @@ static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx,
return vp9_copy_reference_dec(ctx->pbi,
(VP9_REFFRAME)frame->frame_type, &sd);
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
-
+ }
}
static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
@@ -603,9 +606,9 @@ static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx,
ctx->postproc_cfg_set = 1;
ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
-
+ }
#else
return VPX_CODEC_INCAPABLE;
#endif
@@ -642,25 +645,27 @@ static vpx_codec_err_t get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
*update_info = pbi->refresh_frame_flags;
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
+ }
}
static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
int ctrl_id,
va_list args) {
-
int *corrupted = va_arg(args, int *);
if (corrupted) {
VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
- *corrupted = pbi->common.frame_to_show->corrupted;
-
+ if (pbi)
+ *corrupted = pbi->common.frame_to_show->corrupted;
+ else
+ return VPX_CODEC_ERROR;
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
-
+ }
}
static vpx_codec_err_t set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
@@ -699,13 +704,13 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = {
ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
vp9_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */
vp9_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */
- {
+ { // NOLINT
vp9_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */
vp9_get_si, /* vpx_codec_get_si_fn_t get_si; */
vp9_decode, /* vpx_codec_decode_fn_t decode; */
vp9_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
},
- {
+ { // NOLINT
/* encoder functions */
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk
index 9fbf100..0993c6c 100644
--- a/libvpx/vp9/vp9cx.mk
+++ b/libvpx/vp9/vp9cx.mk
@@ -20,6 +20,7 @@ VP9_CX_SRCS-yes += vp9_cx_iface.c
VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
VP9_CX_SRCS-yes += encoder/vp9_boolhuff.c
VP9_CX_SRCS-yes += encoder/vp9_dct.c
+VP9_CX_SRCS-yes += encoder/vp9_dct.h
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h
VP9_CX_SRCS-yes += encoder/vp9_encodeintra.c
@@ -64,6 +65,8 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c
VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
VP9_CX_SRCS-yes += encoder/vp9_variance_c.c
+VP9_CX_SRCS-yes += encoder/vp9_vaq.c
+VP9_CX_SRCS-yes += encoder/vp9_vaq.h
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
diff --git a/libvpx/vp9/vp9dx.mk b/libvpx/vp9/vp9dx.mk
index be3afe8..3a27cdd 100644
--- a/libvpx/vp9/vp9dx.mk
+++ b/libvpx/vp9/vp9dx.mk
@@ -32,12 +32,7 @@ VP9_DX_SRCS-yes += decoder/vp9_thread.c
VP9_DX_SRCS-yes += decoder/vp9_thread.h
VP9_DX_SRCS-yes += decoder/vp9_treereader.h
VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c
-VP9_DX_SRCS-yes += decoder/vp9_idct_blk.c
-VP9_DX_SRCS-yes += decoder/vp9_idct_blk.h
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h
VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
-
-VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c
-VP9_DX_SRCS-$(HAVE_NEON) += decoder/arm/neon/vp9_add_constant_residual_neon$(ASM)