diff options
Diffstat (limited to 'libvpx/vp9/encoder')
-rw-r--r-- | libvpx/vp9/encoder/vp9_bitstream.c | 161 | ||||
-rw-r--r-- | libvpx/vp9/encoder/vp9_block.h | 49 | ||||
-rw-r--r-- | libvpx/vp9/encoder/vp9_encodeframe.c | 129 | ||||
-rw-r--r-- | libvpx/vp9/encoder/vp9_encodemb.c | 123 | ||||
-rw-r--r-- | libvpx/vp9/encoder/vp9_encodemv.c | 13 | ||||
-rw-r--r-- | libvpx/vp9/encoder/vp9_firstpass.c | 13 | ||||
-rw-r--r-- | libvpx/vp9/encoder/vp9_onyx_if.c | 106 | ||||
-rw-r--r-- | libvpx/vp9/encoder/vp9_onyx_int.h | 1 | ||||
-rw-r--r-- | libvpx/vp9/encoder/vp9_quantize.c | 78 | ||||
-rw-r--r-- | libvpx/vp9/encoder/vp9_rdopt.c | 193 | ||||
-rw-r--r-- | libvpx/vp9/encoder/vp9_tokenize.c | 6 | ||||
-rw-r--r-- | libvpx/vp9/encoder/x86/vp9_dct_sse2.c | 5 | ||||
-rw-r--r-- | libvpx/vp9/encoder/x86/vp9_subpel_variance.asm | 198 |
13 files changed, 650 insertions, 425 deletions
diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c index 87bd36c..efbadba 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libvpx/vp9/encoder/vp9_bitstream.c @@ -169,10 +169,8 @@ static void update_mode(vp9_writer *w, int n, vp9_tree tree, const unsigned int num_events[/* n */]) { int i = 0; - vp9_tree_probs_from_distribution(tree, bct, num_events, 0); - n--; - - for (i = 0; i < n; ++i) + vp9_tree_probs_from_distribution(tree, bct, num_events); + for (i = 0; i < n - 1; ++i) vp9_cond_prob_diff_update(w, &Pcur[i], bct[i]); } @@ -191,12 +189,14 @@ static void update_mbintra_mode_probs(VP9_COMP* const cpi, static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m, TX_SIZE tx_size, BLOCK_SIZE bsize, vp9_writer *w) { + const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; const MACROBLOCKD *const xd = &cpi->mb.e_mbd; - const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs, m); + const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd, + &cpi->common.fc.tx_probs); vp9_write(w, tx_size != TX_4X4, tx_probs[0]); - if (bsize >= BLOCK_16X16 && tx_size != TX_4X4) { + if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { vp9_write(w, tx_size != TX_8X8, tx_probs[1]); - if (bsize >= BLOCK_32X32 && tx_size != TX_8X8) + if (tx_size != TX_8X8 && max_tx_size >= TX_32X32) vp9_write(w, tx_size != TX_16X16, tx_probs[2]); } } @@ -231,7 +231,7 @@ static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) { int i, j; for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) { vp9_tree_probs_from_distribution(vp9_switchable_interp_tree, branch_ct, - cm->counts.switchable_interp[j], 0); + cm->counts.switchable_interp[j]); for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) vp9_cond_prob_diff_update(w, &cm->fc.switchable_interp_prob[j][i], @@ -250,7 +250,7 @@ static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer *w) { for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { unsigned int branch_ct[INTER_MODES - 1][2]; vp9_tree_probs_from_distribution(vp9_inter_mode_tree, branch_ct, - cm->counts.inter_mode[i], NEARESTMV); + cm->counts.inter_mode[i]); for (j = 0; j < INTER_MODES - 1; ++j) vp9_cond_prob_diff_update(w, &cm->fc.inter_mode_probs[i][j], @@ -258,15 +258,15 @@ static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer *w) { } } -static void pack_mb_tokens(vp9_writer* const bc, +static void pack_mb_tokens(vp9_writer* const w, TOKENEXTRA **tp, const TOKENEXTRA *const stop) { TOKENEXTRA *p = *tp; while (p < stop && p->token != EOSB_TOKEN) { const int t = p->token; - const struct vp9_token *const a = vp9_coef_encodings + t; - const vp9_extra_bit *const b = vp9_extra_bits + t; + const struct vp9_token *const a = &vp9_coef_encodings[t]; + const vp9_extra_bit *const b = &vp9_extra_bits[t]; int i = 0; const vp9_prob *pp; int v = a->value; @@ -289,7 +289,7 @@ static void pack_mb_tokens(vp9_writer* const bc, do { const int bb = (v >> --n) & 1; - vp9_write(bc, bb, pp[i >> 1]); + vp9_write(w, bb, pp[i >> 1]); i = vp9_coef_tree[i + bb]; } while (n); @@ -304,12 +304,12 @@ static void pack_mb_tokens(vp9_writer* const bc, do { const int bb = (v >> --n) & 1; - vp9_write(bc, bb, pb[i >> 1]); + vp9_write(w, bb, pb[i >> 1]); i = b->tree[i + bb]; } while (n); } - vp9_write_bit(bc, e & 1); + vp9_write_bit(w, e & 1); } ++p; } @@ -321,7 +321,7 @@ static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode, const vp9_prob *p) { assert(is_inter_mode(mode)); write_token(w, vp9_inter_mode_tree, p, - &vp9_inter_mode_encodings[inter_mode_offset(mode)]); + &vp9_inter_mode_encodings[INTER_OFFSET(mode)]); } @@ -448,7 +448,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { if (bsize >= BLOCK_8X8) { write_sb_mv_ref(bc, mode, mv_ref_p); ++cm->counts.inter_mode[mi->mode_context[rf]] - [inter_mode_offset(mode)]; + [INTER_OFFSET(mode)]; } } @@ -471,7 +471,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode; write_sb_mv_ref(bc, blockmode, mv_ref_p); ++cm->counts.inter_mode[mi->mode_context[rf]] - [inter_mode_offset(blockmode)]; + [INTER_OFFSET(blockmode)]; if (blockmode == NEWMV) { #ifdef ENTROPY_STATS @@ -545,37 +545,33 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, } static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi_8x8, vp9_writer *bc, - TOKENEXTRA **tok, TOKENEXTRA *tok_end, - int mi_row, int mi_col, int index) { + vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end, + int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - MODE_INFO *m = mi_8x8[0]; - - if (m->mbmi.sb_type < BLOCK_8X8) - if (index > 0) - return; + MODE_INFO *m; - xd->mi_8x8 = mi_8x8; + xd->mi_8x8 = cm->mi_grid_visible + (mi_row * cm->mode_info_stride + mi_col); + m = xd->mi_8x8[0]; set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type], mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type], cm->mi_rows, cm->mi_cols); if (frame_is_intra_only(cm)) { - write_mb_modes_kf(cpi, mi_8x8, bc); + write_mb_modes_kf(cpi, xd->mi_8x8, w); #ifdef ENTROPY_STATS active_section = 8; #endif } else { - pack_inter_mode_mvs(cpi, m, bc); + pack_inter_mode_mvs(cpi, m, w); #ifdef ENTROPY_STATS active_section = 1; #endif } assert(*tok < tok_end); - pack_mb_tokens(bc, tok, tok_end); + pack_mb_tokens(w, tok, tok_end); } static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col, @@ -602,59 +598,50 @@ static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col, } static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi_8x8, vp9_writer *bc, - TOKENEXTRA **tok, TOKENEXTRA *tok_end, - int mi_row, int mi_col, BLOCK_SIZE bsize, - int index) { + vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end, + int mi_row, int mi_col, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; - const int mis = cm->mode_info_stride; - int bsl = b_width_log2(bsize); - int bs = (1 << bsl) / 4; // mode_info step for subsize - int n; - PARTITION_TYPE partition = PARTITION_NONE; + const int bsl = b_width_log2(bsize); + const int bs = (1 << bsl) / 4; + PARTITION_TYPE partition; BLOCK_SIZE subsize; - MODE_INFO *m = mi_8x8[0]; + MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mode_info_stride + mi_col]; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; partition = partition_lookup[bsl][m->mbmi.sb_type]; - - if (bsize < BLOCK_8X8) { - if (index > 0) - return; - } else { - write_partition(cpi, bs, mi_row, mi_col, partition, bsize, bc); - } - + write_partition(cpi, bs, mi_row, mi_col, partition, bsize, w); subsize = get_subsize(bsize, partition); - - switch (partition) { - case PARTITION_NONE: - write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); - break; - case PARTITION_HORZ: - write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); - if ((mi_row + bs) < cm->mi_rows) - write_modes_b(cpi, tile, mi_8x8 + bs * mis, bc, tok, tok_end, - mi_row + bs, mi_col, 1); - break; - case PARTITION_VERT: - write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); - if ((mi_col + bs) < cm->mi_cols) - write_modes_b(cpi, tile, mi_8x8 + bs, bc, tok, tok_end, - mi_row, mi_col + bs, 1); - break; - case PARTITION_SPLIT: - for (n = 0; n < 4; n++) { - const int j = n >> 1, i = n & 1; - write_modes_sb(cpi, tile, mi_8x8 + j * bs * mis + i * bs, bc, - tok, tok_end, - mi_row + j * bs, mi_col + i * bs, subsize, n); - } - break; - default: - assert(0); + if (subsize < BLOCK_8X8) { + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + } else { + switch (partition) { + case PARTITION_NONE: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + break; + case PARTITION_HORZ: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_row + bs < cm->mi_rows) + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col); + break; + case PARTITION_VERT: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_col + bs < cm->mi_cols) + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs); + break; + case PARTITION_SPLIT: + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs, + subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col, + subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs, + subsize); + break; + default: + assert(0); + } } // update partition context @@ -665,25 +652,15 @@ static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, } static void write_modes(VP9_COMP *cpi, const TileInfo *const tile, - vp9_writer* const bc, - TOKENEXTRA **tok, TOKENEXTRA *tok_end) { - VP9_COMMON *const cm = &cpi->common; - const int mis = cm->mode_info_stride; + vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end) { int mi_row, mi_col; - MODE_INFO **mi_8x8 = cm->mi_grid_visible; - MODE_INFO **m_8x8; - - mi_8x8 += tile->mi_col_start + tile->mi_row_start * mis; for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; - mi_row += 8, mi_8x8 += 8 * mis) { - m_8x8 = mi_8x8; - vp9_zero(cpi->left_seg_context); + mi_row += MI_BLOCK_SIZE) { + vp9_zero(cpi->left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; - mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) { - write_modes_sb(cpi, tile, m_8x8, bc, tok, tok_end, mi_row, mi_col, - BLOCK_64X64, 0); - } + mi_col += MI_BLOCK_SIZE) + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, BLOCK_64X64); } } @@ -703,7 +680,7 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) { continue; vp9_tree_probs_from_distribution(vp9_coef_tree, coef_branch_ct[i][j][k][l], - coef_counts[i][j][k][l], 0); + coef_counts[i][j][k][l]); coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] - coef_branch_ct[i][j][k][l][0][0]; for (m = 0; m < UNCONSTRAINED_NODES; ++m) @@ -1217,7 +1194,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { for (tile_col = 0; tile_col < tile_cols; tile_col++) { TileInfo tile; - vp9_tile_init(&tile, cm, 0, tile_col); + vp9_tile_init(&tile, cm, tile_row, tile_col); tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col]; if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h index 8033a4d..4445970 100644 --- a/libvpx/vp9/encoder/vp9_block.h +++ b/libvpx/vp9/encoder/vp9_block.h @@ -27,6 +27,18 @@ typedef struct { typedef struct { MODE_INFO mic; uint8_t *zcoeff_blk; + int16_t *coeff[MAX_MB_PLANE][3]; + int16_t *qcoeff[MAX_MB_PLANE][3]; + int16_t *dqcoeff[MAX_MB_PLANE][3]; + uint16_t *eobs[MAX_MB_PLANE][3]; + + // dual buffer pointers, 0: in use, 1: best in store + int16_t *coeff_pbuf[MAX_MB_PLANE][3]; + int16_t *qcoeff_pbuf[MAX_MB_PLANE][3]; + int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3]; + uint16_t *eobs_pbuf[MAX_MB_PLANE][3]; + + int is_coded; int num_4x4_blk; int skip; int_mv best_ref_mv; @@ -57,7 +69,7 @@ typedef struct { struct macroblock_plane { DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]); - DECLARE_ALIGNED(16, int16_t, coeff[64 * 64]); + int16_t *coeff; struct buf_2d src; // Quantizer setings @@ -81,6 +93,10 @@ struct macroblock { MACROBLOCKD e_mbd; int skip_block; + int select_txfm_size; + int skip_recode; + int skip_optimize; + int q_index; search_site *ss; int ss_count; @@ -120,6 +136,11 @@ struct macroblock { int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; + unsigned char sb_index; // index of 32x32 block inside the 64x64 block + unsigned char mb_index; // index of 16x16 block inside the 32x32 block + unsigned char b_index; // index of 8x8 block inside the 16x16 block + unsigned char ab_index; // index of 4x4 block inside the 8x8 block + // These define limits to motion vector components to prevent them // from extending outside the UMV borders int mv_col_min; @@ -179,35 +200,33 @@ struct macroblock { // refactoring on organizing the temporary buffers, when recursive // partition down to 4x4 block size is enabled. static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; - switch (bsize) { case BLOCK_64X64: return &x->sb64_context; case BLOCK_64X32: - return &x->sb64x32_context[xd->sb_index]; + return &x->sb64x32_context[x->sb_index]; case BLOCK_32X64: - return &x->sb32x64_context[xd->sb_index]; + return &x->sb32x64_context[x->sb_index]; case BLOCK_32X32: - return &x->sb32_context[xd->sb_index]; + return &x->sb32_context[x->sb_index]; case BLOCK_32X16: - return &x->sb32x16_context[xd->sb_index][xd->mb_index]; + return &x->sb32x16_context[x->sb_index][x->mb_index]; case BLOCK_16X32: - return &x->sb16x32_context[xd->sb_index][xd->mb_index]; + return &x->sb16x32_context[x->sb_index][x->mb_index]; case BLOCK_16X16: - return &x->mb_context[xd->sb_index][xd->mb_index]; + return &x->mb_context[x->sb_index][x->mb_index]; case BLOCK_16X8: - return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index]; + return &x->sb16x8_context[x->sb_index][x->mb_index][x->b_index]; case BLOCK_8X16: - return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index]; + return &x->sb8x16_context[x->sb_index][x->mb_index][x->b_index]; case BLOCK_8X8: - return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index]; + return &x->sb8x8_context[x->sb_index][x->mb_index][x->b_index]; case BLOCK_8X4: - return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index]; + return &x->sb8x4_context[x->sb_index][x->mb_index][x->b_index]; case BLOCK_4X8: - return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index]; + return &x->sb4x8_context[x->sb_index][x->mb_index][x->b_index]; case BLOCK_4X4: - return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index]; + return &x->ab4x4_context[x->sb_index][x->mb_index][x->b_index]; default: assert(0); return NULL; diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c index a45299b..3e75f3b 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libvpx/vp9/encoder/vp9_encodeframe.c @@ -50,25 +50,25 @@ int enc_debug = 0; #endif -static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) { +static INLINE uint8_t *get_sb_index(MACROBLOCK *x, BLOCK_SIZE subsize) { switch (subsize) { case BLOCK_64X64: case BLOCK_64X32: case BLOCK_32X64: case BLOCK_32X32: - return &xd->sb_index; + return &x->sb_index; case BLOCK_32X16: case BLOCK_16X32: case BLOCK_16X16: - return &xd->mb_index; + return &x->mb_index; case BLOCK_16X8: case BLOCK_8X16: case BLOCK_8X8: - return &xd->b_index; + return &x->b_index; case BLOCK_8X4: case BLOCK_4X8: case BLOCK_4X4: - return &xd->ab_index; + return &x->ab_index; default: assert(0); return NULL; @@ -367,6 +367,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; MODE_INFO *mi = &ctx->mic; MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; MODE_INFO *mi_addr = xd->mi_8x8[0]; @@ -375,6 +377,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, const int mis = cm->mode_info_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; + int max_plane; assert(mi->mbmi.mode < MB_MODE_COUNT); assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES); @@ -383,6 +386,21 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, *mi_addr = *mi; + max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1; + for (i = 0; i < max_plane; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][1]; + pd[i].qcoeff = ctx->qcoeff_pbuf[i][1]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; + pd[i].eobs = ctx->eobs_pbuf[i][1]; + } + + for (i = max_plane; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][2]; + pd[i].qcoeff = ctx->qcoeff_pbuf[i][2]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2]; + pd[i].eobs = ctx->eobs_pbuf[i][2]; + } + // Restore the coding context of the MB to that that was in place // when the mode was picked for it for (y = 0; y < mi_height; y++) @@ -578,6 +596,9 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + int i; int orig_rdmult = x->rdmult; double rdmult_ratio; @@ -590,7 +611,7 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. - if (xd->ab_index != 0) { + if (x->ab_index != 0) { *totalrate = 0; *totaldist = 0; return; @@ -600,6 +621,15 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, set_offsets(cpi, tile, mi_row, mi_col, bsize); xd->mi_8x8[0]->mbmi.sb_type = bsize; + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][0]; + pd[i].qcoeff = ctx->qcoeff_pbuf[i][0]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0]; + pd[i].eobs = ctx->eobs_pbuf[i][0]; + } + ctx->is_coded = 0; + x->skip_recode = 0; + // Set to zero to make sure we do not use the previous encoded frame stats xd->mi_8x8[0]->mbmi.skip_coeff = 0; @@ -687,16 +717,15 @@ static void update_stats(VP9_COMP *cpi) { } static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; switch (bsize) { case BLOCK_64X64: return &x->sb64_partitioning; case BLOCK_32X32: - return &x->sb_partitioning[xd->sb_index]; + return &x->sb_partitioning[x->sb_index]; case BLOCK_16X16: - return &x->mb_partitioning[xd->sb_index][xd->mb_index]; + return &x->mb_partitioning[x->sb_index][x->mb_index]; case BLOCK_8X8: - return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index]; + return &x->b_partitioning[x->sb_index][x->mb_index][x->b_index]; default: assert(0); return NULL; @@ -769,20 +798,19 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE bsize, int sub_index) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; if (sub_index != -1) - *get_sb_index(xd, bsize) = sub_index; + *get_sb_index(x, bsize) = sub_index; if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. - if (xd->ab_index > 0) + if (x->ab_index > 0) return; } set_offsets(cpi, tile, mi_row, mi_col, bsize); @@ -800,9 +828,8 @@ static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE bsize) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; BLOCK_SIZE c1 = BLOCK_8X8; const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4; int pl = 0; @@ -848,7 +875,7 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile, for (i = 0; i < 4; i++) { const int x_idx = i & 1, y_idx = i >> 1; - *get_sb_index(xd, subsize) = i; + *get_sb_index(x, subsize) = i; encode_sb(cpi, tile, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, output_enabled, subsize); } @@ -975,9 +1002,8 @@ static void rd_use_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD *xd = &cpi->mb.e_mbd; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; const int mis = cm->mode_info_stride; int bsl = b_width_log2(bsize); const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; @@ -1012,7 +1038,7 @@ static void rd_use_partition(VP9_COMP *cpi, if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. - if (xd->ab_index != 0) { + if (x->ab_index != 0) { *rate = 0; *dist = 0; return; @@ -1070,7 +1096,7 @@ static void rd_use_partition(VP9_COMP *cpi, bsize, get_block_context(x, bsize), INT64_MAX); break; case PARTITION_HORZ: - *get_sb_index(xd, subsize) = 0; + *get_sb_index(x, subsize) = 0; pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, subsize, get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && @@ -1079,7 +1105,7 @@ static void rd_use_partition(VP9_COMP *cpi, int64_t dt = 0; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *get_sb_index(xd, subsize) = 1; + *get_sb_index(x, subsize) = 1; pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { @@ -1093,7 +1119,7 @@ static void rd_use_partition(VP9_COMP *cpi, } break; case PARTITION_VERT: - *get_sb_index(xd, subsize) = 0; + *get_sb_index(x, subsize) = 0; pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, subsize, get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && @@ -1102,7 +1128,7 @@ static void rd_use_partition(VP9_COMP *cpi, int64_t dt = 0; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *get_sb_index(xd, subsize) = 1; + *get_sb_index(x, subsize) = 1; pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { @@ -1128,7 +1154,7 @@ static void rd_use_partition(VP9_COMP *cpi, if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; - *get_sb_index(xd, subsize) = i; + *get_sb_index(x, subsize) = i; rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp, mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt, @@ -1169,11 +1195,10 @@ static void rd_use_partition(VP9_COMP *cpi, ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; - if ((mi_row + y_idx >= cm->mi_rows) - || (mi_col + x_idx >= cm->mi_cols)) + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; - *get_sb_index(xd, split_subsize) = i; + *get_sb_index(x, split_subsize) = i; *get_sb_partitioning(x, bsize) = split_subsize; *get_sb_partitioning(x, split_subsize) = split_subsize; @@ -1353,7 +1378,6 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; // Only use 8x8 result for non HD videos. // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0; @@ -1366,9 +1390,9 @@ static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) { PICK_MODE_CONTEXT *block_context = NULL; if (bsize == BLOCK_16X16) { - block_context = x->sb8x8_context[xd->sb_index][xd->mb_index]; + block_context = x->sb8x8_context[x->sb_index][x->mb_index]; } else if (bsize == BLOCK_32X32) { - block_context = x->mb_context[xd->sb_index]; + block_context = x->mb_context[x->sb_index]; } else if (bsize == BLOCK_64X64) { block_context = x->sb32_context; } @@ -1456,9 +1480,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon, int64_t best_rd) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; const int ms = num_8x8_blocks_wide_lookup[bsize] / 2; ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; @@ -1484,7 +1507,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. - if (xd->ab_index != 0) { + if (x->ab_index != 0) { *rate = 0; *dist = 0; return; @@ -1582,7 +1605,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) continue; - *get_sb_index(xd, subsize) = i; + *get_sb_index(x, subsize) = i; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize, @@ -1629,7 +1652,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // PARTITION_HORZ if (partition_horz_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_HORZ); - *get_sb_index(xd, subsize) = 0; + *get_sb_index(x, subsize) = 0; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, @@ -1640,7 +1663,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *get_sb_index(xd, subsize) = 1; + *get_sb_index(x, subsize) = 1; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate, @@ -1674,7 +1697,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (partition_vert_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_VERT); - *get_sb_index(xd, subsize) = 0; + *get_sb_index(x, subsize) = 0; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, @@ -1684,7 +1707,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *get_sb_index(xd, subsize) = 1; + *get_sb_index(x, subsize) = 1; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate, @@ -1765,7 +1788,7 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile, } static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, TOKENEXTRA **tp, int *totalrate) { + int mi_row, TOKENEXTRA **tp) { VP9_COMMON * const cm = &cpi->common; int mi_col; @@ -1910,7 +1933,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { MACROBLOCK * const x = &cpi->mb; VP9_COMMON * const cm = &cpi->common; MACROBLOCKD * const xd = &x->e_mbd; - int totalrate; // fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n", // cpi->common.current_video_frame, cpi->common.show_frame, @@ -1926,8 +1948,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { } #endif - totalrate = 0; - vp9_zero(cm->counts.switchable_interp); vp9_zero(cpi->tx_stepdown_count); @@ -1989,7 +2009,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_tile_init(&tile, cm, tile_row, tile_col); for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end; mi_row += 8) - encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate); + encode_sb_row(cpi, &tile, mi_row, &tp); cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); @@ -2015,10 +2035,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { cpi->sf.skip_encode_frame = 0; } - // 256 rate units to the bit, - // projected_frame_size in units of BYTES - cpi->projected_frame_size = totalrate >> 8; - #if 0 // Keep record of the total distortion this time around for future use cpi->last_frame_distortion = cpi->frame_distortion; @@ -2395,13 +2411,17 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, MODE_INFO **mi_8x8 = xd->mi_8x8; MODE_INFO *mi = mi_8x8[0]; MB_MODE_INFO *mbmi = &mi->mbmi; + PICK_MODE_CONTEXT *ctx = get_block_context(x, bsize); unsigned int segment_id = mbmi->segment_id; const int mis = cm->mode_info_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; + x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8; + x->skip_optimize = ctx->is_coded; + ctx->is_coded = 1; x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct; x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame && - xd->q_index < QIDX_SKIP_THRESH); + x->q_index < QIDX_SKIP_THRESH); if (x->skip_encode) return; @@ -2487,7 +2507,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, (mbmi->skip_coeff || vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) { const uint8_t context = vp9_get_pred_context_tx_size(xd); - ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size]; + ++get_tx_counts(max_txsize_lookup[bsize], + context, &cm->counts.tx)[mbmi->tx_size]; } else { int x, y; TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode]; diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c index 75ed8ea..a85ddee 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libvpx/vp9/encoder/vp9_encodemb.c @@ -136,14 +136,13 @@ static void optimize_b(MACROBLOCK *mb, const int16_t *scan, *nb; const int mul = 1 + (tx_size == TX_32X32); uint8_t token_cache[1024]; - const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block); const int16_t *dequant_ptr = pd->dequant; const uint8_t *const band_translate = get_band_translate(tx_size); assert((!type && !plane) || (type && plane)); dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block); qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); - get_scan(xd, tx_size, type, ib, &scan, &nb); + get_scan(xd, tx_size, type, block, &scan, &nb); assert(eob <= default_eob); /* Now set up a Viterbi trellis to evaluate alternative roundings. */ @@ -179,7 +178,7 @@ static void optimize_b(MACROBLOCK *mb, t0 = (vp9_dct_value_tokens_ptr + x)->token; /* Consider both possible successor states. */ if (next < default_eob) { - band = get_coef_band(band_translate, i + 1); + band = band_translate[i + 1]; pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); rate0 += mb->token_costs[tx_size][type][ref][band][0][pt] @@ -230,7 +229,7 @@ static void optimize_b(MACROBLOCK *mb, t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token; } if (next < default_eob) { - band = get_coef_band(band_translate, i + 1); + band = band_translate[i + 1]; if (t0 != DCT_EOB_TOKEN) { pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt] @@ -264,7 +263,7 @@ static void optimize_b(MACROBLOCK *mb, /* There's no choice to make for a zero coefficient, so we don't * add a new trellis node, but we do need to update the costs. */ - band = get_coef_band(band_translate, i + 1); + band = band_translate[i + 1]; t0 = tokens[next][0].token; t1 = tokens[next][1].token; /* Update the cost of each path if we're past the EOB token. */ @@ -284,7 +283,7 @@ static void optimize_b(MACROBLOCK *mb, } /* Now pick the best path through the whole trellis. */ - band = get_coef_band(band_translate, i + 1); + band = band_translate[i + 1]; pt = combine_entropy_contexts(*a, *l); rate0 = tokens[next][0].rate; rate1 = tokens[next][1].rate; @@ -420,28 +419,30 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx *const ctx = args->ctx; struct macroblockd_plane *const pd = &xd->plane[plane]; - const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, - block); - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block, - pd->dst.buf, pd->dst.stride); + int i, j; + uint8_t *dst; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); + dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i]; // TODO(jingning): per transformed block zero forcing only enabled for // luma component. will integrate chroma components as well. if (x->zcoeff_blk[tx_size][block] && plane == 0) { - int i, j; pd->eobs[block] = 0; - txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); ctx->ta[plane][i] = 0; ctx->tl[plane][j] = 0; return; } - vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); + if (!x->skip_recode) + vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); - if (x->optimize) + if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx); + } else { + ctx->ta[plane][i] = pd->eobs[block] > 0; + ctx->tl[plane][j] = pd->eobs[block] > 0; + } if (x->skip_encode || pd->eobs[block] == 0) return; @@ -505,9 +506,10 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { struct optimize_ctx ctx; struct encode_b_args arg = {x, &ctx}; - vp9_subtract_sb(x, bsize); + if (!x->skip_recode) + vp9_subtract_sb(x, bsize); - if (x->optimize) { + if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { int i; for (i = 0; i < MAX_MB_PLANE; ++i) optimize_init_b(i, bsize, &arg); @@ -552,19 +554,22 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 32 * (block & twmask); yoff = 32 * (block >> twl); dst = pd->dst.buf + yoff * pd->dst.stride + xoff; - src = p->src.buf + yoff * p->src.stride + xoff; - src_diff = p->src_diff + 4 * bw * yoff + xoff; vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode, dst, pd->dst.stride, dst, pd->dst.stride); - vp9_subtract_block(32, 32, src_diff, bw * 4, - src, p->src.stride, dst, pd->dst.stride); - if (x->use_lp32x32fdct) - vp9_fdct32x32_rd(src_diff, coeff, bw * 4); - else - vp9_fdct32x32(src_diff, coeff, bw * 4); - vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + + if (!x->skip_recode) { + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_subtract_block(32, 32, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + if (x->use_lp32x32fdct) + vp9_fdct32x32_rd(src_diff, coeff, bw * 4); + else + vp9_fdct32x32(src_diff, coeff, bw * 4); + vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + } if (!x->skip_encode && *eob) vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob); break; @@ -577,16 +582,18 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 16 * (block & twmask); yoff = 16 * (block >> twl); dst = pd->dst.buf + yoff * pd->dst.stride + xoff; - src = p->src.buf + yoff * p->src.stride + xoff; - src_diff = p->src_diff + 4 * bw * yoff + xoff; vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode, dst, pd->dst.stride, dst, pd->dst.stride); - vp9_subtract_block(16, 16, src_diff, bw * 4, - src, p->src.stride, dst, pd->dst.stride); - vp9_fht16x16(tx_type, src_diff, coeff, bw * 4); - vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + if (!x->skip_recode) { + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_subtract_block(16, 16, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + vp9_fht16x16(tx_type, src_diff, coeff, bw * 4); + vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + } if (!x->skip_encode && *eob) vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob); break; @@ -599,16 +606,18 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 8 * (block & twmask); yoff = 8 * (block >> twl); dst = pd->dst.buf + yoff * pd->dst.stride + xoff; - src = p->src.buf + yoff * p->src.stride + xoff; - src_diff = p->src_diff + 4 * bw * yoff + xoff; vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode, dst, pd->dst.stride, dst, pd->dst.stride); - vp9_subtract_block(8, 8, src_diff, bw * 4, - src, p->src.stride, dst, pd->dst.stride); - vp9_fht8x8(tx_type, src_diff, coeff, bw * 4); - vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + if (!x->skip_recode) { + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_subtract_block(8, 8, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + vp9_fht8x8(tx_type, src_diff, coeff, bw * 4); + vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + } if (!x->skip_encode && *eob) vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob); break; @@ -624,19 +633,23 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 4 * (block & twmask); yoff = 4 * (block >> twl); dst = pd->dst.buf + yoff * pd->dst.stride + xoff; - src = p->src.buf + yoff * p->src.stride + xoff; - src_diff = p->src_diff + 4 * bw * yoff + xoff; vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode, dst, pd->dst.stride, dst, pd->dst.stride); - vp9_subtract_block(4, 4, src_diff, bw * 4, - src, p->src.stride, dst, pd->dst.stride); - if (tx_type != DCT_DCT) - vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type); - else - x->fwd_txm4x4(src_diff, coeff, bw * 4); - vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + + if (!x->skip_recode) { + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_subtract_block(4, 4, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + if (tx_type != DCT_DCT) + vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type); + else + x->fwd_txm4x4(src_diff, coeff, bw * 4); + vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + } + if (!x->skip_encode && *eob) { if (tx_type == DCT_DCT) // this is like vp9_short_idct4x4 but has a special case around eob<=1 diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c index e2c6c4c..030ca64 100644 --- a/libvpx/vp9/encoder/vp9_encodemv.c +++ b/libvpx/vp9/encoder/vp9_encodemv.c @@ -155,9 +155,8 @@ static void counts_to_nmv_context( unsigned int (*branch_ct_class0_hp)[2], unsigned int (*branch_ct_hp)[2]) { int i, j, k; - vp9_tree_probs_from_distribution(vp9_mv_joint_tree, - branch_ct_joint, - nmv_count->joints, 0); + vp9_tree_probs_from_distribution(vp9_mv_joint_tree, branch_ct_joint, + nmv_count->joints); for (i = 0; i < 2; ++i) { const uint32_t s0 = nmv_count->comps[i].sign[0]; const uint32_t s1 = nmv_count->comps[i].sign[1]; @@ -166,10 +165,10 @@ static void counts_to_nmv_context( branch_ct_sign[i][1] = s1; vp9_tree_probs_from_distribution(vp9_mv_class_tree, branch_ct_classes[i], - nmv_count->comps[i].classes, 0); + nmv_count->comps[i].classes); vp9_tree_probs_from_distribution(vp9_mv_class0_tree, branch_ct_class0[i], - nmv_count->comps[i].class0, 0); + nmv_count->comps[i].class0); for (j = 0; j < MV_OFFSET_BITS; ++j) { const uint32_t b0 = nmv_count->comps[i].bits[j][0]; const uint32_t b1 = nmv_count->comps[i].bits[j][1]; @@ -182,11 +181,11 @@ static void counts_to_nmv_context( for (k = 0; k < CLASS0_SIZE; ++k) { vp9_tree_probs_from_distribution(vp9_mv_fp_tree, branch_ct_class0_fp[i][k], - nmv_count->comps[i].class0_fp[k], 0); + nmv_count->comps[i].class0_fp[k]); } vp9_tree_probs_from_distribution(vp9_mv_fp_tree, branch_ct_fp[i], - nmv_count->comps[i].fp, 0); + nmv_count->comps[i].fp); } if (usehp) { for (i = 0; i < 2; ++i) { diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c index 6a3555d..974c300 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libvpx/vp9/encoder/vp9_firstpass.c @@ -482,6 +482,10 @@ void vp9_first_pass(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; TileInfo tile; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + PICK_MODE_CONTEXT *ctx = &x->sb64_context; + int i; int recon_yoffset, recon_uvoffset; const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx]; @@ -525,6 +529,15 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_frame_init_quantizer(cpi); + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][1]; + pd[i].qcoeff = ctx->qcoeff_pbuf[i][1]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; + pd[i].eobs = ctx->eobs_pbuf[i][1]; + } + x->skip_recode = 0; + + // Initialise the MV cost table to the defaults // if( cm->current_video_frame == 0) // if ( 0 ) diff --git a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_onyx_if.c index f922f90..dd4705d 100644 --- a/libvpx/vp9/encoder/vp9_onyx_if.c +++ b/libvpx/vp9/encoder/vp9_onyx_if.c @@ -834,6 +834,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->adaptive_rd_thresh = 2; sf->recode_loop = 2; + sf->use_lp32x32fdct = 1; sf->mode_skip_start = 11; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; @@ -1436,90 +1437,121 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) { } while (++i <= MV_MAX); } +static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, + PICK_MODE_CONTEXT *ctx) { + int num_pix = num_4x4_blk << 4; + int i, k; + ctx->num_4x4_blk = num_4x4_blk; + CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, + vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (k = 0; k < 3; ++k) { + CHECK_MEM_ERROR(cm, ctx->coeff[i][k], + vpx_memalign(16, num_pix * sizeof(int16_t))); + CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k], + vpx_memalign(16, num_pix * sizeof(int16_t))); + CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k], + vpx_memalign(16, num_pix * sizeof(int16_t))); + CHECK_MEM_ERROR(cm, ctx->eobs[i][k], + vpx_memalign(16, num_pix * sizeof(uint16_t))); + ctx->coeff_pbuf[i][k] = ctx->coeff[i][k]; + ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k]; + ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k]; + ctx->eobs_pbuf[i][k] = ctx->eobs[i][k]; + } + } +} + +static void free_mode_context(PICK_MODE_CONTEXT *ctx) { + int i, k; + vpx_free(ctx->zcoeff_blk); + ctx->zcoeff_blk = 0; + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (k = 0; k < 3; ++k) { + vpx_free(ctx->coeff[i][k]); + ctx->coeff[i][k] = 0; + vpx_free(ctx->qcoeff[i][k]); + ctx->qcoeff[i][k] = 0; + vpx_free(ctx->dqcoeff[i][k]); + ctx->dqcoeff[i][k] = 0; + vpx_free(ctx->eobs[i][k]); + ctx->eobs[i][k] = 0; + } + } +} + static void init_pick_mode_context(VP9_COMP *cpi) { int i; - MACROBLOCK *x = &cpi->mb; - MACROBLOCKD *xd = &x->e_mbd; - VP9_COMMON *cm = &cpi->common; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + for (i = 0; i < BLOCK_SIZES; ++i) { const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; const int num_4x4_h = num_4x4_blocks_high_lookup[i]; const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); if (i < BLOCK_16X16) { - for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) { - for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) { - for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) { + for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { + for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) { + for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - ctx->num_4x4_blk = num_4x4_blk; - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, - vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + alloc_mode_context(cm, num_4x4_blk, ctx); } } } } else if (i < BLOCK_32X32) { - for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) { - for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk; - ++xd->mb_index) { + for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { + for (x->mb_index = 0; x->mb_index < 64 / num_4x4_blk; ++x->mb_index) { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); ctx->num_4x4_blk = num_4x4_blk; - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, - vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + alloc_mode_context(cm, num_4x4_blk, ctx); } } } else if (i < BLOCK_64X64) { - for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) { + for (x->sb_index = 0; x->sb_index < 256 / num_4x4_blk; ++x->sb_index) { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); ctx->num_4x4_blk = num_4x4_blk; - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, - vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + alloc_mode_context(cm, num_4x4_blk, ctx); } } else { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); ctx->num_4x4_blk = num_4x4_blk; - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, - vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + alloc_mode_context(cm, num_4x4_blk, ctx); } } } static void free_pick_mode_context(MACROBLOCK *x) { int i; - MACROBLOCKD *xd = &x->e_mbd; for (i = 0; i < BLOCK_SIZES; ++i) { const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; const int num_4x4_h = num_4x4_blocks_high_lookup[i]; const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); if (i < BLOCK_16X16) { - for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) { - for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) { - for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) { + for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { + for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) { + for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - vpx_free(ctx->zcoeff_blk); - ctx->zcoeff_blk = 0; + free_mode_context(ctx); } } } } else if (i < BLOCK_32X32) { - for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) { - for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk; - ++xd->mb_index) { + for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { + for (x->mb_index = 0; x->mb_index < 64 / num_4x4_blk; ++x->mb_index) { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - vpx_free(ctx->zcoeff_blk); - ctx->zcoeff_blk = 0; + free_mode_context(ctx); } } } else if (i < BLOCK_64X64) { - for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) { + for (x->sb_index = 0; x->sb_index < 256 / num_4x4_blk; ++x->sb_index) { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - vpx_free(ctx->zcoeff_blk); - ctx->zcoeff_blk = 0; + free_mode_context(ctx); } } else { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - vpx_free(ctx->zcoeff_blk); - ctx->zcoeff_blk = 0; + free_mode_context(ctx); } } } @@ -3404,7 +3436,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Post encode loop adjustment of Q prediction. if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop) ? 2 : 0); + vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop || + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0); + cpi->last_q[cm->frame_type] = cm->base_qindex; diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_onyx_int.h index 9429c7f..9e80212 100644 --- a/libvpx/vp9/encoder/vp9_onyx_int.h +++ b/libvpx/vp9/encoder/vp9_onyx_int.h @@ -312,7 +312,6 @@ typedef struct VP9_COMP { VP9_COMMON common; VP9_CONFIG oxcf; struct rdcost_block_args rdcost_stack; - struct lookahead_ctx *lookahead; struct lookahead_entry *source; #if CONFIG_MULTIPLE_ARF diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c index fca7525..d24be96 100644 --- a/libvpx/vp9/encoder/vp9_quantize.c +++ b/libvpx/vp9/encoder/vp9_quantize.c @@ -22,7 +22,7 @@ extern int enc_debug; #endif -void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, +void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -30,58 +30,44 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, rc, eob; - int zbins[2], nzbins[2], zbin; - int x, y, z, sz; - int zero_flag = n_coeffs; + int i, non_zero_count = count, eob = -1; + const int zbins[2] = { zbin_ptr[0] + zbin_oq_value, + zbin_ptr[1] + zbin_oq_value }; + const int nzbins[2] = { zbins[0] * -1, + zbins[1] * -1 }; - vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); - vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); - - eob = -1; - - // Base ZBIN - zbins[0] = zbin_ptr[0] + zbin_oq_value; - zbins[1] = zbin_ptr[1] + zbin_oq_value; - nzbins[0] = zbins[0] * -1; - nzbins[1] = zbins[1] * -1; + vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); if (!skip_block) { // Pre-scan pass - for (i = n_coeffs - 1; i >= 0; i--) { - rc = scan[i]; - z = coeff_ptr[rc]; + for (i = count - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - if (z < zbins[rc != 0] && z > nzbins[rc != 0]) { - zero_flag--; - } else { + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else break; - } } // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. - for (i = 0; i < zero_flag; i++) { - rc = scan[i]; - z = coeff_ptr[rc]; - - zbin = (zbins[rc != 0]); - - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; - - if (x >= zbin) { - x += (round_ptr[rc != 0]); - x = clamp(x, INT16_MIN, INT16_MAX); - y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) * - quant_shift_ptr[rc != 0]) >> 16; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value - - if (y) { - eob = i; // last nonzero coeffs - } + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= zbins[rc != 0]) { + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> 16; // quantization + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + + if (tmp) + eob = i; } } } @@ -315,17 +301,17 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { SEG_LVL_SKIP); /* save this macroblock QIndex for vp9_update_zbin_extra() */ - x->e_mbd.q_index = qindex; + x->q_index = qindex; /* R/D setup */ cpi->mb.errorperbit = rdmult >> 6; cpi->mb.errorperbit += (cpi->mb.errorperbit == 0); - vp9_initialize_me_consts(cpi, xd->q_index); + vp9_initialize_me_consts(cpi, x->q_index); } void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) { - const int qindex = x->e_mbd.q_index; + const int qindex = x->q_index; const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] * (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] * diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c index 993919e..78cb06b 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libvpx/vp9/encoder/vp9_rdopt.c @@ -246,6 +246,10 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { vp9_set_speed_features(cpi); + cpi->mb.select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL && + cm->frame_type != KEY_FRAME) ? + 0 : 1; + set_block_thresholds(cpi); fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs); @@ -268,10 +272,10 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { MB_PREDICTION_MODE m; for (m = NEARESTMV; m < MB_MODE_COUNT; m++) - cpi->mb.inter_mode_cost[i][inter_mode_offset(m)] = + cpi->mb.inter_mode_cost[i][INTER_OFFSET(m)] = cost_token(vp9_inter_mode_tree, cm->fc.inter_mode_probs[i], - &vp9_inter_mode_encodings[inter_mode_offset(m)]); + &vp9_inter_mode_encodings[INTER_OFFSET(m)]); } } } @@ -609,7 +613,7 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, // TODO(jingning): temporarily enabled only for luma component rd = MIN(rd1, rd2); - if (plane == 0) + if (!xd->lossless && plane == 0) x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block]; args->this_rate += args->rate; @@ -740,7 +744,7 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int n, m; int s0, s1; - const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]); + const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs); for (n = TX_4X4; n <= max_tx_size; n++) { r[n][1] = r[n][0]; @@ -845,7 +849,7 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00}; // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00}; - const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]); + const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs); // for (n = TX_4X4; n <= max_txfm_size; n++) // r[n][0] = (r[n][0] * scale_r[n]); @@ -1326,6 +1330,7 @@ static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x, } static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, + PICK_MODE_CONTEXT *ctx, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize) { @@ -1361,6 +1366,27 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, *rate_tokenonly = this_rate_tokenonly; *distortion = this_distortion; *skippable = s; + if (!x->select_txfm_size) { + int i; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = x->e_mbd.plane; + for (i = 1; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][2]; + pd[i].qcoeff = ctx->qcoeff_pbuf[i][2]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2]; + pd[i].eobs = ctx->eobs_pbuf[i][2]; + + ctx->coeff_pbuf[i][2] = ctx->coeff_pbuf[i][0]; + ctx->qcoeff_pbuf[i][2] = ctx->qcoeff_pbuf[i][0]; + ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0]; + ctx->eobs_pbuf[i][2] = ctx->eobs_pbuf[i][0]; + + ctx->coeff_pbuf[i][0] = p[i].coeff; + ctx->qcoeff_pbuf[i][0] = pd[i].qcoeff; + ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff; + ctx->eobs_pbuf[i][0] = pd[i].eobs; + } + } } } @@ -1386,8 +1412,9 @@ static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, return this_rd; } -static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize, - int *rate_uv, int *rate_uv_tokenonly, +static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, + BLOCK_SIZE bsize, int *rate_uv, + int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv, MB_PREDICTION_MODE *mode_uv) { MACROBLOCK *const x = &cpi->mb; @@ -1400,7 +1427,7 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize, // Else do a proper rd search for each possible transform size that may // be considered in the main rd loop. } else { - rd_pick_intra_sbuv_mode(cpi, x, + rd_pick_intra_sbuv_mode(cpi, x, ctx, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); } @@ -1416,7 +1443,7 @@ static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode, // Don't account for mode here if segment skip is enabled. if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { assert(is_inter_mode(mode)); - return x->inter_mode_cost[mode_context][inter_mode_offset(mode)]; + return x->inter_mode_cost[mode_context][INTER_OFFSET(mode)]; } else { return 0; } @@ -1707,7 +1734,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, const struct buf_2d orig_src = x->plane[0].src; struct buf_2d orig_pre[2]; - mode_idx = inter_mode_offset(this_mode); + mode_idx = INTER_OFFSET(this_mode); bsi->rdstat[i][mode_idx].brdcost = INT64_MAX; // if we're near/nearest and mv == 0,0, compare to zeromv @@ -1901,6 +1928,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost, x->mvcost, cpi); + bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int; if (num_4x4_blocks_wide > 1) bsi->rdstat[i + 1][mode_idx].mvs[0].as_int = @@ -2002,7 +2030,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, return; } - mode_idx = inter_mode_offset(mode_selected); + mode_idx = INTER_OFFSET(mode_selected); vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above)); vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left)); @@ -2078,7 +2106,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, return INT64_MAX; /* set it to the best */ for (i = 0; i < 4; i++) { - mode_idx = inter_mode_offset(bsi->modes[i]); + mode_idx = INTER_OFFSET(bsi->modes[i]); mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int; if (has_second_ref(mbmi)) mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int; @@ -2477,54 +2505,41 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize); MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - int refs[2] = { mbmi->ref_frame[0], - (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; + const int refs[2] = { mbmi->ref_frame[0], + mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] }; int_mv ref_mv[2]; const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); - int ite; + int ite, ref; // Prediction buffer from second frame. uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t)); // Do joint motion search in compound mode to get more accurate mv. - struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; - struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}}; - struct buf_2d scaled_first_yv12; + struct buf_2d backup_yv12[2][MAX_MB_PLANE]; + struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0]; int last_besterr[2] = {INT_MAX, INT_MAX}; - YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL}; - scaled_ref_frame[0] = get_scaled_ref_frame(cpi, mbmi->ref_frame[0]); - scaled_ref_frame[1] = get_scaled_ref_frame(cpi, mbmi->ref_frame[1]); - - ref_mv[0] = mbmi->ref_mvs[refs[0]][0]; - ref_mv[1] = mbmi->ref_mvs[refs[1]][0]; - - if (scaled_ref_frame[0]) { - int i; - // Swap out the reference frame for a version that's been scaled to - // match the resolution of the current frame, allowing the existing - // motion search code to be used without additional modifications. - for (i = 0; i < MAX_MB_PLANE; i++) - backup_yv12[i] = xd->plane[i].pre[0]; - setup_pre_planes(xd, 0, scaled_ref_frame[0], mi_row, mi_col, NULL); - } + YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = { + get_scaled_ref_frame(cpi, mbmi->ref_frame[0]), + get_scaled_ref_frame(cpi, mbmi->ref_frame[1]) + }; - if (scaled_ref_frame[1]) { - int i; - for (i = 0; i < MAX_MB_PLANE; i++) - backup_second_yv12[i] = xd->plane[i].pre[1]; + for (ref = 0; ref < 2; ++ref) { + ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0]; + + if (scaled_ref_frame[ref]) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[ref][i] = xd->plane[i].pre[ref]; + setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, NULL); + } - setup_pre_planes(xd, 1, scaled_ref_frame[1], mi_row, mi_col, NULL); + xd->scale_factor[ref].sfc->set_scaled_offsets(&xd->scale_factor[ref], + mi_row, mi_col); + frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int; } - xd->scale_factor[0].sfc->set_scaled_offsets(&xd->scale_factor[0], - mi_row, mi_col); - xd->scale_factor[1].sfc->set_scaled_offsets(&xd->scale_factor[1], - mi_row, mi_col); - scaled_first_yv12 = xd->plane[0].pre[0]; - - // Initialize mv using single prediction mode result. - frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; - frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; - // Allow joint search multiple times iteratively for each ref frame // and break out the search loop if it couldn't find better mv. for (ite = 0; ite < 4; ite++) { @@ -2604,24 +2619,20 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } } - // restore the predictor - if (scaled_ref_frame[0]) { - int i; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[0] = backup_yv12[i]; - } + *rate_mv = 0; - if (scaled_ref_frame[1]) { - int i; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[1] = backup_second_yv12[i]; + for (ref = 0; ref < 2; ++ref) { + if (scaled_ref_frame[ref]) { + // restore the predictor + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[ref] = backup_yv12[ref][i]; + } + + *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv, + &mbmi->ref_mvs[refs[ref]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } - *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, - &mbmi->ref_mvs[refs[0]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv, - &mbmi->ref_mvs[refs[1]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); vpx_free(second_pred); } @@ -3046,6 +3057,30 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, return this_rd; // if 0, this will be re-calculated by caller } +static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, + int max_plane) { + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = x->e_mbd.plane; + int i; + + for (i = 0; i < max_plane; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][1]; + pd[i].qcoeff = ctx->qcoeff_pbuf[i][1]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; + pd[i].eobs = ctx->eobs_pbuf[i][1]; + + ctx->coeff_pbuf[i][1] = ctx->coeff_pbuf[i][0]; + ctx->qcoeff_pbuf[i][1] = ctx->qcoeff_pbuf[i][0]; + ctx->dqcoeff_pbuf[i][1] = ctx->dqcoeff_pbuf[i][0]; + ctx->eobs_pbuf[i][1] = ctx->eobs_pbuf[i][0]; + + ctx->coeff_pbuf[i][0] = p[i].coeff; + ctx->qcoeff_pbuf[i][0] = pd[i].qcoeff; + ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff; + ctx->eobs_pbuf[i][0] = pd[i].eobs; + } +} + void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int *returnrate, int64_t *returndist, BLOCK_SIZE bsize, @@ -3065,7 +3100,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *returnrate = INT_MAX; return; } - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, + rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly, &dist_uv, &uv_skip, bsize); } else { y_skip = 0; @@ -3074,7 +3109,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *returnrate = INT_MAX; return; } - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, + rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly, &dist_uv, &uv_skip, BLOCK_8X8); } @@ -3157,7 +3192,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const int bhs = num_8x8_blocks_high_lookup[bsize] / 2; int best_skip2 = 0; - x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; + x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; // Everywhere the flag is set the error is much higher than its neighbors. ctx->frames_with_high_error = 0; @@ -3196,8 +3231,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, case BLOCK_32X32: for (i = 0; i < 4; i++) { ref_frame_mask |= - x->mb_context[xd->sb_index][i].frames_with_high_error; - mode_mask |= x->mb_context[xd->sb_index][i].modes_with_high_error; + x->mb_context[x->sb_index][i].frames_with_high_error; + mode_mask |= x->mb_context[x->sb_index][i].modes_with_high_error; } break; default: @@ -3440,7 +3475,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]); if (rate_uv_intra[uv_tx] == INT_MAX) { - choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx], + choose_intra_uv_mode(cpi, ctx, bsize, &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]); @@ -3574,6 +3609,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Did this mode help.. i.e. is it the new best mode if (this_rd < best_rd || x->skip) { + int max_plane = MAX_MB_PLANE; if (!mode_excluded) { // Note index of best mode so far best_mode_index = mode_index; @@ -3581,6 +3617,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (ref_frame == INTRA_FRAME) { /* required for left and above block mv */ mbmi->mv[0].as_int = 0; + max_plane = 1; } *returnrate = rate2; @@ -3588,6 +3625,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_rd = this_rd; best_mbmode = *mbmi; best_skip2 = this_skip2; + if (!x->select_txfm_size) + swap_block_ptr(x, ctx, max_plane); vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], sizeof(uint8_t) * ctx->num_4x4_blk); @@ -3694,7 +3733,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Do Intra UV best rd mode selection if best mode choice above was intra. if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) { TX_SIZE uv_tx_size = get_uv_tx_size(mbmi); - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size], + rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size], &rate_uv_tokenonly[uv_tx_size], &dist_uv[uv_tx_size], &skip_uv[uv_tx_size], @@ -3850,7 +3889,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, b_mode_info best_bmodes[4]; int best_skip2 = 0; - x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; + x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4); for (i = 0; i < 4; i++) { @@ -4063,7 +4102,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, distortion2 += distortion_y; if (rate_uv_intra[TX_4X4] == INT_MAX) { - choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4], + choose_intra_uv_mode(cpi, ctx, bsize, &rate_uv_intra[TX_4X4], &rate_uv_tokenonly[TX_4X4], &dist_uv[TX_4X4], &skip_uv[TX_4X4], &mode_uv[TX_4X4]); @@ -4317,12 +4356,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // Did this mode help.. i.e. is it the new best mode if (this_rd < best_rd || x->skip) { if (!mode_excluded) { + int max_plane = MAX_MB_PLANE; // Note index of best mode so far best_mode_index = mode_index; if (ref_frame == INTRA_FRAME) { /* required for left and above block mv */ mbmi->mv[0].as_int = 0; + max_plane = 1; } *returnrate = rate2; @@ -4332,6 +4373,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv); best_mbmode = *mbmi; best_skip2 = this_skip2; + if (!x->select_txfm_size) + swap_block_ptr(x, ctx, max_plane); vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], sizeof(uint8_t) * ctx->num_4x4_blk); @@ -4438,7 +4481,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // Do Intra UV best rd mode selection if best mode choice above was intra. if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) { TX_SIZE uv_tx_size = get_uv_tx_size(mbmi); - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size], + rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size], &rate_uv_tokenonly[uv_tx_size], &dist_uv[uv_tx_size], &skip_uv[uv_tx_size], diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c index 7d4676e..c7336d0 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.c +++ b/libvpx/vp9/encoder/vp9_tokenize.c @@ -57,7 +57,7 @@ static void fill_value_tokens() { // initialize the cost for extra bits for all possible coefficient value. { int cost = 0; - const vp9_extra_bit *p = vp9_extra_bits + t[i].token; + const vp9_extra_bit *p = &vp9_extra_bits[t[i].token]; if (p->base_val) { const int extra = t[i].extra; @@ -73,7 +73,7 @@ static void fill_value_tokens() { } while (++i < DCT_MAX_VALUE); vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE; - vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE; + vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE; } struct tokenize_b_args { @@ -127,7 +127,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, get_scan(xd, tx_size, type, block, &scan, &nb); c = 0; do { - const int band = get_coef_band(band_translate, c); + const int band = band_translate[c]; int token; int v = 0; rc = scan[c]; diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c index dc11501..fefca66 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c @@ -206,12 +206,12 @@ void fadst4_1d_sse2(__m128i *in) { const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u[8], v[8]; __m128i in7 = _mm_add_epi16(in[0], in[1]); - in7 = _mm_sub_epi16(in7, in[3]); u[0] = _mm_unpacklo_epi16(in[0], in[1]); u[1] = _mm_unpacklo_epi16(in[2], in[3]); u[2] = _mm_unpacklo_epi16(in7, kZero); u[3] = _mm_unpacklo_epi16(in[2], kZero); + u[4] = _mm_unpacklo_epi16(in[3], kZero); v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 @@ -219,9 +219,10 @@ void fadst4_1d_sse2(__m128i *in) { v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 + v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = v[2]; + u[1] = _mm_sub_epi32(v[2], v[6]); u[2] = _mm_add_epi32(v[3], v[4]); u[3] = _mm_sub_epi32(u[2], u[0]); u[4] = _mm_slli_epi32(v[5], 2); diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm index 533456b..1a9e4e8 100644 --- a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm +++ b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm @@ -118,6 +118,14 @@ SECTION .text RET %endmacro +%macro INC_SRC_BY_SRC_STRIDE 0 +%if ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp +%else + add srcq, src_strideq +%endif +%endmacro + %macro SUBPEL_VARIANCE 1-2 0 ; W %if cpuflag(ssse3) %define bilin_filter_m bilin_filter_m_ssse3 @@ -129,41 +137,85 @@ SECTION .text ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses ; 11, not 13, if the registers are ordered correctly. May make a minor speed ; difference on Win64 -%ifdef PIC -%if %2 == 1 ; avg -cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse -%define sec_str sec_strideq -%else -cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \ - dst, dst_stride, height, sse -%endif -%define h heightd -%define bilin_filter sseq -%else -%if %2 == 1 ; avg -cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse -%if ARCH_X86_64 -%define h heightd -%define sec_str sec_strideq -%else -%define h dword heightm -%define sec_str sec_stridemp -%endif + +%ifdef PIC ; 64bit PIC + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse + %endif + %define h heightd + %define bilin_filter sseq %else -cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ - dst, dst_stride, height, sse -%define h heightd -%endif -%define bilin_filter bilin_filter_m + %if ARCH_X86=1 && CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse, g_bilin_filter, g_pw_8 + %define h dword heightm + %define sec_str sec_stridemp + + ;Store bilin_filter and pw_8 location in stack + GET_GOT eax + add esp, 4 ; restore esp + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse, \ + g_bilin_filter, g_pw_8 + %define h heightd + + ;Store bilin_filter and pw_8 location in stack + GET_GOT eax + add esp, 4 ; restore esp + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %endif + %else + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ + 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse + %if ARCH_X86_64 + %define h heightd + %define sec_str sec_strideq + %else + %define h dword heightm + %define sec_str sec_stridemp + %endif + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse + %define h heightd + %endif + + %define bilin_filter bilin_filter_m + %endif %endif + ASSERT %1 <= 16 ; m6 overflows if w > 16 pxor m6, m6 ; sum pxor m7, m7 ; sse @@ -329,11 +381,22 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %define filter_y_b m9 %define filter_rnd m10 %else ; x86-32 or mmx +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] %define filter_rnd [pw_8] %endif +%endif + .x_zero_y_other_loop: %if %1 == 16 movu m0, [srcq] @@ -615,12 +678,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 +%else ;x86_32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] %else add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] %define filter_rnd [pw_8] %endif +%endif + %if %1 == 16 movu m0, [srcq] movu m3, [srcq+1] @@ -752,12 +826,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +;y_offset == 0. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] %else add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] %define filter_rnd [pw_8] %endif +%endif + .x_other_y_zero_loop: %if %1 == 16 movu m0, [srcq] @@ -873,12 +958,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] %else add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] %define filter_rnd [pw_8] %endif +%endif + %if %1 == 16 movu m0, [srcq] movu m1, [srcq+1] @@ -1057,6 +1153,21 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %define filter_y_a m10 %define filter_y_b m11 %define filter_rnd m12 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. +%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] %else add x_offsetq, bilin_filter add y_offsetq, bilin_filter @@ -1066,6 +1177,8 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %define filter_y_b [y_offsetq+16] %define filter_rnd [pw_8] %endif +%endif + ; x_offset == bilin interpolation && y_offset == bilin interpolation %if %1 == 16 movu m0, [srcq] @@ -1093,7 +1206,9 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %endif psraw m0, 4 psraw m2, 4 - add srcq, src_strideq + + INC_SRC_BY_SRC_STRIDE + packuswb m0, m2 .x_other_y_other_loop: %if cpuflag(ssse3) @@ -1163,7 +1278,7 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 - add srcq, src_strideq + INC_SRC_BY_SRC_STRIDE add dstq, dst_strideq %else ; %1 < 16 movh m0, [srcq] @@ -1184,12 +1299,17 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %if cpuflag(ssse3) packuswb m0, m0 %endif - add srcq, src_strideq + + INC_SRC_BY_SRC_STRIDE + .x_other_y_other_loop: movh m2, [srcq] movh m1, [srcq+1] - movh m4, [srcq+src_strideq] - movh m3, [srcq+src_strideq+1] + + INC_SRC_BY_SRC_STRIDE + movh m4, [srcq] + movh m3, [srcq+1] + %if cpuflag(ssse3) punpcklbw m2, m1 punpcklbw m4, m3 @@ -1253,7 +1373,7 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 - lea srcq, [srcq+src_strideq*2] + INC_SRC_BY_SRC_STRIDE lea dstq, [dstq+dst_strideq*2] %endif %if %2 == 1 ; avg |