path: root/libvpx/vp9/encoder
author     hkuang <hkuang@google.com>  2013-09-16 15:09:58 -0700
committer  Hangyu Kuang <hkuang@google.com>  2013-09-17 22:05:28 +0000
commit     1184aebb761cbeac9124c37189a80a1a58f04b6b (patch)
tree       b1ce6b3d29c43ffd22eb18999c5c3bad26513a48 /libvpx/vp9/encoder
parent     f3bed9137f66ef693bd406e43b17e9a1114f1e14 (diff)
Roll latest libvpx into Android.
The latest libvpx has more NEON optimizations and many algorithm optimizations that make vp9 decoding much faster.

bug:10804666
Change-Id: I75eaacea57ecc7542a780be778f0e9e157978524
(cherry picked from commit 3df0563f1b24dac6c0bd122fc922a48211269061)
Diffstat (limited to 'libvpx/vp9/encoder')
-rw-r--r--  libvpx/vp9/encoder/vp9_bitstream.c            |  525
-rw-r--r--  libvpx/vp9/encoder/vp9_block.h                |   24
-rw-r--r--  libvpx/vp9/encoder/vp9_dct.c                  |   76
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeframe.c          | 1512
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeintra.c          |   12
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeintra.h          |   10
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemb.c             |  226
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemb.h             |   48
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemv.c             |  177
-rw-r--r--  libvpx/vp9/encoder/vp9_firstpass.c            |  161
-rw-r--r--  libvpx/vp9/encoder/vp9_mbgraph.c              |   20
-rw-r--r--  libvpx/vp9/encoder/vp9_mcomp.c                | 1655
-rw-r--r--  libvpx/vp9/encoder/vp9_mcomp.h                |   75
-rw-r--r--  libvpx/vp9/encoder/vp9_modecosts.c            |   16
-rw-r--r--  libvpx/vp9/encoder/vp9_onyx_if.c              |  894
-rw-r--r--  libvpx/vp9/encoder/vp9_onyx_int.h             |  130
-rw-r--r--  libvpx/vp9/encoder/vp9_picklpf.c              |   68
-rw-r--r--  libvpx/vp9/encoder/vp9_picklpf.h              |    3
-rw-r--r--  libvpx/vp9/encoder/vp9_quantize.c             |  163
-rw-r--r--  libvpx/vp9/encoder/vp9_ratectrl.c             |   20
-rw-r--r--  libvpx/vp9/encoder/vp9_rdopt.c                | 1710
-rw-r--r--  libvpx/vp9/encoder/vp9_rdopt.h                |    6
-rw-r--r--  libvpx/vp9/encoder/vp9_segmentation.c         |  113
-rw-r--r--  libvpx/vp9/encoder/vp9_temporal_filter.c      |   19
-rw-r--r--  libvpx/vp9/encoder/vp9_tokenize.c             |  147
-rw-r--r--  libvpx/vp9/encoder/vp9_tokenize.h             |    8
-rw-r--r--  libvpx/vp9/encoder/vp9_variance_c.c           |  154
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c    | 2650
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_dct_sse2.c         | 1232
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm |   28
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_subpel_variance.asm |   12
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm |   16
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_variance_sse2.c    |    8
33 files changed, 6534 insertions, 5384 deletions
diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c
index 98ef420..957cfd2 100644
--- a/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/libvpx/vp9/encoder/vp9_bitstream.c
@@ -41,9 +41,9 @@ unsigned __int64 Sectionbits[500];
#endif
#ifdef ENTROPY_STATS
-int intra_mode_stats[VP9_INTRA_MODES]
- [VP9_INTRA_MODES]
- [VP9_INTRA_MODES];
+int intra_mode_stats[INTRA_MODES]
+ [INTRA_MODES]
+ [INTRA_MODES];
vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES];
extern unsigned int active_section;
@@ -54,8 +54,8 @@ extern unsigned int active_section;
int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES];
int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1];
int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-int64_t switchable_interp_stats[VP9_SWITCHABLE_FILTERS+1]
- [VP9_SWITCHABLE_FILTERS];
+int64_t switchable_interp_stats[SWITCHABLE_FILTERS+1]
+ [SWITCHABLE_FILTERS];
void init_tx_count_stats() {
vp9_zero(tx_count_32x32p_stats);
@@ -88,8 +88,8 @@ static void update_tx_count_stats(VP9_COMMON *cm) {
static void update_switchable_interp_stats(VP9_COMMON *cm) {
int i, j;
- for (i = 0; i < VP9_SWITCHABLE_FILTERS+1; ++i)
- for (j = 0; j < VP9_SWITCHABLE_FILTERS; ++j) {
+ for (i = 0; i < SWITCHABLE_FILTERS+1; ++i)
+ for (j = 0; j < SWITCHABLE_FILTERS; ++j) {
switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j];
}
}
@@ -141,11 +141,11 @@ void write_switchable_interp_stats() {
fclose(fp);
printf(
- "vp9_default_switchable_filter_count[VP9_SWITCHABLE_FILTERS+1]"
- "[VP9_SWITCHABLE_FILTERS] = {\n");
- for (i = 0; i < VP9_SWITCHABLE_FILTERS+1; i++) {
+ "vp9_default_switchable_filter_count[SWITCHABLE_FILTERS+1]"
+ "[SWITCHABLE_FILTERS] = {\n");
+ for (i = 0; i < SWITCHABLE_FILTERS+1; i++) {
printf(" { ");
- for (j = 0; j < VP9_SWITCHABLE_FILTERS; j++) {
+ for (j = 0; j < SWITCHABLE_FILTERS; j++) {
printf("%"PRId64", ", switchable_interp_stats[i][j]);
}
printf("},\n");
@@ -181,7 +181,7 @@ static void update_mode(
n--;
for (i = 0; i < n; ++i) {
- vp9_cond_prob_diff_update(w, &Pcur[i], VP9_MODE_UPDATE_PROB, bct[i]);
+ vp9_cond_prob_diff_update(w, &Pcur[i], MODE_UPDATE_PROB, bct[i]);
}
}
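
update_mode() and the other probability writers below all lean on vp9_cond_prob_diff_update(): one bit coded with probability MODE_UPDATE_PROB says whether the probability changes at all, and only then is the new value sent as a delta from the old one. A minimal decoder-side sketch of that contract, assuming a vp9_reader whose vp9_read(r, prob) returns one arithmetic-decoded bit, and a read_prob_diff_update() helper mirroring vp9_write_prob_diff_update() (both names hypothetical here):

    /* Sketch only: inverse of vp9_cond_prob_diff_update(). vp9_read() and
     * read_prob_diff_update() are assumed counterparts of the writer calls. */
    static void cond_prob_diff_read(vp9_reader *r, vp9_prob *p) {
      if (vp9_read(r, MODE_UPDATE_PROB))    /* "is this prob updated?" bit */
        *p = read_prob_diff_update(r, *p);  /* delta from the old value */
    }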
@@ -189,19 +189,20 @@ static void update_mbintra_mode_probs(VP9_COMP* const cpi,
vp9_writer* const bc) {
VP9_COMMON *const cm = &cpi->common;
int j;
- vp9_prob pnew[VP9_INTRA_MODES - 1];
- unsigned int bct[VP9_INTRA_MODES - 1][2];
+ vp9_prob pnew[INTRA_MODES - 1];
+ unsigned int bct[INTRA_MODES - 1][2];
for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
- update_mode(bc, VP9_INTRA_MODES, vp9_intra_mode_tree, pnew,
+ update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, pnew,
cm->fc.y_mode_prob[j], bct,
(unsigned int *)cpi->y_mode_count[j]);
}
-static void write_selected_tx_size(const VP9_COMP *cpi, TX_SIZE tx_size,
- BLOCK_SIZE_TYPE bsize, vp9_writer *w) {
+static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m,
+ TX_SIZE tx_size, BLOCK_SIZE bsize,
+ vp9_writer *w) {
const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs);
+ const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs, m);
vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
if (bsize >= BLOCK_16X16 && tx_size != TX_4X4) {
vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
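
The hunk is cut off here, but the signalling is a truncated unary code over tx_probs[]: each bit rules out the next smaller transform size, and only blocks large enough to use a bigger transform spend the extra bit. A read-side sketch, under the assumption that a third bit (outside the visible context) separates TX_16X16 from TX_32X32 when bsize >= BLOCK_32X32:

    /* Sketch of the matching read: truncated unary over tx_probs[].
     * The third bit is assumed; it is not visible in the hunk above. */
    static TX_SIZE read_selected_tx_size(vp9_reader *r, BLOCK_SIZE bsize,
                                         const vp9_prob *tx_probs) {
      TX_SIZE tx_size = TX_4X4;
      if (vp9_read(r, tx_probs[0])) {                           /* != TX_4X4 */
        tx_size = TX_8X8;
        if (bsize >= BLOCK_16X16 && vp9_read(r, tx_probs[1])) { /* != TX_8X8 */
          tx_size = TX_16X16;
          if (bsize >= BLOCK_32X32 && vp9_read(r, tx_probs[2])) /* != TX_16X16 */
            tx_size = TX_32X32;
        }
      }
      return tx_size;
    }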
@@ -213,10 +214,10 @@ static void write_selected_tx_size(const VP9_COMP *cpi, TX_SIZE tx_size,
static int write_skip_coeff(const VP9_COMP *cpi, int segment_id, MODE_INFO *m,
vp9_writer *w) {
const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) {
+ if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
return 1;
} else {
- const int skip_coeff = m->mbmi.mb_skip_coeff;
+ const int skip_coeff = m->mbmi.skip_coeff;
vp9_write(w, skip_coeff, vp9_get_pred_prob_mbskip(&cpi->common, xd));
return skip_coeff;
}
@@ -228,7 +229,7 @@ void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) {
for (k = 0; k < MBSKIP_CONTEXTS; ++k)
vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k],
- VP9_MODE_UPDATE_PROB, cm->counts.mbskip[k]);
+ MODE_UPDATE_PROB, cm->counts.mbskip[k]);
}
static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
@@ -237,43 +238,43 @@ static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
static void update_switchable_interp_probs(VP9_COMP *const cpi,
vp9_writer* const bc) {
- VP9_COMMON *const pc = &cpi->common;
- unsigned int branch_ct[VP9_SWITCHABLE_FILTERS + 1]
- [VP9_SWITCHABLE_FILTERS - 1][2];
- vp9_prob new_prob[VP9_SWITCHABLE_FILTERS + 1][VP9_SWITCHABLE_FILTERS - 1];
+ VP9_COMMON *const cm = &cpi->common;
+ unsigned int branch_ct[SWITCHABLE_FILTERS + 1]
+ [SWITCHABLE_FILTERS - 1][2];
+ vp9_prob new_prob[SWITCHABLE_FILTERS + 1][SWITCHABLE_FILTERS - 1];
int i, j;
- for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+ for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
vp9_tree_probs_from_distribution(
vp9_switchable_interp_tree,
new_prob[j], branch_ct[j],
- pc->counts.switchable_interp[j], 0);
+ cm->counts.switchable_interp[j], 0);
}
- for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
- for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
- vp9_cond_prob_diff_update(bc, &pc->fc.switchable_interp_prob[j][i],
- VP9_MODE_UPDATE_PROB, branch_ct[j][i]);
+ for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
+ for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
+ vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
+ MODE_UPDATE_PROB, branch_ct[j][i]);
}
}
#ifdef MODE_STATS
if (!cpi->dummy_packing)
- update_switchable_interp_stats(pc);
+ update_switchable_interp_stats(cm);
#endif
}
-static void update_inter_mode_probs(VP9_COMMON *pc, vp9_writer* const bc) {
+static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) {
int i, j;
for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
- unsigned int branch_ct[VP9_INTER_MODES - 1][2];
- vp9_prob new_prob[VP9_INTER_MODES - 1];
+ unsigned int branch_ct[INTER_MODES - 1][2];
+ vp9_prob new_prob[INTER_MODES - 1];
vp9_tree_probs_from_distribution(vp9_inter_mode_tree,
new_prob, branch_ct,
- pc->counts.inter_mode[i], NEARESTMV);
+ cm->counts.inter_mode[i], NEARESTMV);
- for (j = 0; j < VP9_INTER_MODES - 1; ++j)
- vp9_cond_prob_diff_update(bc, &pc->fc.inter_mode_probs[i][j],
- VP9_MODE_UPDATE_PROB, branch_ct[j]);
+ for (j = 0; j < INTER_MODES - 1; ++j)
+ vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j],
+ MODE_UPDATE_PROB, branch_ct[j]);
}
}
@@ -356,39 +357,39 @@ static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
// This function encodes the reference frame
static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
- VP9_COMMON *const pc = &cpi->common;
+ VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *mi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO *mi = &xd->this_mi->mbmi;
const int segment_id = mi->segment_id;
- int seg_ref_active = vp9_segfeature_active(&xd->seg, segment_id,
+ int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
SEG_LVL_REF_FRAME);
// If segment level coding of this signal is disabled...
// or the segment allows multiple reference frame options
if (!seg_ref_active) {
// does the feature use compound prediction or not
// (if not specified at the frame/segment level)
- if (pc->comp_pred_mode == HYBRID_PREDICTION) {
+ if (cm->comp_pred_mode == HYBRID_PREDICTION) {
vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME,
- vp9_get_pred_prob_comp_inter_inter(pc, xd));
+ vp9_get_pred_prob_comp_inter_inter(cm, xd));
} else {
assert((mi->ref_frame[1] <= INTRA_FRAME) ==
- (pc->comp_pred_mode == SINGLE_PREDICTION_ONLY));
+ (cm->comp_pred_mode == SINGLE_PREDICTION_ONLY));
}
if (mi->ref_frame[1] > INTRA_FRAME) {
vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME,
- vp9_get_pred_prob_comp_ref_p(pc, xd));
+ vp9_get_pred_prob_comp_ref_p(cm, xd));
} else {
vp9_write(bc, mi->ref_frame[0] != LAST_FRAME,
- vp9_get_pred_prob_single_ref_p1(pc, xd));
+ vp9_get_pred_prob_single_ref_p1(cm, xd));
if (mi->ref_frame[0] != LAST_FRAME)
vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME,
- vp9_get_pred_prob_single_ref_p2(pc, xd));
+ vp9_get_pred_prob_single_ref_p2(cm, xd));
}
} else {
assert(mi->ref_frame[1] <= INTRA_FRAME);
- assert(vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME) ==
+ assert(vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ==
mi->ref_frame[0]);
}
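
For the single-reference branch above, the two writes form a small prefix code: 0 means LAST_FRAME, 10 means GOLDEN_FRAME, 11 means ALTREF_FRAME. A read-side sketch (hypothetical helper; p1 and p2 correspond to the vp9_get_pred_prob_single_ref_p1/p2 probabilities used by the writer):

    /* Sketch: inverse of the single-reference signalling above. */
    static MV_REFERENCE_FRAME read_single_ref(vp9_reader *r,
                                              vp9_prob p1, vp9_prob p2) {
      if (!vp9_read(r, p1))                       /* ref_frame[0] != LAST? */
        return LAST_FRAME;
      return vp9_read(r, p2) ? ALTREF_FRAME : GOLDEN_FRAME;
    }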
@@ -397,20 +398,20 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
}
static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
- VP9_COMMON *const pc = &cpi->common;
- const nmv_context *nmvc = &pc->fc.nmvc;
+ VP9_COMMON *const cm = &cpi->common;
+ const nmv_context *nmvc = &cm->fc.nmvc;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- struct segmentation *seg = &xd->seg;
+ struct segmentation *seg = &cm->seg;
MB_MODE_INFO *const mi = &m->mbmi;
const MV_REFERENCE_FRAME rf = mi->ref_frame[0];
const MB_PREDICTION_MODE mode = mi->mode;
const int segment_id = mi->segment_id;
int skip_coeff;
- const BLOCK_SIZE_TYPE bsize = mi->sb_type;
+ const BLOCK_SIZE bsize = mi->sb_type;
const int allow_hp = xd->allow_high_precision_mv;
- x->partition_info = x->pi + (m - pc->mi);
+ x->partition_info = x->pi + (m - cm->mi);
#ifdef ENTROPY_STATS
active_section = 9;
@@ -419,7 +420,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
if (seg->update_map) {
if (seg->temporal_update) {
const int pred_flag = mi->seg_id_predicted;
- vp9_prob pred_prob = vp9_get_pred_prob_seg_id(xd);
+ vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
vp9_write(bc, pred_flag, pred_prob);
if (!pred_flag)
write_segment_id(bc, seg, segment_id);
@@ -432,12 +433,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
vp9_write(bc, rf != INTRA_FRAME,
- vp9_get_pred_prob_intra_inter(pc, xd));
+ vp9_get_pred_prob_intra_inter(cm, xd));
- if (bsize >= BLOCK_SIZE_SB8X8 && pc->tx_mode == TX_MODE_SELECT &&
+ if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
!(rf != INTRA_FRAME &&
(skip_coeff || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
- write_selected_tx_size(cpi, mi->txfm_size, bsize, bc);
+ write_selected_tx_size(cpi, m, mi->tx_size, bsize, bc);
}
if (rf == INTRA_FRAME) {
@@ -445,8 +446,8 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
active_section = 6;
#endif
- if (bsize >= BLOCK_SIZE_SB8X8) {
- write_intra_mode(bc, mode, pc->fc.y_mode_prob[size_group_lookup[bsize]]);
+ if (bsize >= BLOCK_8X8) {
+ write_intra_mode(bc, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
} else {
int idx, idy;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
@@ -454,15 +455,15 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode;
- write_intra_mode(bc, bm, pc->fc.y_mode_prob[0]);
+ write_intra_mode(bc, bm, cm->fc.y_mode_prob[0]);
}
}
}
- write_intra_mode(bc, mi->uv_mode, pc->fc.uv_mode_prob[mode]);
+ write_intra_mode(bc, mi->uv_mode, cm->fc.uv_mode_prob[mode]);
} else {
vp9_prob *mv_ref_p;
encode_ref_frame(cpi, bc);
- mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mb_mode_context[rf]];
+ mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mode_context[rf]];
#ifdef ENTROPY_STATS
active_section = 3;
@@ -470,23 +471,23 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
// If segment skip is not enabled code the mode.
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
- if (bsize >= BLOCK_SIZE_SB8X8) {
+ if (bsize >= BLOCK_8X8) {
write_sb_mv_ref(bc, mode, mv_ref_p);
- ++pc->counts.inter_mode[mi->mb_mode_context[rf]]
+ ++cm->counts.inter_mode[mi->mode_context[rf]]
[inter_mode_offset(mode)];
}
}
- if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+ if (cm->mcomp_filter_type == SWITCHABLE) {
+ const int ctx = vp9_get_pred_context_switchable_interp(xd);
write_token(bc, vp9_switchable_interp_tree,
- vp9_get_pred_probs_switchable_interp(&cpi->common, xd),
- vp9_switchable_interp_encodings +
- vp9_switchable_interp_map[mi->interp_filter]);
+ cm->fc.switchable_interp_prob[ctx],
+ &vp9_switchable_interp_encodings[mi->interp_filter]);
} else {
- assert(mi->interp_filter == cpi->common.mcomp_filter_type);
+ assert(mi->interp_filter == cm->mcomp_filter_type);
}
- if (bsize < BLOCK_SIZE_SB8X8) {
+ if (bsize < BLOCK_8X8) {
int j;
MB_PREDICTION_MODE blockmode;
int_mv blockmv;
@@ -499,7 +500,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
blockmode = x->partition_info->bmi[j].mode;
blockmv = m->bmi[j].as_mv[0];
write_sb_mv_ref(bc, blockmode, mv_ref_p);
- ++pc->counts.inter_mode[mi->mb_mode_context[rf]]
+ ++cm->counts.inter_mode[mi->mode_context[rf]]
[inter_mode_offset(blockmode)];
if (blockmode == NEWMV) {
@@ -531,26 +532,29 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
}
}
-static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO *m,
+static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
vp9_writer *bc) {
- const VP9_COMMON *const c = &cpi->common;
+ const VP9_COMMON *const cm = &cpi->common;
const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ const struct segmentation *const seg = &cm->seg;
+ MODE_INFO *m = mi_8x8[0];
const int ym = m->mbmi.mode;
- const int mis = c->mode_info_stride;
const int segment_id = m->mbmi.segment_id;
+ MODE_INFO *above_mi = mi_8x8[-xd->mode_info_stride];
+ MODE_INFO *left_mi = mi_8x8[-1];
- if (xd->seg.update_map)
- write_segment_id(bc, &xd->seg, m->mbmi.segment_id);
+ if (seg->update_map)
+ write_segment_id(bc, seg, m->mbmi.segment_id);
write_skip_coeff(cpi, segment_id, m, bc);
- if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->tx_mode == TX_MODE_SELECT)
- write_selected_tx_size(cpi, m->mbmi.txfm_size, m->mbmi.sb_type, bc);
+ if (m->mbmi.sb_type >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT)
+ write_selected_tx_size(cpi, m, m->mbmi.tx_size, m->mbmi.sb_type, bc);
- if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
- const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis);
+ if (m->mbmi.sb_type >= BLOCK_8X8) {
+ const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0);
const MB_PREDICTION_MODE L = xd->left_available ?
- left_block_mode(m, 0) : DC_PRED;
+ left_block_mode(m, left_mi, 0) : DC_PRED;
write_intra_mode(bc, ym, vp9_kf_y_mode_prob[A][L]);
} else {
int idx, idy;
@@ -558,10 +562,10 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO *m,
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[m->mbmi.sb_type];
for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
- const int i = idy * 2 + idx;
- const MB_PREDICTION_MODE A = above_block_mode(m, i, mis);
+ int i = idy * 2 + idx;
+ const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, i);
const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
- left_block_mode(m, i) : DC_PRED;
+ left_block_mode(m, left_mi, i) : DC_PRED;
const int bm = m->bmi[i].as_mode;
#ifdef ENTROPY_STATS
++intra_mode_stats[A][L][bm];
@@ -574,21 +578,25 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO *m,
write_intra_mode(bc, m->mbmi.uv_mode, vp9_kf_uv_mode_prob[ym]);
}
-static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
+static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end,
int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ MODE_INFO *m = mi_8x8[0];
- if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8)
+ if (m->mbmi.sb_type < BLOCK_8X8)
if (xd->ab_index > 0)
return;
- xd->mode_info_context = m;
- set_mi_row_col(&cpi->common, xd, mi_row,
- 1 << mi_height_log2(m->mbmi.sb_type),
- mi_col, 1 << mi_width_log2(m->mbmi.sb_type));
+
+ xd->this_mi = mi_8x8[0];
+ xd->mi_8x8 = mi_8x8;
+
+ set_mi_row_col(&cpi->common, xd,
+ mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
+ mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type]);
if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
- write_mb_modes_kf(cpi, m, bc);
+ write_mb_modes_kf(cpi, mi_8x8, bc);
#ifdef ENTROPY_STATS
active_section = 8;
#endif
@@ -603,10 +611,9 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
pack_mb_tokens(bc, tok, tok_end);
}
-static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
+static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end,
- int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &cpi->mb.e_mbd;
const int mis = cm->mode_info_stride;
@@ -614,20 +621,22 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
int bs = (1 << bsl) / 4; // mode_info step for subsize
int n;
PARTITION_TYPE partition = PARTITION_NONE;
- BLOCK_SIZE_TYPE subsize;
+ BLOCK_SIZE subsize;
+ MODE_INFO *m = mi_8x8[0];
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
partition = partition_lookup[bsl][m->mbmi.sb_type];
- if (bsize < BLOCK_SIZE_SB8X8)
+ if (bsize < BLOCK_8X8)
if (xd->ab_index > 0)
return;
- if (bsize >= BLOCK_SIZE_SB8X8) {
+ if (bsize >= BLOCK_8X8) {
int pl;
- const int idx = check_bsize_coverage(cm, mi_row, mi_col, bsize);
+ const int idx = check_bsize_coverage(bs, cm->mi_rows, cm->mi_cols,
+ mi_row, mi_col);
set_partition_seg_context(cm, xd, mi_row, mi_col);
pl = partition_plane_context(xd, bsize);
// encode the partition information
@@ -645,25 +654,26 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
switch (partition) {
case PARTITION_NONE:
- write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
break;
case PARTITION_HORZ:
- write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
*(get_sb_index(xd, subsize)) = 1;
if ((mi_row + bs) < cm->mi_rows)
- write_modes_b(cpi, m + bs * mis, bc, tok, tok_end, mi_row + bs, mi_col);
+ write_modes_b(cpi, mi_8x8 + bs * mis, bc, tok, tok_end, mi_row + bs,
+ mi_col);
break;
case PARTITION_VERT:
- write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
*(get_sb_index(xd, subsize)) = 1;
if ((mi_col + bs) < cm->mi_cols)
- write_modes_b(cpi, m + bs, bc, tok, tok_end, mi_row, mi_col + bs);
+ write_modes_b(cpi, mi_8x8 + bs, bc, tok, tok_end, mi_row, mi_col + bs);
break;
case PARTITION_SPLIT:
for (n = 0; n < 4; n++) {
int j = n >> 1, i = n & 0x01;
*(get_sb_index(xd, subsize)) = n;
- write_modes_sb(cpi, m + j * bs * mis + i * bs, bc, tok, tok_end,
+ write_modes_sb(cpi, mi_8x8 + j * bs * mis + i * bs, bc, tok, tok_end,
mi_row + j * bs, mi_col + i * bs, subsize);
}
break;
@@ -672,8 +682,8 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
}
// update partition context
- if (bsize >= BLOCK_SIZE_SB8X8 &&
- (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
+ if (bsize >= BLOCK_8X8 &&
+ (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) {
set_partition_seg_context(cm, xd, mi_row, mi_col);
update_partition_context(xd, subsize, bsize);
}
@@ -681,20 +691,23 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
- VP9_COMMON *const c = &cpi->common;
- const int mis = c->mode_info_stride;
- MODE_INFO *m, *m_ptr = c->mi;
+ VP9_COMMON *const cm = &cpi->common;
+ const int mis = cm->mode_info_stride;
int mi_row, mi_col;
-
- m_ptr += c->cur_tile_mi_col_start + c->cur_tile_mi_row_start * mis;
-
- for (mi_row = c->cur_tile_mi_row_start; mi_row < c->cur_tile_mi_row_end;
- mi_row += 8, m_ptr += 8 * mis) {
- m = m_ptr;
- vp9_zero(c->left_seg_context);
- for (mi_col = c->cur_tile_mi_col_start; mi_col < c->cur_tile_mi_col_end;
- mi_col += MI_BLOCK_SIZE, m += MI_BLOCK_SIZE)
- write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col, BLOCK_64X64);
+ MODE_INFO **mi_8x8 = cm->mi_grid_visible;
+ MODE_INFO **m_8x8;
+
+ mi_8x8 += cm->cur_tile_mi_col_start + cm->cur_tile_mi_row_start * mis;
+
+ for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end;
+ mi_row += 8, mi_8x8 += 8 * mis) {
+ m_8x8 = mi_8x8;
+ vp9_zero(cm->left_seg_context);
+ for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) {
+ write_modes_sb(cpi, m_8x8, bc, tok, tok_end, mi_row, mi_col,
+ BLOCK_64X64);
+ }
}
}
@@ -781,94 +794,170 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
vp9_coeff_probs_model *old_frame_coef_probs =
cpi->common.fc.coef_probs[tx_size];
vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size];
- int i, j, k, l, t;
- int update[2] = {0, 0};
- int savings;
-
+ const vp9_prob upd = VP9_COEF_UPDATE_PROB;
const int entropy_nodes_update = UNCONSTRAINED_NODES;
+ int i, j, k, l, t;
+ switch (cpi->sf.use_fast_coef_updates) {
+ case 0: {
+ /* dry run to see if there is any update at all needed */
+ int savings = 0;
+ int update[2] = {0, 0};
+ for (i = 0; i < BLOCK_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
+ const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
+ int s;
+ int u = 0;
+
+ if (l >= 3 && k == 0)
+ continue;
+ if (t == PIVOT_NODE)
+ s = vp9_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0],
+ old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+ else
+ s = vp9_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
+ if (s > 0 && newp != oldp)
+ u = 1;
+ if (u)
+ savings += s - (int)(vp9_cost_zero(upd));
+ else
+ savings -= (int)(vp9_cost_zero(upd));
+ update[u]++;
+ }
+ }
+ }
+ }
+ }
- const int tstart = 0;
- /* dry run to see if there is any udpate at all needed */
- savings = 0;
- for (i = 0; i < BLOCK_TYPES; ++i) {
- for (j = 0; j < REF_TYPES; ++j) {
- for (k = 0; k < COEF_BANDS; ++k) {
- // int prev_coef_savings[ENTROPY_NODES] = {0};
- for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
- for (t = tstart; t < entropy_nodes_update; ++t) {
- vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
- const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
- const vp9_prob upd = VP9_COEF_UPDATE_PROB;
- int s;
- int u = 0;
-
- if (l >= 3 && k == 0)
- continue;
- if (t == PIVOT_NODE)
- s = vp9_prob_diff_update_savings_search_model(
- frame_branch_ct[i][j][k][l][0],
- old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
- else
- s = vp9_prob_diff_update_savings_search(
- frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
- if (s > 0 && newp != oldp)
- u = 1;
- if (u)
- savings += s - (int)(vp9_cost_zero(upd));
- else
- savings -= (int)(vp9_cost_zero(upd));
- update[u]++;
+ // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
+ /* Is coef updated at all */
+ if (update[1] == 0 || savings < 0) {
+ vp9_write_bit(bc, 0);
+ return;
+ }
+ vp9_write_bit(bc, 1);
+ for (i = 0; i < BLOCK_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+ // calc probs and branch cts for this frame only
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
+ vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
+ const vp9_prob upd = VP9_COEF_UPDATE_PROB;
+ int s;
+ int u = 0;
+ if (l >= 3 && k == 0)
+ continue;
+ if (t == PIVOT_NODE)
+ s = vp9_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0],
+ old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+ else
+ s = vp9_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t],
+ *oldp, &newp, upd);
+ if (s > 0 && newp != *oldp)
+ u = 1;
+ vp9_write(bc, u, upd);
+#ifdef ENTROPY_STATS
+ if (!cpi->dummy_packing)
+ ++tree_update_hist[tx_size][i][j][k][l][t][u];
+#endif
+ if (u) {
+ /* send/use new probability */
+ vp9_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
}
}
}
+ return;
}
- }
- // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
- /* Is coef updated at all */
- if (update[1] == 0 || savings < 0) {
- vp9_write_bit(bc, 0);
- return;
- }
- vp9_write_bit(bc, 1);
- for (i = 0; i < BLOCK_TYPES; ++i) {
- for (j = 0; j < REF_TYPES; ++j) {
- for (k = 0; k < COEF_BANDS; ++k) {
- // int prev_coef_savings[ENTROPY_NODES] = {0};
- for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
- // calc probs and branch cts for this frame only
- for (t = tstart; t < entropy_nodes_update; ++t) {
- vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
- vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
- const vp9_prob upd = VP9_COEF_UPDATE_PROB;
- int s;
- int u = 0;
- if (l >= 3 && k == 0)
- continue;
- if (t == PIVOT_NODE)
- s = vp9_prob_diff_update_savings_search_model(
- frame_branch_ct[i][j][k][l][0],
- old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
- else
- s = vp9_prob_diff_update_savings_search(
- frame_branch_ct[i][j][k][l][t],
- *oldp, &newp, upd);
- if (s > 0 && newp != *oldp)
- u = 1;
- vp9_write(bc, u, upd);
+ case 1:
+ case 2: {
+ const int prev_coef_contexts_to_update =
+ (cpi->sf.use_fast_coef_updates == 2 ?
+ PREV_COEF_CONTEXTS >> 1 : PREV_COEF_CONTEXTS);
+ const int coef_band_to_update =
+ (cpi->sf.use_fast_coef_updates == 2 ?
+ COEF_BANDS >> 1 : COEF_BANDS);
+ int updates = 0;
+ int noupdates_before_first = 0;
+ for (i = 0; i < BLOCK_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+ // calc probs and branch cts for this frame only
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
+ vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
+ int s;
+ int u = 0;
+ if (l >= 3 && k == 0)
+ continue;
+ if (l >= prev_coef_contexts_to_update ||
+ k >= coef_band_to_update) {
+ u = 0;
+ } else {
+ if (t == PIVOT_NODE)
+ s = vp9_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0],
+ old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+ else
+ s = vp9_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t],
+ *oldp, &newp, upd);
+ if (s > 0 && newp != *oldp)
+ u = 1;
+ }
+ updates += u;
+ if (u == 0 && updates == 0) {
+ noupdates_before_first++;
#ifdef ENTROPY_STATS
- if (!cpi->dummy_packing)
- ++tree_update_hist[tx_size][i][j][k][l][t][u];
+ if (!cpi->dummy_packing)
+ ++tree_update_hist[tx_size][i][j][k][l][t][u];
#endif
- if (u) {
- /* send/use new probability */
- vp9_write_prob_diff_update(bc, newp, *oldp);
- *oldp = newp;
+ continue;
+ }
+ if (u == 1 && updates == 1) {
+ int v;
+ // first update
+ vp9_write_bit(bc, 1);
+ for (v = 0; v < noupdates_before_first; ++v)
+ vp9_write(bc, 0, upd);
+ }
+ vp9_write(bc, u, upd);
+#ifdef ENTROPY_STATS
+ if (!cpi->dummy_packing)
+ ++tree_update_hist[tx_size][i][j][k][l][t][u];
+#endif
+ if (u) {
+ /* send/use new probability */
+ vp9_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
}
}
}
}
+ if (updates == 0) {
+ vp9_write_bit(bc, 0); // no updates
+ }
+ return;
}
+
+ default:
+ assert(0);
}
}
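
The fast-update paths (cases 1 and 2) skip case 0's dry-run pass and instead defer the leading "any updates" bit: no-update tokens seen before the first real update are only counted, then replayed once an update is actually found, so a frame with no updates still costs just one zero bit. A compact sketch of that pattern in isolation, over a flat array of precomputed update decisions (emit_bit()/emit_tok() are hypothetical stand-ins for vp9_write_bit()/vp9_write()):

    /* Sketch of the deferred first-update pattern from cases 1/2 above. */
    static void write_updates_deferred(const int *u, int n, vp9_prob upd) {
      int updates = 0, zeros_before_first = 0, i, v;
      for (i = 0; i < n; ++i) {
        updates += u[i];
        if (u[i] == 0 && updates == 0) { ++zeros_before_first; continue; }
        if (u[i] == 1 && updates == 1) {  /* first update: flush the prefix */
          emit_bit(1);                    /* "this frame has updates" */
          for (v = 0; v < zeros_before_first; ++v)
            emit_tok(0, upd);             /* the skipped no-update tokens */
        }
        emit_tok(u[i], upd);              /* then the current token */
      }
      if (updates == 0)
        emit_bit(0);                      /* nothing changed this frame */
    }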
@@ -967,7 +1056,7 @@ static void encode_segmentation(VP9_COMP *cpi,
struct vp9_write_bit_buffer *wb) {
int i, j;
- struct segmentation *seg = &cpi->mb.e_mbd.seg;
+ struct segmentation *seg = &cpi->common.seg;
vp9_wb_write_bit(wb, seg->enabled);
if (!seg->enabled)
@@ -1047,7 +1136,7 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
ct_8x8p);
for (j = 0; j < TX_SIZES - 3; j++)
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j],
- VP9_MODE_UPDATE_PROB, ct_8x8p[j]);
+ MODE_UPDATE_PROB, ct_8x8p[j]);
}
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
@@ -1055,14 +1144,14 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
ct_16x16p);
for (j = 0; j < TX_SIZES - 2; j++)
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j],
- VP9_MODE_UPDATE_PROB, ct_16x16p[j]);
+ MODE_UPDATE_PROB, ct_16x16p[j]);
}
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p);
for (j = 0; j < TX_SIZES - 1; j++)
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j],
- VP9_MODE_UPDATE_PROB, ct_32x32p[j]);
+ MODE_UPDATE_PROB, ct_32x32p[j]);
}
#ifdef MODE_STATS
if (!cpi->dummy_packing)
@@ -1073,9 +1162,11 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
static void write_interp_filter_type(INTERPOLATIONFILTERTYPE type,
struct vp9_write_bit_buffer *wb) {
+ const int type_to_literal[] = { 1, 0, 2 };
+
vp9_wb_write_bit(wb, type == SWITCHABLE);
if (type != SWITCHABLE)
- vp9_wb_write_literal(wb, type, 2);
+ vp9_wb_write_literal(wb, type_to_literal[type], 2);
}
static void fix_mcomp_filter_type(VP9_COMP *cpi) {
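
The type_to_literal[] = { 1, 0, 2 } table above exists because the in-memory filter enum and the two-bit wire literal use different orderings (the wire order appears to put the smooth filter first). A sketch of the inverse map a reader would apply, assuming the internal enum order EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP and vp9_rb_read_* counterparts to the vp9_wb_write_* calls:

    /* Sketch: wire literal -> internal filter enum (assumed enum order). */
    static INTERPOLATIONFILTERTYPE read_interp_filter_type(
        struct vp9_read_bit_buffer *rb) {
      const INTERPOLATIONFILTERTYPE literal_to_type[] = {
          EIGHTTAP_SMOOTH, EIGHTTAP, EIGHTTAP_SHARP };
      return vp9_rb_read_bit(rb) ? SWITCHABLE
                                 : literal_to_type[vp9_rb_read_literal(rb, 2)];
    }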
@@ -1083,19 +1174,19 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) {
if (cm->mcomp_filter_type == SWITCHABLE) {
// Check to see if only one of the filters is actually used
- int count[VP9_SWITCHABLE_FILTERS];
+ int count[SWITCHABLE_FILTERS];
int i, j, c = 0;
- for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
count[i] = 0;
- for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j)
+ for (j = 0; j <= SWITCHABLE_FILTERS; ++j)
count[i] += cm->counts.switchable_interp[j][i];
c += (count[i] > 0);
}
if (c == 1) {
// Only one filter is used. So set the filter at frame level
- for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
if (count[i]) {
- cm->mcomp_filter_type = vp9_switchable_interp[i];
+ cm->mcomp_filter_type = i;
break;
}
}
@@ -1127,7 +1218,8 @@ static int get_refresh_mask(VP9_COMP *cpi) {
if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame &&
!cpi->refresh_alt_ref_frame) {
#else
- if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+ if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame &&
+ !cpi->use_svc) {
#endif
// Preserve the previously existing golden frame and update the frame in
// the alt ref slot instead. This is highly specific to the use of
@@ -1239,9 +1331,16 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[refs[i]]];
found = cm->width == cfg->y_crop_width &&
cm->height == cfg->y_crop_height;
+
+ // TODO(ivan): This prevents a bug when more than 3 buffers are used. Do it
+ // in a better way.
+ if (cpi->use_svc) {
+ found = 0;
+ }
vp9_wb_write_bit(wb, found);
- if (found)
+ if (found) {
break;
+ }
}
if (!found) {
@@ -1340,7 +1439,7 @@ static void write_uncompressed_header(VP9_COMP *cpi,
vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LOG2);
- encode_loopfilter(&xd->lf, wb);
+ encode_loopfilter(&cm->lf, wb);
encode_quantization(cm, wb);
encode_segmentation(cpi, wb);
@@ -1382,7 +1481,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
- VP9_MODE_UPDATE_PROB,
+ MODE_UPDATE_PROB,
cpi->intra_inter_count[i]);
if (cm->allow_comp_inter_inter) {
@@ -1396,7 +1495,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
if (use_hybrid_pred)
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
- VP9_MODE_UPDATE_PROB,
+ MODE_UPDATE_PROB,
cpi->comp_inter_count[i]);
}
}
@@ -1404,10 +1503,10 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
for (i = 0; i < REF_CONTEXTS; i++) {
vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
- VP9_MODE_UPDATE_PROB,
+ MODE_UPDATE_PROB,
cpi->single_ref_count[i][0]);
vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
- VP9_MODE_UPDATE_PROB,
+ MODE_UPDATE_PROB,
cpi->single_ref_count[i][1]);
}
}
@@ -1415,7 +1514,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
for (i = 0; i < REF_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
- VP9_MODE_UPDATE_PROB,
+ MODE_UPDATE_PROB,
cpi->comp_ref_count[i]);
update_mbintra_mode_probs(cpi, &header_bc);
@@ -1453,7 +1552,7 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
vp9_compute_update_table();
#ifdef ENTROPY_STATS
- if (pc->frame_type == INTER_FRAME)
+ if (cm->frame_type == INTER_FRAME)
active_section = 0;
else
active_section = 7;
diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h
index 3e377cf..013047e 100644
--- a/libvpx/vp9/encoder/vp9_block.h
+++ b/libvpx/vp9/encoder/vp9_block.h
@@ -48,7 +48,11 @@ typedef struct {
int comp_pred_diff;
int single_pred_diff;
int64_t tx_rd_diff[TX_MODES];
- int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1];
+ int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+
+ // motion vector cache for adaptive motion search control in partition
+ // search loop
+ int_mv pred_mv[MAX_REF_FRAMES];
// Bit flag for each mode whether it has high error in comparison to others.
unsigned int modes_with_high_error;
@@ -121,9 +125,9 @@ struct macroblock {
int mbmode_cost[MB_MODE_COUNT];
unsigned inter_mode_cost[INTER_MODE_CONTEXTS][MB_MODE_COUNT - NEARESTMV];
int intra_uv_mode_cost[2][MB_MODE_COUNT];
- int y_mode_costs[VP9_INTRA_MODES][VP9_INTRA_MODES][VP9_INTRA_MODES];
- int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
- [VP9_SWITCHABLE_FILTERS];
+ int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+ int switchable_interp_costs[SWITCHABLE_FILTERS + 1]
+ [SWITCHABLE_FILTERS];
// These define limits to motion vector components to prevent them
// from extending outside the UMV borders
@@ -144,12 +148,12 @@ struct macroblock {
int optimize;
// indicate if it is in the rd search loop or encoding process
- int rd_search;
+ int use_lp32x32fdct;
int skip_encode;
// Used to store sub partition's choices.
int fast_ms;
- int_mv pred_mv;
+ int_mv pred_mv[MAX_REF_FRAMES];
int subblock_ref;
// TODO(jingning): Need to refactor the structure arrays that buffers the
@@ -170,10 +174,10 @@ struct macroblock {
PICK_MODE_CONTEXT sb64_context;
int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
- BLOCK_SIZE_TYPE b_partitioning[4][4][4];
- BLOCK_SIZE_TYPE mb_partitioning[4][4];
- BLOCK_SIZE_TYPE sb_partitioning[4];
- BLOCK_SIZE_TYPE sb64_partitioning;
+ BLOCK_SIZE b_partitioning[4][4][4];
+ BLOCK_SIZE mb_partitioning[4][4];
+ BLOCK_SIZE sb_partitioning[4];
+ BLOCK_SIZE sb64_partitioning;
void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
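
Both pred_mv hunks above widen a single cached motion vector into a per-reference-frame cache, so the partition search loop can seed the motion search for each reference frame from the best vector found at a coarser partition level. A sketch of how such a cache might be consulted (hypothetical helper; the real consumers live in the mode and partition search code):

    /* Sketch: seed a motion search from the per-reference pred_mv cache.
     * Falls back to the conventional predicted MV when nothing is cached. */
    static int_mv get_search_seed(const MACROBLOCK *x, MV_REFERENCE_FRAME ref,
                                  int_mv ref_mv) {
      return x->pred_mv[ref].as_int != 0 ? x->pred_mv[ref] : ref_mv;
    }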
diff --git a/libvpx/vp9/encoder/vp9_dct.c b/libvpx/vp9/encoder/vp9_dct.c
index 3112dad..4f4ad04 100644
--- a/libvpx/vp9/encoder/vp9_dct.c
+++ b/libvpx/vp9/encoder/vp9_dct.c
@@ -1077,6 +1077,44 @@ static void dct32_1d(int *input, int *output, int round) {
output[30] = step[30];
output[31] = step[31];
+ // dump the magnitude by 4, hence the intermediate values are within
+ // the range of 16 bits.
+ if (round) {
+ output[0] = half_round_shift(output[0]);
+ output[1] = half_round_shift(output[1]);
+ output[2] = half_round_shift(output[2]);
+ output[3] = half_round_shift(output[3]);
+ output[4] = half_round_shift(output[4]);
+ output[5] = half_round_shift(output[5]);
+ output[6] = half_round_shift(output[6]);
+ output[7] = half_round_shift(output[7]);
+ output[8] = half_round_shift(output[8]);
+ output[9] = half_round_shift(output[9]);
+ output[10] = half_round_shift(output[10]);
+ output[11] = half_round_shift(output[11]);
+ output[12] = half_round_shift(output[12]);
+ output[13] = half_round_shift(output[13]);
+ output[14] = half_round_shift(output[14]);
+ output[15] = half_round_shift(output[15]);
+
+ output[16] = half_round_shift(output[16]);
+ output[17] = half_round_shift(output[17]);
+ output[18] = half_round_shift(output[18]);
+ output[19] = half_round_shift(output[19]);
+ output[20] = half_round_shift(output[20]);
+ output[21] = half_round_shift(output[21]);
+ output[22] = half_round_shift(output[22]);
+ output[23] = half_round_shift(output[23]);
+ output[24] = half_round_shift(output[24]);
+ output[25] = half_round_shift(output[25]);
+ output[26] = half_round_shift(output[26]);
+ output[27] = half_round_shift(output[27]);
+ output[28] = half_round_shift(output[28]);
+ output[29] = half_round_shift(output[29]);
+ output[30] = half_round_shift(output[30]);
+ output[31] = half_round_shift(output[31]);
+ }
+
// Stage 3
step[0] = output[0] + output[(8 - 1)];
step[1] = output[1] + output[(8 - 2)];
@@ -1112,44 +1150,6 @@ static void dct32_1d(int *input, int *output, int round) {
step[30] = output[30] + output[25];
step[31] = output[31] + output[24];
- // dump the magnitude by half, hence the intermediate values are within
- // the range of 16 bits.
- if (round) {
- step[0] = half_round_shift(step[0]);
- step[1] = half_round_shift(step[1]);
- step[2] = half_round_shift(step[2]);
- step[3] = half_round_shift(step[3]);
- step[4] = half_round_shift(step[4]);
- step[5] = half_round_shift(step[5]);
- step[6] = half_round_shift(step[6]);
- step[7] = half_round_shift(step[7]);
- step[8] = half_round_shift(step[8]);
- step[9] = half_round_shift(step[9]);
- step[10] = half_round_shift(step[10]);
- step[11] = half_round_shift(step[11]);
- step[12] = half_round_shift(step[12]);
- step[13] = half_round_shift(step[13]);
- step[14] = half_round_shift(step[14]);
- step[15] = half_round_shift(step[15]);
-
- step[16] = half_round_shift(step[16]);
- step[17] = half_round_shift(step[17]);
- step[18] = half_round_shift(step[18]);
- step[19] = half_round_shift(step[19]);
- step[20] = half_round_shift(step[20]);
- step[21] = half_round_shift(step[21]);
- step[22] = half_round_shift(step[22]);
- step[23] = half_round_shift(step[23]);
- step[24] = half_round_shift(step[24]);
- step[25] = half_round_shift(step[25]);
- step[26] = half_round_shift(step[26]);
- step[27] = half_round_shift(step[27]);
- step[28] = half_round_shift(step[28]);
- step[29] = half_round_shift(step[29]);
- step[30] = half_round_shift(step[30]);
- step[31] = half_round_shift(step[31]);
- }
-
// Stage 4
output[0] = step[0] + step[3];
output[1] = step[1] + step[2];
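
The net effect of this pair of hunks is to apply the scaling one stage earlier: stage-2 outputs are rounded down before the stage-3 butterflies rather than after them, which is what keeps the 32x32 intermediates inside 16 bits. half_round_shift() itself is defined elsewhere in this file; assuming its usual libvpx form, it is a divide-by-4 with rounding that behaves symmetrically around zero:

    /* Presumed definition of half_round_shift() (shown for reference; the
     * actual one lives earlier in vp9_dct.c): divide by 4 with rounding,
     * symmetric for positive and negative inputs. */
    static int16_t half_round_shift(int input) {
      int rv = (input + 1 + (input < 0)) >> 2;
      return (int16_t)rv;
    }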
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c
index 66eae41..44ab02d 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -8,43 +8,55 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vpx_config.h"
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
#include "./vp9_rtcd.h"
-#include "vp9/encoder/vp9_encodeframe.h"
-#include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_encodemv.h"
+#include "./vpx_config.h"
+
+#include "vpx_ports/vpx_timer.h"
+
#include "vp9/common/vp9_common.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/common/vp9_extend.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymode.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vp9/encoder/vp9_segmentation.h"
-#include "vp9/encoder/vp9_encodeintra.h"
-#include "vp9/common/vp9_reconinter.h"
-#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/common/vp9_extend.h"
#include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_tile_common.h"
+
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encodeintra.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_segmentation.h"
#include "vp9/encoder/vp9_tokenize.h"
-#include "./vp9_rtcd.h"
-#include <stdio.h>
-#include <math.h>
-#include <limits.h>
-#include "vpx_ports/vpx_timer.h"
-#include "vp9/common/vp9_pred_common.h"
-#include "vp9/common/vp9_mvref_common.h"
#define DBG_PRNT_SEGMAP 0
+
+static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
+ TX_4X4, // ONLY_4X4
+ TX_8X8, // ONLY_8X8
+ TX_16X16, // ONLY_16X16
+ TX_32X32, // ONLY_32X32
+ TX_32X32, // TX_MODE_SELECT
+};
+
// #define ENC_DEBUG
#ifdef ENC_DEBUG
int enc_debug = 0;
#endif
static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
- int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize);
+ int mi_row, int mi_col, BLOCK_SIZE bsize);
static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
@@ -53,7 +65,10 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
* This also avoids the need for divide by zero checks in
* vp9_activity_masking().
*/
-#define VP9_ACTIVITY_AVG_MIN (64)
+#define ACTIVITY_AVG_MIN (64)
+
+/* Motion vector component magnitude threshold for defining fast motion. */
+#define FAST_MOTION_MV_THRESH (24)
/* This is used as a reference when computing the source variance for the
* purposes of activity masking.
@@ -71,13 +86,14 @@ static const uint8_t VP9_VAR_OFFS[64] = {
128, 128, 128, 128, 128, 128, 128, 128
};
-static unsigned int get_sb_variance(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bs) {
+static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs) {
unsigned int var, sse;
var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
x->plane[0].src.stride,
VP9_VAR_OFFS, 0, &sse);
- return var >> num_pels_log2_lookup[bs];
+ return (var + (1 << (num_pels_log2_lookup[bs] - 1))) >>
+ num_pels_log2_lookup[bs];
}
// Original activity measure from Tim T's code.
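
The change here swaps a truncating shift for a rounding one: instead of var >> n, the function now computes (var + 2^(n-1)) >> n, i.e. round-to-nearest per-pixel variance. For a 64x64 block, num_pels_log2 is 12, so a variance sum of 3000 used to map to 0 and now maps to 1:

    /* The rounded right shift used above, shown in isolation. */
    static unsigned int round_shift(unsigned int v, int n) {
      return (v + (1u << (n - 1))) >> n;  /* (3000 + 2048) >> 12 == 1 */
    }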
@@ -103,31 +119,29 @@ static unsigned int tt_activity_measure(MACROBLOCK *x) {
}
// Stub for alternative experimental activity measures.
-static unsigned int alt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x,
- int use_dc_pred) {
- return vp9_encode_intra(cpi, x, use_dc_pred);
+static unsigned int alt_activity_measure(MACROBLOCK *x, int use_dc_pred) {
+ return vp9_encode_intra(x, use_dc_pred);
}
DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = {0};
// Measure the activity of the current macroblock
// What we measure here is TBD so abstracted to this function
#define ALT_ACT_MEASURE 1
-static unsigned int mb_activity_measure(VP9_COMP *cpi, MACROBLOCK *x,
- int mb_row, int mb_col) {
+static unsigned int mb_activity_measure(MACROBLOCK *x, int mb_row, int mb_col) {
unsigned int mb_activity;
if (ALT_ACT_MEASURE) {
int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
// Or use an alternative.
- mb_activity = alt_activity_measure(cpi, x, use_dc_pred);
+ mb_activity = alt_activity_measure(x, use_dc_pred);
} else {
// Original activity measure from Tim T's code.
mb_activity = tt_activity_measure(x);
}
- if (mb_activity < VP9_ACTIVITY_AVG_MIN)
- mb_activity = VP9_ACTIVITY_AVG_MIN;
+ if (mb_activity < ACTIVITY_AVG_MIN)
+ mb_activity = ACTIVITY_AVG_MIN;
return mb_activity;
}
@@ -175,10 +189,10 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
#else
// Simple mean for now
cpi->activity_avg = (unsigned int) (activity_sum / cpi->common.MBs);
-#endif
+#endif // ACT_MEDIAN
- if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN)
- cpi->activity_avg = VP9_ACTIVITY_AVG_MIN;
+ if (cpi->activity_avg < ACTIVITY_AVG_MIN)
+ cpi->activity_avg = ACTIVITY_AVG_MIN;
// Experimental code: return fixed value normalized for several clips
if (ALT_ACT_MEASURE)
@@ -240,7 +254,7 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
#endif
}
-#endif
+#endif // USE_ACT_INDEX
// Loop through all MBs. Note activity of each, average activity and
// calculate a normalized activity for each
@@ -277,7 +291,7 @@ static void build_activity_map(VP9_COMP *cpi) {
#endif
// measure activity
- mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col);
+ mb_activity = mb_activity_measure(x, mb_row, mb_col);
// Keep frame sum
activity_sum += mb_activity;
@@ -331,15 +345,17 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
}
static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
- BLOCK_SIZE_TYPE bsize, int output_enabled) {
+ BLOCK_SIZE bsize, int output_enabled) {
int i, x_idx, y;
- MACROBLOCK * const x = &cpi->mb;
- MACROBLOCKD * const xd = &x->e_mbd;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
MODE_INFO *mi = &ctx->mic;
- MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO * const mbmi = &xd->this_mi->mbmi;
+ MODE_INFO *mi_addr = xd->this_mi;
int mb_mode_index = ctx->best_mode_index;
- const int mis = cpi->common.mode_info_stride;
+ const int mis = cm->mode_info_stride;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
@@ -349,17 +365,16 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES);
assert(mi->mbmi.sb_type == bsize);
+ *mi_addr = *mi;
+
// Restore the coding context of the MB to that that was in place
// when the mode was picked for it
- for (y = 0; y < mi_height; y++) {
- for (x_idx = 0; x_idx < mi_width; x_idx++) {
- if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > x_idx
- && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > y) {
- MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis;
- *mi_addr = *mi;
- }
- }
- }
+ for (y = 0; y < mi_height; y++)
+ for (x_idx = 0; x_idx < mi_width; x_idx++)
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx
+ && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y)
+ xd->mi_8x8[x_idx + y * mis] = mi_addr;
+
// FIXME(rbultje) I'm pretty sure this should go to the end of this block
// (i.e. after the output_enabled)
if (bsize < BLOCK_32X32) {
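
Note the structural shift in this hunk: update_state() no longer copies the whole MODE_INFO into every covered 8x8 cell; it writes the chosen mode once through mi_addr and then points each covered cell of the mi_8x8 grid at that single struct. Lookups become one extra indirection, sketched below (hypothetical helper; mis is the mode-info stride as in the code above):

    /* Sketch: with the pointer grid, every 8x8 cell covered by a block
     * shares one MODE_INFO, so a lookup is a single indirection. */
    static MB_MODE_INFO *mbmi_at(MODE_INFO **mi_8x8, int mis, int row, int col) {
      return &mi_8x8[row * mis + col]->mbmi;
    }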
@@ -378,12 +393,12 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
if (!output_enabled)
return;
- if (!vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ if (!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
for (i = 0; i < TX_MODES; i++)
cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i];
}
- if (cpi->common.frame_type == KEY_FRAME) {
+ if (cm->frame_type == KEY_FRAME) {
// Restore the coding modes to that held in the coding context
// if (mb_mode == I4X4_PRED)
// for (i = 0; i < 16; i++)
@@ -401,7 +416,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
THR_D135_PRED /*D135_PRED*/,
THR_D117_PRED /*D117_PRED*/,
THR_D153_PRED /*D153_PRED*/,
- THR_D27_PRED /*D27_PRED*/,
+ THR_D207_PRED /*D207_PRED*/,
THR_D63_PRED /*D63_PRED*/,
THR_TM /*TM_PRED*/,
THR_B_PRED /*I4X4_PRED*/,
@@ -412,7 +427,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
// Note how often each mode chosen as best
cpi->mode_chosen_counts[mb_mode_index]++;
if (is_inter_block(mbmi)
- && (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) {
+ && (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) {
int_mv best_mv, best_second_mv;
const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0];
const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
@@ -427,29 +442,17 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv);
}
- if (bsize > BLOCK_SIZE_SB8X8 && mbmi->mode == NEWMV) {
- int i, j;
- for (j = 0; j < mi_height; ++j)
- for (i = 0; i < mi_width; ++i)
- if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > i
- && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > j)
- xd->mode_info_context[mis * j + i].mbmi = *mbmi;
- }
-
- if (cpi->common.mcomp_filter_type == SWITCHABLE
- && is_inter_mode(mbmi->mode)) {
- ++cpi->common.counts.switchable_interp[
- vp9_get_pred_context_switchable_interp(xd)]
- [vp9_switchable_interp_map[mbmi->interp_filter]];
+ if (cm->mcomp_filter_type == SWITCHABLE && is_inter_mode(mbmi->mode)) {
+ const int ctx = vp9_get_pred_context_switchable_interp(xd);
+ ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
}
cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
- for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i <= SWITCHABLE_FILTERS; i++)
cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
- }
}
}
@@ -469,10 +472,10 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
}
static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
- MACROBLOCK * const x = &cpi->mb;
- VP9_COMMON * const cm = &cpi->common;
- MACROBLOCKD * const xd = &x->e_mbd;
+ BLOCK_SIZE bsize) {
+ MACROBLOCK *const x = &cpi->mb;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi;
const int dst_fb_idx = cm->new_fb_idx;
const int idx_str = xd->mode_info_stride * mi_row + mi_col;
@@ -481,18 +484,9 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
const int mb_row = mi_row >> 1;
const int mb_col = mi_col >> 1;
const int idx_map = mb_row * cm->mb_cols + mb_col;
- const struct segmentation *const seg = &xd->seg;
- int i;
+ const struct segmentation *const seg = &cm->seg;
- // entropy context structures
- for (i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].above_context = cm->above_context[i]
- + (mi_col * 2 >> xd->plane[i].subsampling_x);
- xd->plane[i].left_context = cm->left_context[i]
- + (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
- }
-
- // partition contexts
+ set_skip_context(cm, xd, mi_row, mi_col);
set_partition_seg_context(cm, xd, mi_row, mi_col);
// Activity map pointer
@@ -501,23 +495,28 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
/* pointers to mode info contexts */
x->partition_info = x->pi + idx_str;
- xd->mode_info_context = cm->mi + idx_str;
- mbmi = &xd->mode_info_context->mbmi;
+
+ xd->mi_8x8 = cm->mi_grid_visible + idx_str;
+ xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
+
// Special case: if prev_mi is NULL, the previous mode info context
// cannot be used.
- xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + idx_str : NULL;
+ xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL;
+
+ xd->this_mi =
+ xd->mi_8x8[0] = cm->mi + idx_str;
+
+ mbmi = &xd->this_mi->mbmi;
// Set up destination pointers
setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col);
- /* Set up limit values for MV components to prevent them from
- * extending beyond the UMV borders assuming 16x16 block size */
- x->mv_row_min = -((mi_row * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
- x->mv_col_min = -((mi_col * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
- x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE
- + (VP9BORDERINPIXELS - MI_SIZE * mi_height - VP9_INTERP_EXTEND));
- x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE
- + (VP9BORDERINPIXELS - MI_SIZE * mi_width - VP9_INTERP_EXTEND));
+ // Set up limit values for MV components
+ // mv beyond the range do not produce new/different prediction block
+ x->mv_row_min = -(((mi_row + mi_height) * MI_SIZE) + VP9_INTERP_EXTEND);
+ x->mv_col_min = -(((mi_col + mi_width) * MI_SIZE) + VP9_INTERP_EXTEND);
+ x->mv_row_max = (cm->mi_rows - mi_row) * MI_SIZE + VP9_INTERP_EXTEND;
+ x->mv_col_max = (cm->mi_cols - mi_col) * MI_SIZE + VP9_INTERP_EXTEND;
// Set up distance of MB to edge of frame in 1/8th pel units
assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
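
The new limits drop the UMV-border terms: once the whole prediction block has been pushed just past a frame edge, any larger vector clamps to the same border pixels and yields an identical prediction, so searching farther is wasted work. Worked numbers for an assumed 16x16 block (mi_width = 2, MI_SIZE = 8) at mi_col = 6 in a 176-pixel-wide frame (mi_cols = 22), with VP9_INTERP_EXTEND taken to be 4:

    /* Worked example of the new horizontal MV range (same units as the
     * code above; mv_row_min/max follow the same pattern vertically). */
    int mv_col_min = -(((6 + 2) * 8) + 4);  /* = -68: just past the left edge */
    int mv_col_max = (22 - 6) * 8 + 4;      /* = 132: just past the right edge */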
@@ -564,25 +563,33 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
int *totalrate, int64_t *totaldist,
- BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- x->rd_search = 1;
+ // Use the lower precision, but faster, 32x32 fdct for mode selection.
+ x->use_lp32x32fdct = 1;
- if (bsize < BLOCK_SIZE_SB8X8) {
+ if (bsize < BLOCK_8X8) {
// When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
// there is nothing to be done.
- if (xd->ab_index != 0)
+ if (xd->ab_index != 0) {
+ *totalrate = 0;
+ *totaldist = 0;
return;
+ }
}
set_offsets(cpi, mi_row, mi_col, bsize);
- xd->mode_info_context->mbmi.sb_type = bsize;
+ xd->this_mi->mbmi.sb_type = bsize;
+
+ // Set to zero to make sure we do not use the previous encoded frame stats
+ xd->this_mi->mbmi.skip_coeff = 0;
+
+ x->source_variance = get_sby_perpixel_variance(cpi, x, bsize);
- x->source_variance = get_sb_variance(cpi, x, bsize);
if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
vp9_activity_masking(cpi, x);
@@ -600,38 +607,39 @@ static void update_stats(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *mi = xd->mode_info_context;
+ MODE_INFO *mi = xd->this_mi;
MB_MODE_INFO *const mbmi = &mi->mbmi;
if (cm->frame_type != KEY_FRAME) {
- const int seg_ref_active = vp9_segfeature_active(&xd->seg, mbmi->segment_id,
+ const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_REF_FRAME);
if (!seg_ref_active)
- cpi->intra_inter_count[vp9_get_pred_context_intra_inter(xd)][mbmi
- ->ref_frame[0] > INTRA_FRAME]++;
+ cpi->intra_inter_count[vp9_get_pred_context_intra_inter(xd)]
+ [is_inter_block(mbmi)]++;
// If the segment reference feature is enabled we have only a single
// reference frame allowed for the segment so exclude it from
// the reference frame counts used to work out probabilities.
- if ((mbmi->ref_frame[0] > INTRA_FRAME) && !seg_ref_active) {
+ if (is_inter_block(mbmi) && !seg_ref_active) {
if (cm->comp_pred_mode == HYBRID_PREDICTION)
cpi->comp_inter_count[vp9_get_pred_context_comp_inter_inter(cm, xd)]
- [mbmi->ref_frame[1] > INTRA_FRAME]++;
+ [has_second_ref(mbmi)]++;
- if (mbmi->ref_frame[1] > INTRA_FRAME) {
- cpi->comp_ref_count[vp9_get_pred_context_comp_ref_p(cm, xd)][mbmi
- ->ref_frame[0] == GOLDEN_FRAME]++;
+ if (has_second_ref(mbmi)) {
+ cpi->comp_ref_count[vp9_get_pred_context_comp_ref_p(cm, xd)]
+ [mbmi->ref_frame[0] == GOLDEN_FRAME]++;
} else {
- cpi->single_ref_count[vp9_get_pred_context_single_ref_p1(xd)]
- [0][mbmi->ref_frame[0] != LAST_FRAME]++;
+ cpi->single_ref_count[vp9_get_pred_context_single_ref_p1(xd)][0]
+ [mbmi->ref_frame[0] != LAST_FRAME]++;
if (mbmi->ref_frame[0] != LAST_FRAME)
cpi->single_ref_count[vp9_get_pred_context_single_ref_p2(xd)][1]
- [mbmi->ref_frame[0] != GOLDEN_FRAME]++;
+ [mbmi->ref_frame[0] != GOLDEN_FRAME]++;
}
}
+
// Count of last ref frame 0,0 usage
- if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame[0] == LAST_FRAME))
+ if (mbmi->mode == ZEROMV && mbmi->ref_frame[0] == LAST_FRAME)
cpi->inter_zz_count++;
}
}
@@ -639,9 +647,8 @@ static void update_stats(VP9_COMP *cpi) {
// TODO(jingning): the variables used here are a little complicated. Further
// refactoring is needed to organize the temporary buffers when recursive
// partitioning down to 4x4 block size is enabled.
-static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize) {
- MACROBLOCKD * const xd = &x->e_mbd;
+static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &x->e_mbd;
switch (bsize) {
case BLOCK_64X64:
@@ -676,9 +683,8 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
}
}
-static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize) {
- MACROBLOCKD *xd = &x->e_mbd;
+static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &x->e_mbd;
switch (bsize) {
case BLOCK_64X64:
return &x->sb64_partitioning;
@@ -698,7 +704,7 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
- BLOCK_SIZE_TYPE bsize) {
+ BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -729,7 +735,7 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
- BLOCK_SIZE_TYPE bsize) {
+ BLOCK_SIZE bsize) {
const VP9_COMMON *const cm = &cpi->common;
const MACROBLOCK *const x = &cpi->mb;
const MACROBLOCKD *const xd = &x->e_mbd;
@@ -760,7 +766,7 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
}
static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
- int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) {
+ int output_enabled, BLOCK_SIZE bsize, int sub_index) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD * const xd = &x->e_mbd;
@@ -769,9 +775,9 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
return;
if (sub_index != -1)
- *(get_sb_index(xd, bsize)) = sub_index;
+ *get_sb_index(xd, bsize) = sub_index;
- if (bsize < BLOCK_SIZE_SB8X8) {
+ if (bsize < BLOCK_8X8) {
// When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
// there is nothing to be done.
if (xd->ab_index > 0)
@@ -790,22 +796,22 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
}
static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
- int output_enabled, BLOCK_SIZE_TYPE bsize) {
+ int output_enabled, BLOCK_SIZE bsize) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD * const xd = &x->e_mbd;
- BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8;
+ BLOCK_SIZE c1 = BLOCK_8X8;
const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4;
- int UNINITIALIZED_IS_SAFE(pl);
+ int pl = 0;
PARTITION_TYPE partition;
- BLOCK_SIZE_TYPE subsize;
+ BLOCK_SIZE subsize;
int i;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
c1 = BLOCK_4X4;
- if (bsize >= BLOCK_SIZE_SB8X8) {
+ if (bsize >= BLOCK_8X8) {
set_partition_seg_context(cm, xd, mi_row, mi_col);
pl = partition_plane_context(xd, bsize);
c1 = *(get_sb_partitioning(x, bsize));
@@ -814,7 +820,7 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
switch (partition) {
case PARTITION_NONE:
- if (output_enabled && bsize >= BLOCK_SIZE_SB8X8)
+ if (output_enabled && bsize >= BLOCK_8X8)
cpi->partition_count[pl][PARTITION_NONE]++;
encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
break;
@@ -839,7 +845,7 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
for (i = 0; i < 4; i++) {
const int x_idx = i & 1, y_idx = i >> 1;
- *(get_sb_index(xd, subsize)) = i;
+ *get_sb_index(xd, subsize) = i;
encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
output_enabled, subsize);
}
@@ -849,52 +855,114 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
break;
}
- if (partition != PARTITION_SPLIT || bsize == BLOCK_SIZE_SB8X8) {
+ if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) {
set_partition_seg_context(cm, xd, mi_row, mi_col);
update_partition_context(xd, c1, bsize);
}
}
-static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m,
- BLOCK_SIZE_TYPE bsize) {
+// Check to see if the given partition size is allowed for a specified number
+// of 8x8 block rows and columns remaining in the image.
+// If not, return the largest allowed partition size.
+static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
+ int rows_left, int cols_left,
+ int *bh, int *bw) {
+ if ((rows_left <= 0) || (cols_left <= 0)) {
+ return MIN(bsize, BLOCK_8X8);
+ } else {
+ for (; bsize > 0; --bsize) {
+ *bh = num_8x8_blocks_high_lookup[bsize];
+ *bw = num_8x8_blocks_wide_lookup[bsize];
+ if ((*bh <= rows_left) && (*bw <= cols_left)) {
+ break;
+ }
+ }
+ }
+ return bsize;
+}
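A worked example of the clamp, with the bh/bw values taken from this tree's num_8x8 lookup tables and the usual BLOCK_SIZE enum order assumed: with 3 block rows and 5 block columns remaining at the bottom-right border, a requested BLOCK_64X64 steps down the enum until BLOCK_32X16 (bh == 2, bw == 4) fits.

    /* Hypothetical driver for find_partition_size(); lookup values assumed. */
    int bh, bw;
    BLOCK_SIZE fitted = find_partition_size(BLOCK_64X64,
                                            3 /* rows_left */,
                                            5 /* cols_left */,
                                            &bh, &bw);
    /* fitted == BLOCK_32X16, bh == 2, bw == 4 with this tree's tables. */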
+
+// This function attempts to set all mode info entries in a given SB64
+// to the same block partition size.
+// However, at the bottom and right borders of the image the requested size
+// may not be allowed, in which case this code attempts to choose the largest
+// allowable partition.
+static void set_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+ int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
+ BLOCK_SIZE bsize = cpi->sf.always_this_block_size;
const int mis = cm->mode_info_stride;
+ int row8x8_remaining = cm->cur_tile_mi_row_end - mi_row;
+ int col8x8_remaining = cm->cur_tile_mi_col_end - mi_col;
int block_row, block_col;
- for (block_row = 0; block_row < 8; ++block_row) {
- for (block_col = 0; block_col < 8; ++block_col) {
- m[block_row * mis + block_col].mbmi.sb_type = bsize;
+ MODE_INFO * mi_upper_left = cm->mi + mi_row * mis + mi_col;
+ int bh = num_8x8_blocks_high_lookup[bsize];
+ int bw = num_8x8_blocks_wide_lookup[bsize];
+
+ assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+
+ // Apply the requested partition size to the SB64 if it is all "in image"
+ if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
+ (row8x8_remaining >= MI_BLOCK_SIZE)) {
+ for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
+ for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
+ int index = block_row * mis + block_col;
+ mi_8x8[index] = mi_upper_left + index;
+ mi_8x8[index]->mbmi.sb_type = bsize;
+ }
+ }
+ } else {
+ // Else this is a partial SB64.
+ for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
+ for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
+ int index = block_row * mis + block_col;
+ // Find a partition size that fits
+ bsize = find_partition_size(cpi->sf.always_this_block_size,
+ (row8x8_remaining - block_row),
+ (col8x8_remaining - block_col), &bh, &bw);
+ mi_8x8[index] = mi_upper_left + index;
+ mi_8x8[index]->mbmi.sb_type = bsize;
+ }
}
}
}
-static void copy_partitioning(VP9_COMP *cpi, MODE_INFO *m, MODE_INFO *p) {
+
+static void copy_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+ MODE_INFO **prev_mi_8x8) {
VP9_COMMON *const cm = &cpi->common;
const int mis = cm->mode_info_stride;
int block_row, block_col;
+
for (block_row = 0; block_row < 8; ++block_row) {
for (block_col = 0; block_col < 8; ++block_col) {
- m[block_row * mis + block_col].mbmi.sb_type =
- p[block_row * mis + block_col].mbmi.sb_type;
+ MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col];
+ BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
+ int offset;
+
+ if (prev_mi) {
+ offset = prev_mi - cm->prev_mi;
+ mi_8x8[block_row * mis + block_col] = cm->mi + offset;
+ mi_8x8[block_row * mis + block_col]->mbmi.sb_type = sb_type;
+ }
}
}
}
-static void set_block_size(VP9_COMMON * const cm, MODE_INFO *m,
- BLOCK_SIZE_TYPE bsize, int mis, int mi_row,
+static void set_block_size(VP9_COMMON * const cm, MODE_INFO **mi_8x8,
+ BLOCK_SIZE bsize, int mis, int mi_row,
int mi_col) {
- int row, col;
- int bwl = b_width_log2(bsize);
- int bhl = b_height_log2(bsize);
- int bsl = (bwl > bhl ? bwl : bhl);
-
- int bs = (1 << bsl) / 2; // Block size in units of 8 pels.
- MODE_INFO *m2 = m + mi_row * mis + mi_col;
- for (row = 0; row < bs; row++) {
- for (col = 0; col < bs; col++) {
- if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols)
- continue;
- m2[row * mis + col].mbmi.sb_type = bsize;
- }
- }
+ int r, c;
+ const int bs = MAX(num_8x8_blocks_wide_lookup[bsize],
+ num_8x8_blocks_high_lookup[bsize]);
+ const int idx_str = mis * mi_row + mi_col;
+ MODE_INFO **const mi2 = &mi_8x8[idx_str];
+
+ mi2[0] = cm->mi + idx_str;
+ mi2[0]->mbmi.sb_type = bsize;
+
+ for (r = 0; r < bs; r++)
+ for (c = 0; c < bs; c++)
+ if (mi_row + r < cm->mi_rows && mi_col + c < cm->mi_cols)
+ mi2[r * mis + c] = mi2[0];
}
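The structural change in set_block_size is that the per-8x8 grid now holds pointers into cm->mi, so every 8x8 cell covered by a block aliases one backing MODE_INFO instead of carrying its own copy. A simplified sketch of the aliasing pattern (the stub type stands in for MODE_INFO, and the in-frame bounds check above is omitted):

    typedef struct { int sb_type; } mode_info_t;  /* stand-in for MODE_INFO */

    /* Point every covered 8x8 cell at a single backing struct; the block
     * size is written exactly once. */
    static void alias_block(mode_info_t **grid, mode_info_t *pool, int stride,
                            int row, int col, int bs, int bsize) {
      const int idx = row * stride + col;
      int r, c;
      grid[idx] = &pool[idx];
      grid[idx]->sb_type = bsize;
      for (r = 0; r < bs; ++r)
        for (c = 0; c < bs; ++c)
          grid[idx + r * stride + c] = grid[idx];
    }

One consequence that runs through the rest of the diff: callers must now follow a pointer (mi_8x8[0]->mbmi) and tolerate NULL entries, as copy_partitioning and get_sb_partition_size_range do.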
typedef struct {
@@ -931,9 +999,9 @@ typedef enum {
V64X64,
} TREE_LEVEL;
-static void tree_to_node(void *data, BLOCK_SIZE_TYPE block_size, vt_node *node) {
+static void tree_to_node(void *data, BLOCK_SIZE bsize, vt_node *node) {
int i;
- switch (block_size) {
+ switch (bsize) {
case BLOCK_64X64: {
v64x64 *vt = (v64x64 *) data;
node->vt = &vt->vt;
@@ -990,9 +1058,9 @@ void sum_2_variances(var *r, var *a, var*b) {
a->sum_error + b->sum_error, a->count + b->count);
}
-static void fill_variance_tree(void *data, BLOCK_SIZE_TYPE block_size) {
+static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
vt_node node;
- tree_to_node(data, block_size, &node);
+ tree_to_node(data, bsize, &node);
sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]);
sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]);
sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]);
@@ -1002,7 +1070,7 @@ static void fill_variance_tree(void *data, BLOCK_SIZE_TYPE block_size) {
#if PERFORM_RANDOM_PARTITIONING
static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
- BLOCK_SIZE_TYPE block_size, int mi_row,
+ BLOCK_SIZE block_size, int mi_row,
int mi_col, int mi_size) {
VP9_COMMON * const cm = &cpi->common;
vt_node vt;
@@ -1038,30 +1106,30 @@ static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
return 0;
}
-#else
+#else // !PERFORM_RANDOM_PARTITIONING
-static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
- BLOCK_SIZE_TYPE block_size, int mi_row,
+static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO **m,
+ BLOCK_SIZE bsize, int mi_row,
int mi_col, int mi_size) {
VP9_COMMON * const cm = &cpi->common;
vt_node vt;
const int mis = cm->mode_info_stride;
int64_t threshold = 50 * cpi->common.base_qindex;
- tree_to_node(data, block_size, &vt);
+ tree_to_node(data, bsize, &vt);
// split none is available only if we have more than half a block size
// in width and height inside the visible image
if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows
&& vt.vt->none.variance < threshold) {
- set_block_size(cm, m, block_size, mis, mi_row, mi_col);
+ set_block_size(cm, m, bsize, mis, mi_row, mi_col);
return 1;
}
// vertical split is available on all but the bottom border
if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
&& vt.vt->vert[1].variance < threshold) {
- set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row,
+ set_block_size(cm, m, get_subsize(bsize, PARTITION_VERT), mis, mi_row,
mi_col);
return 1;
}
@@ -1069,17 +1137,17 @@ static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
// horizontal split is available on all but the right border
if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
&& vt.vt->horz[1].variance < threshold) {
- set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row,
+ set_block_size(cm, m, get_subsize(bsize, PARTITION_HORZ), mis, mi_row,
mi_col);
return 1;
}
return 0;
}
-#endif
+#endif // PERFORM_RANDOM_PARTITIONING
-static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
- int mi_col) {
+static void choose_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+ int mi_row, int mi_col) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK *x = &cpi->mb;
MACROBLOCKD *xd = &cpi->mb.e_mbd;
@@ -1095,7 +1163,7 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
int pixels_wide = 64, pixels_high = 64;
vp9_zero(vt);
- set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64);
+ set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
if (xd->mb_to_right_edge < 0)
pixels_wide += (xd->mb_to_right_edge >> 3);
@@ -1122,13 +1190,16 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
&xd->scale_factor[0]);
setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col,
&xd->scale_factor[1]);
- xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME;
- xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64;
- vp9_find_best_ref_mvs(xd, m->mbmi.ref_mvs[m->mbmi.ref_frame[0]],
+
+ xd->this_mi->mbmi.ref_frame[0] = LAST_FRAME;
+ xd->this_mi->mbmi.sb_type = BLOCK_64X64;
+ vp9_find_best_ref_mvs(xd,
+ mi_8x8[0]->mbmi.ref_mvs[mi_8x8[0]->mbmi.ref_frame[0]],
&nearest_mv, &near_mv);
- xd->mode_info_context->mbmi.mv[0] = nearest_mv;
- vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_SIZE_SB64X64);
+ xd->this_mi->mbmi.mv[0] = nearest_mv;
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64);
+
d = xd->plane[0].dst.buf;
dp = xd->plane[0].dst.stride;
}
@@ -1165,24 +1236,24 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
// Now go through the entire structure, splitting every block size until
// we get to one that's got a variance lower than our threshold, or we
// hit 8x8.
- if (!set_vt_partitioning(cpi, &vt, m, BLOCK_64X64, mi_row, mi_col,
+ if (!set_vt_partitioning(cpi, &vt, mi_8x8, BLOCK_64X64, mi_row, mi_col,
4)) {
for (i = 0; i < 4; ++i) {
const int x32_idx = ((i & 1) << 2);
const int y32_idx = ((i >> 1) << 2);
- if (!set_vt_partitioning(cpi, &vt.split[i], m, BLOCK_32X32,
+ if (!set_vt_partitioning(cpi, &vt.split[i], mi_8x8, BLOCK_32X32,
(mi_row + y32_idx), (mi_col + x32_idx), 2)) {
for (j = 0; j < 4; ++j) {
const int x16_idx = ((j & 1) << 1);
const int y16_idx = ((j >> 1) << 1);
- if (!set_vt_partitioning(cpi, &vt.split[i].split[j], m,
+ if (!set_vt_partitioning(cpi, &vt.split[i].split[j], mi_8x8,
BLOCK_16X16,
(mi_row + y32_idx + y16_idx),
(mi_col + x32_idx + x16_idx), 1)) {
for (k = 0; k < 4; ++k) {
const int x8_idx = (k & 1);
const int y8_idx = (k >> 1);
- set_block_size(cm, m, BLOCK_8X8, mis,
+ set_block_size(cm, mi_8x8, BLOCK_8X8, mis,
(mi_row + y32_idx + y16_idx + y8_idx),
(mi_col + x32_idx + x16_idx + x8_idx));
}
@@ -1193,9 +1264,10 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
}
}
-static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
- int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize,
- int *rate, int64_t *dist, int do_recon) {
+static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *rate, int64_t *dist,
+ int do_recon) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD *xd = &cpi->mb.e_mbd;
@@ -1208,7 +1280,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
int bss = (1 << bsl) / 4;
int i, pl;
PARTITION_TYPE partition = PARTITION_NONE;
- BLOCK_SIZE_TYPE subsize;
+ BLOCK_SIZE subsize;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
int last_part_rate = INT_MAX;
@@ -1219,9 +1291,9 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
int64_t none_dist = INT_MAX;
int chosen_rate = INT_MAX;
int64_t chosen_dist = INT_MAX;
- BLOCK_SIZE_TYPE sub_subsize = BLOCK_4X4;
+ BLOCK_SIZE sub_subsize = BLOCK_4X4;
int splits_below = 0;
- BLOCK_SIZE_TYPE bs_type = m->mbmi.sb_type;
+ BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
@@ -1230,7 +1302,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
subsize = get_subsize(bsize, partition);
- if (bsize < BLOCK_SIZE_SB8X8) {
+ if (bsize < BLOCK_8X8) {
// When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
// there is nothing to be done.
if (xd->ab_index != 0) {
@@ -1244,17 +1316,17 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
x->fast_ms = 0;
- x->pred_mv.as_int = 0;
x->subblock_ref = 0;
if (cpi->sf.adjust_partitioning_from_last_frame) {
// Check if any of the sub blocks are further split.
- if (partition == PARTITION_SPLIT && subsize > BLOCK_SIZE_SB8X8) {
+ if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
sub_subsize = get_subsize(subsize, PARTITION_SPLIT);
splits_below = 1;
for (i = 0; i < 4; i++) {
int jj = i >> 1, ii = i & 0x01;
- if (m[jj * bss * mis + ii * bss].mbmi.sb_type >= sub_subsize) {
+ MODE_INFO * this_mi = mi_8x8[jj * bss * mis + ii * bss];
+ if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) {
splits_below = 0;
}
}
@@ -1274,7 +1346,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
none_rate += x->partition_cost[pl][PARTITION_NONE];
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- m->mbmi.sb_type = bs_type;
+ mi_8x8[0]->mbmi.sb_type = bs_type;
*(get_sb_partitioning(x, bsize)) = subsize;
}
}
@@ -1285,16 +1357,16 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
bsize, get_block_context(x, bsize), INT64_MAX);
break;
case PARTITION_HORZ:
- *(get_sb_index(xd, subsize)) = 0;
+ *get_sb_index(xd, subsize) = 0;
pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
subsize, get_block_context(x, subsize), INT64_MAX);
if (last_part_rate != INT_MAX &&
- bsize >= BLOCK_SIZE_SB8X8 && mi_row + (mh >> 1) < cm->mi_rows) {
+ bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) {
int rt = 0;
int64_t dt = 0;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- *(get_sb_index(xd, subsize)) = 1;
+ *get_sb_index(xd, subsize) = 1;
pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
get_block_context(x, subsize), INT64_MAX);
if (rt == INT_MAX || dt == INT_MAX) {
@@ -1308,16 +1380,16 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
}
break;
case PARTITION_VERT:
- *(get_sb_index(xd, subsize)) = 0;
+ *get_sb_index(xd, subsize) = 0;
pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
subsize, get_block_context(x, subsize), INT64_MAX);
if (last_part_rate != INT_MAX &&
- bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
+ bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
int rt = 0;
int64_t dt = 0;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- *(get_sb_index(xd, subsize)) = 1;
+ *get_sb_index(xd, subsize) = 1;
pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
get_block_context(x, subsize), INT64_MAX);
if (rt == INT_MAX || dt == INT_MAX) {
@@ -1343,10 +1415,11 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
- *(get_sb_index(xd, subsize)) = i;
+ *get_sb_index(xd, subsize) = i;
- rd_use_partition(cpi, m + jj * bss * mis + ii * bss, tp, mi_row + y_idx,
- mi_col + x_idx, subsize, &rt, &dt, i != 3);
+ rd_use_partition(cpi, mi_8x8 + jj * bss * mis + ii * bss, tp,
+ mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt,
+ i != 3);
if (rt == INT_MAX || dt == INT_MAX) {
last_part_rate = INT_MAX;
last_part_dist = INT_MAX;
@@ -1365,10 +1438,10 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
last_part_rate += x->partition_cost[pl][partition];
if (cpi->sf.adjust_partitioning_from_last_frame
- && partition != PARTITION_SPLIT && bsize > BLOCK_SIZE_SB8X8
+ && partition != PARTITION_SPLIT && bsize > BLOCK_8X8
&& (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows)
&& (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == cm->mi_cols)) {
- BLOCK_SIZE_TYPE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
+ BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
split_rate = 0;
split_dist = 0;
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -1386,9 +1459,9 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
|| (mi_col + x_idx >= cm->mi_cols))
continue;
- *(get_sb_index(xd, split_subsize)) = i;
- *(get_sb_partitioning(x, bsize)) = split_subsize;
- *(get_sb_partitioning(x, split_subsize)) = split_subsize;
+ *get_sb_index(xd, split_subsize) = i;
+ *get_sb_partitioning(x, bsize) = split_subsize;
+ *get_sb_partitioning(x, split_subsize) = split_subsize;
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -1427,8 +1500,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
// If last_part is better set the partitioning to that...
if (RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist)
< RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)) {
- m->mbmi.sb_type = bsize;
- if (bsize >= BLOCK_SIZE_SB8X8)
+ mi_8x8[0]->mbmi.sb_type = bsize;
+ if (bsize >= BLOCK_8X8)
*(get_sb_partitioning(x, bsize)) = subsize;
chosen_rate = last_part_rate;
chosen_dist = last_part_dist;
@@ -1436,7 +1509,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
// If none was better set the partitioning to that...
if (RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)
> RDCOST(x->rdmult, x->rddiv, none_rate, none_dist)) {
- if (bsize >= BLOCK_SIZE_SB8X8)
+ if (bsize >= BLOCK_8X8)
*(get_sb_partitioning(x, bsize)) = bsize;
chosen_rate = none_rate;
chosen_dist = none_dist;
@@ -1446,37 +1519,68 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
// We must have chosen a partitioning and encoding or we'll fail later on.
// No other opportunities for success.
- if ( bsize == BLOCK_SIZE_SB64X64)
+ if ( bsize == BLOCK_64X64)
assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX);
if (do_recon)
- encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
+ encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
*rate = chosen_rate;
*dist = chosen_dist;
}
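Every comparison in the function above reduces rate and distortion to one scalar via RDCOST. At this revision the macro in vp9_rdopt.h has the form restated below (quoted from memory, so worth confirming against the header):

    /* rd = ((128 + rate * rdmult) >> 8) + (dist << rddiv); lower is better. */
    #define RDCOST(RM, DM, R, D) (((128 + (R) * (RM)) >> 8) + ((D) << (DM)))

so x->rdmult weights the bit cost, x->rddiv scales distortion, and a candidate partitioning wins only when its combined cost is strictly smaller.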
-static BLOCK_SIZE_TYPE min_partition_size[BLOCK_SIZE_TYPES] =
- { BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,
- BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8,
- BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 };
-static BLOCK_SIZE_TYPE max_partition_size[BLOCK_SIZE_TYPES] =
- { BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
- BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64,
- BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64 };
+static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
+ BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8,
+ BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16
+};
+
+static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
+ BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
+ BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64,
+ BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64
+};
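As a worked example of the leeway these tables give (assuming the usual BLOCK_SIZE enum order): if the neighbouring SB64s used only sizes between BLOCK_16X16 and BLOCK_32X32, then min_partition_size[BLOCK_16X16] == BLOCK_8X8 and max_partition_size[BLOCK_32X32] == BLOCK_64X64, so the search range is widened by one square step at each end.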
+// Look at all the mode_info entries for blocks that are part of this
+// partition and find the min and max values for sb_type.
+// At the moment this is designed to work on a 64x64 SB but could be
+// adjusted to use a size parameter.
+//
+// The min and max are assumed to have been initialized prior to calling this
+// function so that repeated calls can accumulate a min and max over more than one SB64.
+static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+                                        BLOCK_SIZE *min_block_size,
+                                        BLOCK_SIZE *max_block_size) {
+ MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ int sb_width_in_blocks = MI_BLOCK_SIZE;
+ int sb_height_in_blocks = MI_BLOCK_SIZE;
+ int i, j;
+ int index = 0;
+
+ // Check the sb_type for each block that belongs to this region.
+ for (i = 0; i < sb_height_in_blocks; ++i) {
+ for (j = 0; j < sb_width_in_blocks; ++j) {
+ MODE_INFO * mi = mi_8x8[index+j];
+ BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : 0;
+ *min_block_size = MIN(*min_block_size, sb_type);
+ *max_block_size = MAX(*max_block_size, sb_type);
+ }
+ index += xd->mode_info_stride;
+ }
+}
// Look at neighboring blocks and set a min and max partition size based on
// what they chose.
-static void rd_auto_partition_range(VP9_COMP *cpi,
- BLOCK_SIZE_TYPE * min_block_size,
- BLOCK_SIZE_TYPE * max_block_size) {
+static void rd_auto_partition_range(VP9_COMP *cpi, int row, int col,
+ BLOCK_SIZE *min_block_size,
+ BLOCK_SIZE *max_block_size) {
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- const MODE_INFO *const mi = xd->mode_info_context;
- const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
- const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
- const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
- const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+ MODE_INFO ** mi_8x8 = xd->mi_8x8;
+ const int left_in_image = xd->left_available && mi_8x8[-1];
+ const int above_in_image = xd->up_available &&
+ mi_8x8[-xd->mode_info_stride];
+ MODE_INFO ** above_sb64_mi_8x8;
+ MODE_INFO ** left_sb64_mi_8x8;
// Frequency check
if (cpi->sf.auto_min_max_partition_count <= 0) {
@@ -1484,51 +1588,182 @@ static void rd_auto_partition_range(VP9_COMP *cpi,
cpi->sf.auto_min_max_partition_interval;
*min_block_size = BLOCK_4X4;
*max_block_size = BLOCK_64X64;
- return;
} else {
--cpi->sf.auto_min_max_partition_count;
+
+    // Set default values if there is no left or above neighbour.
+ if (!left_in_image && !above_in_image) {
+ *min_block_size = BLOCK_4X4;
+ *max_block_size = BLOCK_64X64;
+ } else {
+ VP9_COMMON *const cm = &cpi->common;
+ int row8x8_remaining = cm->cur_tile_mi_row_end - row;
+ int col8x8_remaining = cm->cur_tile_mi_col_end - col;
+ int bh, bw;
+
+ // Default "min to max" and "max to min"
+ *min_block_size = BLOCK_64X64;
+ *max_block_size = BLOCK_4X4;
+
+ // Find the min and max partition sizes used in the left SB64
+ if (left_in_image) {
+ left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE];
+ get_sb_partition_size_range(cpi, left_sb64_mi_8x8,
+ min_block_size, max_block_size);
+ }
+
+ // Find the min and max partition sizes used in the above SB64 taking
+ // the values found for left as a starting point.
+ if (above_in_image) {
+ above_sb64_mi_8x8 = &mi_8x8[-xd->mode_info_stride * MI_BLOCK_SIZE];
+ get_sb_partition_size_range(cpi, above_sb64_mi_8x8,
+ min_block_size, max_block_size);
+ }
+
+      // Give a bit of leeway on either side of the observed min and max.
+ *min_block_size = min_partition_size[*min_block_size];
+ *max_block_size = max_partition_size[*max_block_size];
+
+ // Check border cases where max and min from neighbours may not be legal.
+ *max_block_size = find_partition_size(*max_block_size,
+ row8x8_remaining, col8x8_remaining,
+ &bh, &bw);
+ *min_block_size = MIN(*min_block_size, *max_block_size);
+ }
}
+}
- // Check for edge cases
- if (!left_in_image && !above_in_image) {
- *min_block_size = BLOCK_4X4;
- *max_block_size = BLOCK_64X64;
- } else if (!left_in_image) {
- *min_block_size = min_partition_size[above_mbmi->sb_type];
- *max_block_size = max_partition_size[above_mbmi->sb_type];
- } else if (!above_in_image) {
- *min_block_size = min_partition_size[left_mbmi->sb_type];
- *max_block_size = max_partition_size[left_mbmi->sb_type];
- } else {
- *min_block_size =
- min_partition_size[MIN(left_mbmi->sb_type, above_mbmi->sb_type)];
- *max_block_size =
- max_partition_size[MAX(left_mbmi->sb_type, above_mbmi->sb_type)];
+static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+  // Only use the 8x8 result for non-HD videos.
+ // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0;
+ int use_8x8 = 1;
+
+ if (cm->frame_type && !cpi->is_src_frame_alt_ref &&
+ ((use_8x8 && bsize == BLOCK_16X16) ||
+ bsize == BLOCK_32X32 || bsize == BLOCK_64X64)) {
+ int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0;
+ PICK_MODE_CONTEXT *block_context = NULL;
+
+ if (bsize == BLOCK_16X16) {
+ block_context = x->sb8x8_context[xd->sb_index][xd->mb_index];
+ } else if (bsize == BLOCK_32X32) {
+ block_context = x->mb_context[xd->sb_index];
+ } else if (bsize == BLOCK_64X64) {
+ block_context = x->sb32_context;
+ }
+
+ if (block_context) {
+ ref0 = block_context[0].mic.mbmi.ref_frame[0];
+ ref1 = block_context[1].mic.mbmi.ref_frame[0];
+ ref2 = block_context[2].mic.mbmi.ref_frame[0];
+ ref3 = block_context[3].mic.mbmi.ref_frame[0];
+ }
+
+ // Currently, only consider 4 inter reference frames.
+ if (ref0 && ref1 && ref2 && ref3) {
+ int d01, d23, d02, d13;
+
+ // Motion vectors for the four subblocks.
+ int16_t mvr0 = block_context[0].mic.mbmi.mv[0].as_mv.row;
+ int16_t mvc0 = block_context[0].mic.mbmi.mv[0].as_mv.col;
+ int16_t mvr1 = block_context[1].mic.mbmi.mv[0].as_mv.row;
+ int16_t mvc1 = block_context[1].mic.mbmi.mv[0].as_mv.col;
+ int16_t mvr2 = block_context[2].mic.mbmi.mv[0].as_mv.row;
+ int16_t mvc2 = block_context[2].mic.mbmi.mv[0].as_mv.col;
+ int16_t mvr3 = block_context[3].mic.mbmi.mv[0].as_mv.row;
+ int16_t mvc3 = block_context[3].mic.mbmi.mv[0].as_mv.col;
+
+ // Adjust sign if ref is alt_ref.
+ if (cm->ref_frame_sign_bias[ref0]) {
+ mvr0 *= -1;
+ mvc0 *= -1;
+ }
+
+ if (cm->ref_frame_sign_bias[ref1]) {
+ mvr1 *= -1;
+ mvc1 *= -1;
+ }
+
+ if (cm->ref_frame_sign_bias[ref2]) {
+ mvr2 *= -1;
+ mvc2 *= -1;
+ }
+
+ if (cm->ref_frame_sign_bias[ref3]) {
+ mvr3 *= -1;
+ mvc3 *= -1;
+ }
+
+ // Calculate mv distances.
+ d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1));
+ d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3));
+ d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2));
+ d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3));
+
+ if (d01 < FAST_MOTION_MV_THRESH && d23 < FAST_MOTION_MV_THRESH &&
+ d02 < FAST_MOTION_MV_THRESH && d13 < FAST_MOTION_MV_THRESH) {
+ // Set fast motion search level.
+ x->fast_ms = 1;
+
+ if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 &&
+ d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) {
+ // Set fast motion search level.
+ x->fast_ms = 2;
+
+ if (!d01 && !d23 && !d02 && !d13) {
+ x->fast_ms = 3;
+ x->subblock_ref = ref0;
+ }
+ }
+ }
+ }
}
}
+static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+ vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
+}
+
+static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+ vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
+}
+
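These helpers replace the old single x->pred_mv field (note the removed x->pred_mv.as_int = 0 resets and, further down, the removed averaging of the four subblock MVs): pred_mv is now an array, zeroed once per SB row with vpx_memset and snapshotted per PICK_MODE_CONTEXT. A sketch of the resulting call pattern in rd_pick_partition below, assuming only what this diff shows:

    /* Save the predictor state once after the PARTITION_NONE search ... */
    if (cpi->sf.adaptive_motion_search)
      store_pred_mv(x, get_block_context(x, bsize));

    /* ... and reload it before each child search (SPLIT, HORZ, VERT). */
    if (cpi->sf.adaptive_motion_search)
      load_pred_mv(x, get_block_context(x, bsize));

so each partition type starts its motion search from the same predictor state rather than inheriting whatever the previous candidate left behind.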
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
- int mi_col, BLOCK_SIZE_TYPE bsize, int *rate,
+ int mi_col, BLOCK_SIZE bsize, int *rate,
int64_t *dist, int do_recon, int64_t best_rd) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD * const xd = &x->e_mbd;
- int bsl = b_width_log2(bsize), bs = 1 << bsl;
- int ms = bs / 2;
+ const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
TOKENEXTRA *tp_orig = *tp;
int i, pl;
- BLOCK_SIZE_TYPE subsize;
- int srate = INT_MAX;
- int64_t sdist = INT_MAX;
-
+ BLOCK_SIZE subsize;
+ int this_rate, sum_rate = 0, best_rate = INT_MAX;
+ int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX;
+ int64_t sum_rd = 0;
+ int do_split = bsize >= BLOCK_8X8;
+ int do_rect = 1;
+ // Override skipping rectangular partition operations for edge blocks
+ const int force_horz_split = (mi_row + ms >= cm->mi_rows);
+ const int force_vert_split = (mi_col + ms >= cm->mi_cols);
+
+ int partition_none_allowed = !force_horz_split && !force_vert_split;
+ int partition_horz_allowed = !force_vert_split && bsize >= BLOCK_8X8;
+ int partition_vert_allowed = !force_horz_split && bsize >= BLOCK_8X8;
+
+ int partition_split_done = 0;
(void) *tp_orig;
- if (bsize < BLOCK_SIZE_SB8X8) {
+ if (bsize < BLOCK_8X8) {
// When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
// there is nothing to be done.
if (xd->ab_index != 0) {
@@ -1539,320 +1774,228 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
}
assert(mi_height_log2(bsize) == mi_width_log2(bsize));
- save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
-
- // PARTITION_SPLIT
- if (!cpi->sf.auto_min_max_partition_size ||
- bsize >= cpi->sf.min_partition_size) {
- if (bsize > BLOCK_SIZE_SB8X8) {
- int r4 = 0;
- int64_t d4 = 0, sum_rd = 0;
- subsize = get_subsize(bsize, PARTITION_SPLIT);
-
- for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
- int x_idx = (i & 1) * (ms >> 1);
- int y_idx = (i >> 1) * (ms >> 1);
- int r = 0;
- int64_t d = 0;
-
- if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
- continue;
+  // Determine which partition types to search according to the speed features.
+  // The size thresholds set here must be square block sizes.
+ if (cpi->sf.auto_min_max_partition_size) {
+ partition_none_allowed &= (bsize <= cpi->sf.max_partition_size &&
+ bsize >= cpi->sf.min_partition_size);
+ partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size &&
+ bsize > cpi->sf.min_partition_size) ||
+ force_horz_split);
+ partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size &&
+ bsize > cpi->sf.min_partition_size) ||
+ force_vert_split);
+ do_split &= bsize > cpi->sf.min_partition_size;
+ }
+ if (cpi->sf.use_square_partition_only) {
+ partition_horz_allowed &= force_horz_split;
+ partition_vert_allowed &= force_vert_split;
+ }
- *(get_sb_index(xd, subsize)) = i;
- rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r,
- &d, i != 3, best_rd - sum_rd);
+ save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- if (r == INT_MAX) {
- r4 = INT_MAX;
- sum_rd = INT64_MAX;
- } else {
- r4 += r;
- d4 += d;
- sum_rd = RDCOST(x->rdmult, x->rddiv, r4, d4);
- }
- }
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- if (r4 != INT_MAX && i == 4) {
- r4 += x->partition_cost[pl][PARTITION_SPLIT];
- *(get_sb_partitioning(x, bsize)) = subsize;
- assert(r4 >= 0);
- assert(d4 >= 0);
- srate = r4;
- sdist = d4;
- best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r4, d4));
- }
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ if (cpi->sf.disable_split_var_thresh && partition_none_allowed) {
+ unsigned int source_variancey;
+ vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
+ source_variancey = get_sby_perpixel_variance(cpi, x, bsize);
+ if (source_variancey < cpi->sf.disable_split_var_thresh) {
+ do_split = 0;
+ if (source_variancey < cpi->sf.disable_split_var_thresh / 2)
+ do_rect = 0;
}
}
- // Use 4 subblocks' motion estimation results to speed up current
- // partition's checking.
- x->fast_ms = 0;
- x->pred_mv.as_int = 0;
- x->subblock_ref = 0;
-
- if (cpi->sf.using_small_partition_info &&
- (!cpi->sf.auto_min_max_partition_size ||
- (bsize <= cpi->sf.max_partition_size &&
- bsize >= cpi->sf.min_partition_size))) {
- // Only use 8x8 result for non HD videos.
- // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0;
- int use_8x8 = 1;
-
- if (cm->frame_type && !cpi->is_src_frame_alt_ref &&
- ((use_8x8 && bsize == BLOCK_16X16) ||
- bsize == BLOCK_32X32 || bsize == BLOCK_64X64)) {
- int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0;
- PICK_MODE_CONTEXT *block_context = NULL;
-
- if (bsize == BLOCK_16X16) {
- block_context = x->sb8x8_context[xd->sb_index][xd->mb_index];
- } else if (bsize == BLOCK_32X32) {
- block_context = x->mb_context[xd->sb_index];
- } else if (bsize == BLOCK_SIZE_SB64X64) {
- block_context = x->sb32_context;
- }
-
- if (block_context) {
- ref0 = block_context[0].mic.mbmi.ref_frame[0];
- ref1 = block_context[1].mic.mbmi.ref_frame[0];
- ref2 = block_context[2].mic.mbmi.ref_frame[0];
- ref3 = block_context[3].mic.mbmi.ref_frame[0];
+ // PARTITION_NONE
+ if (partition_none_allowed) {
+ pick_sb_modes(cpi, mi_row, mi_col, &this_rate, &this_dist, bsize,
+ get_block_context(x, bsize), best_rd);
+ if (this_rate != INT_MAX) {
+ if (bsize >= BLOCK_8X8) {
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ this_rate += x->partition_cost[pl][PARTITION_NONE];
}
+ sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
+ if (sum_rd < best_rd) {
+ int64_t stop_thresh = 2048;
+
+ best_rate = this_rate;
+ best_dist = this_dist;
+ best_rd = sum_rd;
+ if (bsize >= BLOCK_8X8)
+ *(get_sb_partitioning(x, bsize)) = bsize;
- // Currently, only consider 4 inter ref frames.
- if (ref0 && ref1 && ref2 && ref3) {
- int16_t mvr0 = 0, mvc0 = 0, mvr1 = 0, mvc1 = 0, mvr2 = 0, mvc2 = 0,
- mvr3 = 0, mvc3 = 0;
- int d01, d23, d02, d13; // motion vector distance between 2 blocks
-
- // Get each subblock's motion vectors.
- mvr0 = block_context[0].mic.mbmi.mv[0].as_mv.row;
- mvc0 = block_context[0].mic.mbmi.mv[0].as_mv.col;
- mvr1 = block_context[1].mic.mbmi.mv[0].as_mv.row;
- mvc1 = block_context[1].mic.mbmi.mv[0].as_mv.col;
- mvr2 = block_context[2].mic.mbmi.mv[0].as_mv.row;
- mvc2 = block_context[2].mic.mbmi.mv[0].as_mv.col;
- mvr3 = block_context[3].mic.mbmi.mv[0].as_mv.row;
- mvc3 = block_context[3].mic.mbmi.mv[0].as_mv.col;
-
- // Adjust sign if ref is alt_ref
- if (cm->ref_frame_sign_bias[ref0]) {
- mvr0 *= -1;
- mvc0 *= -1;
- }
-
- if (cm->ref_frame_sign_bias[ref1]) {
- mvr1 *= -1;
- mvc1 *= -1;
- }
-
- if (cm->ref_frame_sign_bias[ref2]) {
- mvr2 *= -1;
- mvc2 *= -1;
- }
+ // Adjust threshold according to partition size.
+ stop_thresh >>= 8 - (b_width_log2_lookup[bsize] +
+ b_height_log2_lookup[bsize]);
- if (cm->ref_frame_sign_bias[ref3]) {
- mvr3 *= -1;
- mvc3 *= -1;
+          // If the obtained distortion is very small, choose the current
+          // partition and stop splitting.
+ if (this_dist < stop_thresh) {
+ do_split = 0;
+ do_rect = 0;
}
+ }
+ }
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
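The early-exit threshold scales with block area: stop_thresh starts at 2048 and is right-shifted by 8 minus the summed width and height logs (in 4x4 units), so with this tree's b_*_log2 lookups a BLOCK_64X64 stops splitting below a distortion of 2048, BLOCK_32X32 below 512, BLOCK_16X16 below 128, and BLOCK_8X8 below 32.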
- // Calculate mv distances.
- d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1));
- d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3));
- d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2));
- d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3));
-
- if (d01 < 24 && d23 < 24 && d02 < 24 && d13 < 24) {
- // Set fast motion search level.
- x->fast_ms = 1;
+  // Store the estimated motion vector.
+ if (cpi->sf.adaptive_motion_search)
+ store_pred_mv(x, get_block_context(x, bsize));
- // Calculate prediction MV
- x->pred_mv.as_mv.row = (mvr0 + mvr1 + mvr2 + mvr3) >> 2;
- x->pred_mv.as_mv.col = (mvc0 + mvc1 + mvc2 + mvc3) >> 2;
+ // PARTITION_SPLIT
+ sum_rd = 0;
+ // TODO(jingning): use the motion vectors given by the above search as
+ // the starting point of motion search in the following partition type check.
+ if (do_split) {
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+ for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
+ const int x_idx = (i & 1) * ms;
+ const int y_idx = (i >> 1) * ms;
+
+ if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+ continue;
- if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 &&
- d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) {
- // Set fast motion search level.
- x->fast_ms = 2;
+ *get_sb_index(xd, subsize) = i;
+ if (cpi->sf.adaptive_motion_search)
+ load_pred_mv(x, get_block_context(x, bsize));
+ rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ &this_rate, &this_dist, i != 3, best_rd - sum_rd);
- if (!d01 && !d23 && !d02 && !d13) {
- x->fast_ms = 3;
- x->subblock_ref = ref0;
- }
- }
- }
+ if (this_rate == INT_MAX) {
+ sum_rd = INT64_MAX;
+ } else {
+ sum_rate += this_rate;
+ sum_dist += this_dist;
+ sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
}
}
- }
-
- if (!cpi->sf.auto_min_max_partition_size ||
- bsize <= cpi->sf.max_partition_size) {
- int larger_is_better = 0;
- // PARTITION_NONE
- if ((mi_row + (ms >> 1) < cm->mi_rows) &&
- (mi_col + (ms >> 1) < cm->mi_cols)) {
- int r;
- int64_t d;
- pick_sb_modes(cpi, mi_row, mi_col, &r, &d, bsize,
- get_block_context(x, bsize), best_rd);
- if (r != INT_MAX && bsize >= BLOCK_SIZE_SB8X8) {
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- r += x->partition_cost[pl][PARTITION_NONE];
- }
-
- if (r != INT_MAX &&
- (bsize == BLOCK_SIZE_SB8X8 ||
- RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, srate, sdist))) {
- best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r, d));
- srate = r;
- sdist = d;
- larger_is_better = 1;
- if (bsize >= BLOCK_SIZE_SB8X8)
- *(get_sb_partitioning(x, bsize)) = bsize;
+ if (sum_rd < best_rd && i == 4) {
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ sum_rate += x->partition_cost[pl][PARTITION_SPLIT];
+ sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+ if (sum_rd < best_rd) {
+ best_rate = sum_rate;
+ best_dist = sum_dist;
+ best_rd = sum_rd;
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ } else {
+        // Skip the rectangular partition test when a larger block size
+        // gives a better RD cost.
+ if (cpi->sf.less_rectangular_check)
+ do_rect &= !partition_none_allowed;
}
}
+ partition_split_done = 1;
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
- if (bsize == BLOCK_SIZE_SB8X8) {
- int r4 = 0;
- int64_t d4 = 0, sum_rd = 0;
- subsize = get_subsize(bsize, PARTITION_SPLIT);
-
- for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
- int x_idx = (i & 1) * (ms >> 1);
- int y_idx = (i >> 1) * (ms >> 1);
- int r = 0;
- int64_t d = 0;
-
- if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
- continue;
-
- *(get_sb_index(xd, subsize)) = i;
- rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r,
- &d, i != 3, best_rd - sum_rd);
+ x->fast_ms = 0;
+ x->subblock_ref = 0;
- if (r == INT_MAX) {
- r4 = INT_MAX;
- sum_rd = INT64_MAX;
- } else {
- r4 += r;
- d4 += d;
- sum_rd = RDCOST(x->rdmult, x->rddiv, r4, d4);
- }
+ if (partition_split_done &&
+ cpi->sf.using_small_partition_info) {
+ compute_fast_motion_search_level(cpi, bsize);
+ }
+
+ // PARTITION_HORZ
+ if (partition_horz_allowed && do_rect) {
+ subsize = get_subsize(bsize, PARTITION_HORZ);
+ *get_sb_index(xd, subsize) = 0;
+ if (cpi->sf.adaptive_motion_search)
+ load_pred_mv(x, get_block_context(x, bsize));
+ pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+ get_block_context(x, subsize), best_rd);
+ sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+
+ if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
+ update_state(cpi, get_block_context(x, subsize), subsize, 0);
+ encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+
+ *get_sb_index(xd, subsize) = 1;
+ if (cpi->sf.adaptive_motion_search)
+ load_pred_mv(x, get_block_context(x, bsize));
+ pick_sb_modes(cpi, mi_row + ms, mi_col, &this_rate,
+ &this_dist, subsize, get_block_context(x, subsize),
+ best_rd - sum_rd);
+ if (this_rate == INT_MAX) {
+ sum_rd = INT64_MAX;
+ } else {
+ sum_rate += this_rate;
+ sum_dist += this_dist;
+ sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
}
+ }
+ if (sum_rd < best_rd) {
set_partition_seg_context(cm, xd, mi_row, mi_col);
pl = partition_plane_context(xd, bsize);
- if (r4 != INT_MAX && i == 4) {
- r4 += x->partition_cost[pl][PARTITION_SPLIT];
- if (RDCOST(x->rdmult, x->rddiv, r4, d4) <
- RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
- srate = r4;
- sdist = d4;
- larger_is_better = 0;
- *(get_sb_partitioning(x, bsize)) = subsize;
- best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r4, d4));
- }
+ sum_rate += x->partition_cost[pl][PARTITION_HORZ];
+ sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+ if (sum_rd < best_rd) {
+ best_rd = sum_rd;
+ best_rate = sum_rate;
+ best_dist = sum_dist;
+ *(get_sb_partitioning(x, bsize)) = subsize;
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
}
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
- if (!cpi->sf.use_square_partition_only &&
- (!cpi->sf.less_rectangular_check ||!larger_is_better)) {
- // PARTITION_HORZ
- if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
- int r2, r = 0;
- int64_t d2, d = 0, h_rd;
- subsize = get_subsize(bsize, PARTITION_HORZ);
- *(get_sb_index(xd, subsize)) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, &r2, &d2, subsize,
- get_block_context(x, subsize), best_rd);
- h_rd = RDCOST(x->rdmult, x->rddiv, r2, d2);
-
- if (r2 != INT_MAX && h_rd < best_rd &&
- mi_row + (ms >> 1) < cm->mi_rows) {
- update_state(cpi, get_block_context(x, subsize), subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-
- *(get_sb_index(xd, subsize)) = 1;
- pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &r, &d, subsize,
- get_block_context(x, subsize), best_rd - h_rd);
- if (r == INT_MAX) {
- r2 = INT_MAX;
- } else {
- r2 += r;
- d2 += d;
- }
- }
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- if (r2 < INT_MAX)
- r2 += x->partition_cost[pl][PARTITION_HORZ];
- if (r2 != INT_MAX && RDCOST(x->rdmult, x->rddiv, r2, d2)
- < RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
- best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r2, d2));
- srate = r2;
- sdist = d2;
- *(get_sb_partitioning(x, bsize)) = subsize;
- }
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ // PARTITION_VERT
+ if (partition_vert_allowed && do_rect) {
+ subsize = get_subsize(bsize, PARTITION_VERT);
+
+ *get_sb_index(xd, subsize) = 0;
+ if (cpi->sf.adaptive_motion_search)
+ load_pred_mv(x, get_block_context(x, bsize));
+ pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+ get_block_context(x, subsize), best_rd);
+ sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+ if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
+ update_state(cpi, get_block_context(x, subsize), subsize, 0);
+ encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+
+ *get_sb_index(xd, subsize) = 1;
+ if (cpi->sf.adaptive_motion_search)
+ load_pred_mv(x, get_block_context(x, bsize));
+ pick_sb_modes(cpi, mi_row, mi_col + ms, &this_rate,
+ &this_dist, subsize, get_block_context(x, subsize),
+ best_rd - sum_rd);
+ if (this_rate == INT_MAX) {
+ sum_rd = INT64_MAX;
+ } else {
+ sum_rate += this_rate;
+ sum_dist += this_dist;
+ sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
}
-
- // PARTITION_VERT
- if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) {
- int r2;
- int64_t d2, v_rd;
- subsize = get_subsize(bsize, PARTITION_VERT);
- *(get_sb_index(xd, subsize)) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, &r2, &d2, subsize,
- get_block_context(x, subsize), best_rd);
- v_rd = RDCOST(x->rdmult, x->rddiv, r2, d2);
- if (r2 != INT_MAX && v_rd < best_rd &&
- mi_col + (ms >> 1) < cm->mi_cols) {
- int r = 0;
- int64_t d = 0;
- update_state(cpi, get_block_context(x, subsize), subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-
- *(get_sb_index(xd, subsize)) = 1;
- pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &r, &d, subsize,
- get_block_context(x, subsize), best_rd - v_rd);
- if (r == INT_MAX) {
- r2 = INT_MAX;
- } else {
- r2 += r;
- d2 += d;
- }
- }
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- if (r2 < INT_MAX)
- r2 += x->partition_cost[pl][PARTITION_VERT];
- if (r2 != INT_MAX &&
- RDCOST(x->rdmult, x->rddiv, r2, d2)
- < RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
- srate = r2;
- sdist = d2;
- *(get_sb_partitioning(x, bsize)) = subsize;
- }
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
+ if (sum_rd < best_rd) {
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ sum_rate += x->partition_cost[pl][PARTITION_VERT];
+ sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+ if (sum_rd < best_rd) {
+ best_rate = sum_rate;
+ best_dist = sum_dist;
+ best_rd = sum_rd;
+ *(get_sb_partitioning(x, bsize)) = subsize;
}
}
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
}
- *rate = srate;
- *dist = sdist;
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- if (srate < INT_MAX && sdist < INT_MAX && do_recon)
- encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
+ *rate = best_rate;
+ *dist = best_dist;
- if (bsize == BLOCK_SIZE_SB64X64) {
+ if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon)
+ encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+ if (bsize == BLOCK_64X64) {
assert(tp_orig < *tp);
- assert(srate < INT_MAX);
- assert(sdist < INT_MAX);
+ assert(best_rate < INT_MAX);
+    assert(best_dist < INT64_MAX);
} else {
assert(tp_orig == *tp);
}
@@ -1863,7 +2006,7 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD * const xd = &x->e_mbd;
- int bsl = b_width_log2(BLOCK_SIZE_SB64X64), bs = 1 << bsl;
+ int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl;
int ms = bs / 2;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
@@ -1871,7 +2014,7 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) {
int r;
int64_t d;
- save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64);
+ save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
  // Default is no mask (all reference frames allowed).
cpi->ref_frame_mask = 0;
@@ -1880,17 +2023,17 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) {
if ((mi_row + (ms >> 1) < cm->mi_rows) &&
(mi_col + (ms >> 1) < cm->mi_cols)) {
cpi->set_ref_frame_mask = 1;
- pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_SIZE_SB64X64,
- get_block_context(x, BLOCK_SIZE_SB64X64), INT64_MAX);
+ pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_64X64,
+ get_block_context(x, BLOCK_64X64), INT64_MAX);
set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
+ pl = partition_plane_context(xd, BLOCK_64X64);
r += x->partition_cost[pl][PARTITION_NONE];
- *(get_sb_partitioning(x, BLOCK_SIZE_SB64X64)) = BLOCK_SIZE_SB64X64;
+ *(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64;
cpi->set_ref_frame_mask = 0;
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64);
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
}
static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
@@ -1908,12 +2051,7 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
int dummy_rate;
int64_t dummy_dist;
- // Initialize a mask of modes that we will not consider;
- // cpi->unused_mode_skip_mask = 0x0000000AAE17F800 (test no golden)
- if (cpi->common.frame_type == KEY_FRAME)
- cpi->unused_mode_skip_mask = 0;
- else
- cpi->unused_mode_skip_mask = 0xFFFFFFFFFFFFFE00;
+ vpx_memset(cpi->mb.pred_mv, 0, sizeof(cpi->mb.pred_mv));
if (cpi->sf.reference_masking)
rd_pick_reference_frame(cpi, mi_row, mi_col);
@@ -1921,18 +2059,18 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning ||
cpi->sf.use_one_partition_size_always ) {
const int idx_str = cm->mode_info_stride * mi_row + mi_col;
- MODE_INFO *m = cm->mi + idx_str;
- MODE_INFO *p = cm->prev_mi + idx_str;
+ MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
+ MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
cpi->mb.source_variance = UINT_MAX;
if (cpi->sf.use_one_partition_size_always) {
- set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64);
- set_partitioning(cpi, m, cpi->sf.always_this_block_size);
- rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
+ set_partitioning(cpi, mi_8x8, mi_row, mi_col);
+ rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1);
} else if (cpi->sf.partition_by_variance) {
- choose_partitioning(cpi, cm->mi, mi_row, mi_col);
- rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ choose_partitioning(cpi, cm->mi_grid_visible, mi_row, mi_col);
+ rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1);
} else {
if ((cpi->common.current_video_frame
@@ -1943,26 +2081,28 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
|| cpi->is_src_frame_alt_ref) {
// If required set upper and lower partition size limits
if (cpi->sf.auto_min_max_partition_size) {
- rd_auto_partition_range(cpi,
+ set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
+ rd_auto_partition_range(cpi, mi_row, mi_col,
&cpi->sf.min_partition_size,
&cpi->sf.max_partition_size);
}
- rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1, INT64_MAX);
} else {
- copy_partitioning(cpi, m, p);
- rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ copy_partitioning(cpi, mi_8x8, prev_mi_8x8);
+ rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1);
}
}
} else {
// If required set upper and lower partition size limits
if (cpi->sf.auto_min_max_partition_size) {
- rd_auto_partition_range(cpi, &cpi->sf.min_partition_size,
+ set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
+ rd_auto_partition_range(cpi, mi_row, mi_col,
+ &cpi->sf.min_partition_size,
&cpi->sf.max_partition_size);
}
-
- rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1, INT64_MAX);
}
}
@@ -1993,8 +2133,8 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
- xd->mode_info_context->mbmi.mode = DC_PRED;
- xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+ xd->this_mi->mbmi.mode = DC_PRED;
+ xd->this_mi->mbmi.uv_mode = DC_PRED;
vp9_zero(cpi->y_mode_count)
vp9_zero(cpi->y_uv_mode_count)
@@ -2023,7 +2163,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
cpi->mb.optimize = 0;
- cpi->mb.e_mbd.lf.filter_level = 0;
+ cpi->common.lf.filter_level = 0;
cpi->zbin_mode_boost_enabled = 0;
cpi->common.tx_mode = ONLY_4X4;
} else {
@@ -2070,8 +2210,14 @@ static void encode_frame_internal(VP9_COMP *cpi) {
vp9_zero(cm->counts.switchable_interp);
vp9_zero(cpi->txfm_stepdown_count);
- xd->mode_info_context = cm->mi;
- xd->prev_mode_info_context = cm->prev_mi;
+ xd->mi_8x8 = cm->mi_grid_visible;
+ // required for vp9_frame_init_quantizer
+ xd->this_mi =
+ xd->mi_8x8[0] = cm->mi;
+ xd->mic_stream_ptr = cm->mi;
+
+ xd->last_mi = cm->prev_mi;
+
vp9_zero(cpi->NMVcount);
vp9_zero(cpi->coef_counts);
@@ -2095,7 +2241,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
build_activity_map(cpi);
}
- // re-initencode frame context.
+ // Re-initialize encode frame context.
init_encode_frame_mb_context(cpi);
vp9_zero(cpi->rd_comp_pred_diff);
@@ -2164,10 +2310,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
}
static int check_dual_ref_flags(VP9_COMP *cpi) {
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
- int ref_flags = cpi->ref_frame_flags;
+ const int ref_flags = cpi->ref_frame_flags;
- if (vp9_segfeature_active(&xd->seg, 1, SEG_LVL_REF_FRAME)) {
+ if (vp9_segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) {
return 0;
} else {
return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG)
@@ -2175,12 +2320,12 @@ static int check_dual_ref_flags(VP9_COMP *cpi) {
}
}
-static int get_skip_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs) {
+static int get_skip_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs) {
int x, y;
for (y = 0; y < ymbs; y++) {
for (x = 0; x < xmbs; x++) {
- if (!mi[y * mis + x].mbmi.mb_skip_coeff)
+ if (!mi_8x8[y * mis + x]->mbmi.skip_coeff)
return 0;
}
}
@@ -2188,85 +2333,75 @@ static int get_skip_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs) {
return 1;
}
-static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs,
- TX_SIZE txfm_size) {
+static void set_txfm_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs,
+ TX_SIZE tx_size) {
int x, y;
for (y = 0; y < ymbs; y++) {
for (x = 0; x < xmbs; x++)
- mi[y * mis + x].mbmi.txfm_size = txfm_size;
+ mi_8x8[y * mis + x]->mbmi.tx_size = tx_size;
}
}
-static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi, int mis,
- TX_SIZE txfm_max, int bw, int bh, int mi_row,
- int mi_col, BLOCK_SIZE_TYPE bsize) {
+static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+ int mis, TX_SIZE max_tx_size, int bw, int bh,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
VP9_COMMON * const cm = &cpi->common;
- MB_MODE_INFO * const mbmi = &mi->mbmi;
+ MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- if (mbmi->txfm_size > txfm_max) {
- MACROBLOCK * const x = &cpi->mb;
- MACROBLOCKD * const xd = &x->e_mbd;
+ if (mbmi->tx_size > max_tx_size) {
const int ymbs = MIN(bh, cm->mi_rows - mi_row);
const int xmbs = MIN(bw, cm->mi_cols - mi_col);
- xd->mode_info_context = mi;
- assert(vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
- get_skip_flag(mi, mis, ymbs, xmbs));
- set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
+ assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
+ get_skip_flag(mi_8x8, mis, ymbs, xmbs));
+ set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size);
}
}
-static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
- TX_SIZE txfm_max, int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
+static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+ TX_SIZE max_tx_size, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
VP9_COMMON * const cm = &cpi->common;
const int mis = cm->mode_info_stride;
- int bwl, bhl;
- const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
+ int bw, bh;
+ const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- bwl = mi_width_log2(mi->mbmi.sb_type);
- bhl = mi_height_log2(mi->mbmi.sb_type);
+ bw = num_8x8_blocks_wide_lookup[mi_8x8[0]->mbmi.sb_type];
+ bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type];
- if (bwl == bsl && bhl == bsl) {
- reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl, mi_row,
+ if (bw == bs && bh == bs) {
+ reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, bs, bs, mi_row,
+ mi_col, bsize);
+ } else if (bw == bs && bh < bs) {
+ reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, bs, hbs, mi_row,
mi_col, bsize);
- } else if (bwl == bsl && bhl < bsl) {
- reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs, mi_row, mi_col,
- bsize);
- reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs,
- mi_row + bs, mi_col, bsize);
- } else if (bwl < bsl && bhl == bsl) {
- reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl, mi_row, mi_col,
- bsize);
- reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl, mi_row,
- mi_col + bs, bsize);
+ reset_skip_txfm_size_b(cpi, mi_8x8 + hbs * mis, mis, max_tx_size, bs, hbs,
+ mi_row + hbs, mi_col, bsize);
+ } else if (bw < bs && bh == bs) {
+ reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, hbs, bs, mi_row,
+ mi_col, bsize);
+ reset_skip_txfm_size_b(cpi, mi_8x8 + hbs, mis, max_tx_size, hbs, bs, mi_row,
+ mi_col + hbs, bsize);
+
} else {
- BLOCK_SIZE_TYPE subsize;
+ const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
int n;
- assert(bwl < bsl && bhl < bsl);
- if (bsize == BLOCK_64X64) {
- subsize = BLOCK_32X32;
- } else if (bsize == BLOCK_32X32) {
- subsize = BLOCK_16X16;
- } else {
- assert(bsize == BLOCK_16X16);
- subsize = BLOCK_8X8;
- }
+ assert(bw < bs && bh < bs);
for (n = 0; n < 4; n++) {
- const int y_idx = n >> 1, x_idx = n & 0x01;
+ const int mi_dc = hbs * (n & 1);
+ const int mi_dr = hbs * (n >> 1);
- reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, txfm_max,
- mi_row + y_idx * bs, mi_col + x_idx * bs,
- subsize);
+ reset_skip_txfm_size_sb(cpi, &mi_8x8[mi_dr * mis + mi_dc], max_tx_size,
+ mi_row + mi_dr, mi_col + mi_dc, subsize);
}
}
}
@@ -2275,13 +2410,14 @@ static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
VP9_COMMON * const cm = &cpi->common;
int mi_row, mi_col;
const int mis = cm->mode_info_stride;
- MODE_INFO *mi, *mi_ptr = cm->mi;
+ MODE_INFO **mi_8x8, **mi_ptr = cm->mi_grid_visible;
for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) {
- mi = mi_ptr;
- for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi += 8) {
- reset_skip_txfm_size_sb(cpi, mi, txfm_max, mi_row, mi_col,
- BLOCK_SIZE_SB64X64);
+ mi_8x8 = mi_ptr;
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi_8x8 += 8) {
+ reset_skip_txfm_size_sb(cpi, mi_8x8, txfm_max, mi_row, mi_col,
+ BLOCK_64X64);
}
}
}
@@ -2334,7 +2470,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
// decoder such that we allow compound where one of the 3 buffers has a
// different sign bias and that buffer is then the fixed ref. However, this
// requires further work in the rd loop. For now the only supported encoder
- // side behaviour is where the ALT ref buffer has opposite sign bias to
+ // side behavior is where the ALT ref buffer has opposite sign bias to
// the other two.
if ((cm->ref_frame_sign_bias[ALTREF_FRAME]
== cm->ref_frame_sign_bias[GOLDEN_FRAME])
@@ -2387,27 +2523,26 @@ void vp9_encode_frame(VP9_COMP *cpi) {
cpi->rd_filter_threshes[frame_type][1] >
cpi->rd_filter_threshes[frame_type][2] &&
cpi->rd_filter_threshes[frame_type][1] >
- cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) {
- filter_type = vp9_switchable_interp[1];
+ cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) {
+ filter_type = EIGHTTAP_SMOOTH;
} else if (cpi->rd_filter_threshes[frame_type][2] >
cpi->rd_filter_threshes[frame_type][0] &&
cpi->rd_filter_threshes[frame_type][2] >
- cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) {
- filter_type = vp9_switchable_interp[2];
+ cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) {
+ filter_type = EIGHTTAP_SHARP;
} else if (cpi->rd_filter_threshes[frame_type][0] >
- cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) {
- filter_type = vp9_switchable_interp[0];
+ cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) {
+ filter_type = EIGHTTAP;
} else {
filter_type = SWITCHABLE;
}
- /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */
-
cpi->mb.e_mbd.lossless = 0;
if (cpi->oxcf.lossless) {
cpi->mb.e_mbd.lossless = 1;
}
+ /* transform size selection (4x4, 8x8, 16x16 or select-per-mb) */
select_tx_mode(cpi);
cpi->common.comp_pred_mode = pred_type;
cpi->common.mcomp_filter_type = filter_type;
@@ -2419,7 +2554,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
}
- for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs;
cpi->rd_filter_threshes[frame_type][i] =
(cpi->rd_filter_threshes[frame_type][i] + diff) / 2;
@@ -2495,29 +2630,22 @@ void vp9_encode_frame(VP9_COMP *cpi) {
}
-static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
- const MACROBLOCKD *xd = &x->e_mbd;
- const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
- const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode;
-
- ++cpi->y_uv_mode_count[m][uvm];
- if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
- const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
- const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
- const int bsl = MIN(bwl, bhl);
- ++cpi->y_mode_count[MIN(bsl, 3)][m];
- } else {
+static void sum_intra_stats(VP9_COMP *cpi, const MODE_INFO *mi) {
+ const MB_PREDICTION_MODE y_mode = mi->mbmi.mode;
+ const MB_PREDICTION_MODE uv_mode = mi->mbmi.uv_mode;
+ const BLOCK_SIZE bsize = mi->mbmi.sb_type;
+
+ ++cpi->y_uv_mode_count[y_mode][uv_mode];
+
+ if (bsize < BLOCK_8X8) {
int idx, idy;
- int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[
- xd->mode_info_context->mbmi.sb_type];
- int num_4x4_blocks_high = num_4x4_blocks_high_lookup[
- xd->mode_info_context->mbmi.sb_type];
- for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
- for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
- int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode;
- ++cpi->y_mode_count[0][m];
- }
- }
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high)
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide)
+ ++cpi->y_mode_count[0][mi->bmi[idy * 2 + idx].as_mode];
+ } else {
+ ++cpi->y_mode_count[size_group_lookup[bsize]][y_mode];
}
}
@@ -2541,19 +2669,19 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
x->act_zbin_adj = 1 - (int) (((int64_t) a + (b >> 1)) / b);
#endif
}
-
static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
- int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD * const xd = &x->e_mbd;
- MODE_INFO *mi = xd->mode_info_context;
+ MODE_INFO **mi_8x8 = xd->mi_8x8;
+ MODE_INFO *mi = mi_8x8[0];
MB_MODE_INFO *mbmi = &mi->mbmi;
unsigned int segment_id = mbmi->segment_id;
const int mis = cm->mode_info_stride;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
- x->rd_search = 0;
+ x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame &&
xd->q_index < QIDX_SKIP_THRESH);
if (x->skip_encode)
@@ -2582,7 +2710,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
else
cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
- } else if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
+ } else if (mbmi->sb_type < BLOCK_8X8) {
cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;
} else {
cpi->zbin_mode_boost = MV_ZBIN_BOOST;
@@ -2595,13 +2723,11 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
vp9_update_zbin_extra(cpi, x);
}
- if (mbmi->ref_frame[0] == INTRA_FRAME) {
- vp9_encode_intra_block_y(
- cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
- vp9_encode_intra_block_uv(
- cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+ if (!is_inter_block(mbmi)) {
+ vp9_encode_intra_block_y(x, MAX(bsize, BLOCK_8X8));
+ vp9_encode_intra_block_uv(x, MAX(bsize, BLOCK_8X8));
if (output_enabled)
- sum_intra_stats(cpi, x);
+ sum_intra_stats(cpi, mi);
} else {
int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[0])];
YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx];
@@ -2619,44 +2745,37 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
&xd->scale_factor[1]);
- vp9_build_inter_predictors_sb(
- xd, mi_row, mi_col,
- bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8 : bsize);
+ vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
}
- if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) {
- vp9_tokenize_sb(cpi, t, !output_enabled,
- (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+ if (!is_inter_block(mbmi)) {
+ vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
} else if (!x->skip) {
- vp9_encode_sb(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
- vp9_tokenize_sb(cpi, t, !output_enabled,
- (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+ vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
+ vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
} else {
- int mb_skip_context = xd->left_available ? (mi - 1)->mbmi.mb_skip_coeff : 0;
- mb_skip_context += (mi - mis)->mbmi.mb_skip_coeff;
+ int mb_skip_context = xd->left_available ? mi_8x8[-1]->mbmi.skip_coeff : 0;
+ mb_skip_context += mi_8x8[-mis] ? mi_8x8[-mis]->mbmi.skip_coeff : 0;
- mbmi->mb_skip_coeff = 1;
+ mbmi->skip_coeff = 1;
if (output_enabled)
cm->counts.mbskip[mb_skip_context][1]++;
- vp9_reset_sb_tokens_context(
- xd, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+ reset_skip_context(xd, MAX(bsize, BLOCK_8X8));
}
- // copy skip flag on all mb_mode_info contexts in this SB
- // if this was a skip at this txfm size
- vp9_set_pred_flag_mbskip(cm, bsize, mi_row, mi_col, mi->mbmi.mb_skip_coeff);
-
if (output_enabled) {
if (cm->tx_mode == TX_MODE_SELECT &&
- mbmi->sb_type >= BLOCK_SIZE_SB8X8 &&
+ mbmi->sb_type >= BLOCK_8X8 &&
!(is_inter_block(mbmi) &&
- (mbmi->mb_skip_coeff ||
- vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)))) {
+ (mbmi->skip_coeff ||
+ vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) {
const uint8_t context = vp9_get_pred_context_tx_size(xd);
- update_tx_counts(bsize, context, mbmi->txfm_size, &cm->counts.tx);
+ update_tx_counts(bsize, context, mbmi->tx_size, &cm->counts.tx);
} else {
int x, y;
- TX_SIZE sz = (cm->tx_mode == TX_MODE_SELECT) ? TX_32X32 : cm->tx_mode;
+ TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode];
+ assert(sizeof(tx_mode_to_biggest_tx_size) /
+ sizeof(tx_mode_to_biggest_tx_size[0]) == TX_MODES);
// The new intra coding scheme requires no change of transform size
if (is_inter_block(&mi->mbmi)) {
if (sz == TX_32X32 && bsize < BLOCK_32X32)
@@ -2666,18 +2785,15 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
if (sz == TX_8X8 && bsize < BLOCK_8X8)
sz = TX_4X4;
} else if (bsize >= BLOCK_8X8) {
- sz = mbmi->txfm_size;
+ sz = mbmi->tx_size;
} else {
sz = TX_4X4;
}
- for (y = 0; y < mi_height; y++) {
- for (x = 0; x < mi_width; x++) {
- if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) {
- mi[mis * y + x].mbmi.txfm_size = sz;
- }
- }
- }
+ for (y = 0; y < mi_height; y++)
+ for (x = 0; x < mi_width; x++)
+ if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
+ mi_8x8[mis * y + x]->mbmi.tx_size = sz;
}
}
}
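
The reset_skip_txfm_size_sb() rewrite above replaces the log2-based size math
with direct 8x8-unit counts and walks the new mode-info pointer grid one
partition level at a time, offsetting each quadrant by half the block width
(hbs). A minimal sketch of that quadrant arithmetic follows, with the codec
types stubbed out; visit stands in for the per-block work, and the real
recursion stops at the coded block's own size rather than always reaching 8x8:

#include <stdio.h>

/* bs is the block width in 8x8 mode-info units (8 for a 64x64 superblock),
 * matching num_8x8_blocks_wide_lookup[]. */
static void walk_partition(int mi_row, int mi_col, int bs,
                           void (*visit)(int, int)) {
  const int hbs = bs / 2;
  int n;
  if (bs == 1) {                       /* one 8x8 mode-info cell */
    visit(mi_row, mi_col);
    return;
  }
  for (n = 0; n < 4; n++) {
    const int mi_dc = hbs * (n & 1);   /* column offset of quadrant n */
    const int mi_dr = hbs * (n >> 1);  /* row offset of quadrant n    */
    walk_partition(mi_row + mi_dr, mi_col + mi_dc, hbs, visit);
  }
}

static void print_cell(int r, int c) {
  printf("8x8 cell at mi(%d, %d)\n", r, c);
}

int main(void) {
  walk_partition(0, 0, 8, print_cell);  /* one 64x64 superblock */
  return 0;
}
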
diff --git a/libvpx/vp9/encoder/vp9_encodeintra.c b/libvpx/vp9/encoder/vp9_encodeintra.c
index edbd2d9..c5e5dff 100644
--- a/libvpx/vp9/encoder/vp9_encodeintra.c
+++ b/libvpx/vp9/encoder/vp9_encodeintra.c
@@ -15,14 +15,14 @@
#include "vp9/encoder/vp9_encodemb.h"
#include "vp9/encoder/vp9_encodeintra.h"
-int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
- MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
- (void) cpi;
+int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) {
+ MB_MODE_INFO * mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
x->skip_encode = 0;
mbmi->mode = DC_PRED;
mbmi->ref_frame[0] = INTRA_FRAME;
- mbmi->txfm_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ?
- TX_16X16 : TX_8X8) : TX_4X4;
- vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type);
+ mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16
+ : TX_8X8)
+ : TX_4X4;
+ vp9_encode_intra_block_y(x, mbmi->sb_type);
return vp9_get_mb_ss(x->plane[0].src_diff);
}
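
vp9_encode_intra() above ends by returning vp9_get_mb_ss() over the luma
residual, which the first pass uses as the intra error score. A sketch of that
sum-of-squares reduction, mirroring the C reference in vp9_variance_c.c (a
fixed 256-element loop over the 16x16 residual):

#include <stdint.h>

static unsigned int mb_ss(const int16_t *src_diff) {
  unsigned int ss = 0;
  int i;
  for (i = 0; i < 256; i++)            /* 16x16 luma residual */
    ss += (unsigned int)(src_diff[i] * src_diff[i]);
  return ss;
}
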
diff --git a/libvpx/vp9/encoder/vp9_encodeintra.h b/libvpx/vp9/encoder/vp9_encodeintra.h
index 16ac59e..e217924 100644
--- a/libvpx/vp9/encoder/vp9_encodeintra.h
+++ b/libvpx/vp9/encoder/vp9_encodeintra.h
@@ -13,12 +13,8 @@
#include "vp9/encoder/vp9_onyx_int.h"
-int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
-void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg);
-void vp9_encode_intra_block_y(VP9_COMMON *const cm, MACROBLOCK *mb,
- BLOCK_SIZE_TYPE bs);
-void vp9_encode_intra_block_uv(VP9_COMMON *const cm, MACROBLOCK *mb,
- BLOCK_SIZE_TYPE bs);
+int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred);
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
#endif // VP9_ENCODER_VP9_ENCODEINTRA_H_
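
The header now exposes vp9_encode_block_intra() with the new per-transform-
block visitor signature: callbacks receive the plane-adjusted block size and
the TX_SIZE directly, instead of the old ss_txfrm_size (which packed the
transform size as 2 * tx_size). A sketch of a visitor matching that contract,
with the enums stubbed down to stand-ins for the real vp9_enums.h definitions:

/* Visitor compatible with foreach_transformed_block_in_plane(), the same
 * shape encode_block takes in the vp9_encodemb.c hunks below. */
typedef enum { BLOCK_4X4, /* ... */ BLOCK_64X64 = 12 } BLOCK_SIZE;
typedef enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32 } TX_SIZE;

static void count_tx_blocks(int plane, int block, BLOCK_SIZE plane_bsize,
                            TX_SIZE tx_size, void *arg) {
  int *count = (int *)arg;  /* tally of visited transform blocks */
  (void)plane; (void)block; (void)plane_bsize; (void)tx_size;
  ++*count;
}
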
diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c
index 40b0a4e..8dd80a5 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/libvpx/vp9/encoder/vp9_encodemb.c
@@ -69,7 +69,7 @@ static void inverse_transform_b_16x16_add(int eob,
vp9_short_idct16x16_add(dqcoeff, dest, stride);
}
-static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) {
+static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
struct macroblock_plane *const p = &x->plane[plane];
const MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -81,18 +81,18 @@ static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) {
pd->dst.buf, pd->dst.stride);
}
-void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
subtract_plane(x, bsize, 0);
}
-void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) {
int i;
for (i = 1; i < MAX_MB_PLANE; i++)
subtract_plane(x, bsize, i);
}
-void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
vp9_subtract_sby(x, bsize);
vp9_subtract_sbuv(x, bsize);
}
@@ -142,37 +142,36 @@ static int trellis_get_coeff_context(const int16_t *scan,
}
static void optimize_b(MACROBLOCK *mb,
- int plane, int block, BLOCK_SIZE_TYPE bsize,
+ int plane, int block, BLOCK_SIZE plane_bsize,
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
TX_SIZE tx_size) {
MACROBLOCKD *const xd = &mb->e_mbd;
- const int ref = is_inter_block(&xd->mode_info_context->mbmi);
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const int ref = is_inter_block(&xd->this_mi->mbmi);
vp9_token_state tokens[1025][2];
unsigned best_index[1025][2];
- const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff,
- block, 16);
+ const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block);
int16_t *qcoeff_ptr;
int16_t *dqcoeff_ptr;
- int eob = xd->plane[plane].eobs[block], final_eob, sz = 0;
+ int eob = pd->eobs[block], final_eob, sz = 0;
const int i0 = 0;
int rc, x, next, i;
int64_t rdmult, rddiv, rd_cost0, rd_cost1;
int rate0, rate1, error0, error1, t0, t1;
int best, band, pt;
- PLANE_TYPE type = xd->plane[plane].plane_type;
+ PLANE_TYPE type = pd->plane_type;
int err_mult = plane_rd_mult[type];
int default_eob;
const int16_t *scan, *nb;
const int mul = 1 + (tx_size == TX_32X32);
uint8_t token_cache[1024];
- const int ib = txfrm_block_to_raster_block(xd, bsize, plane,
- block, 2 * tx_size);
- const int16_t *dequant_ptr = xd->plane[plane].dequant;
+ const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block);
+ const int16_t *dequant_ptr = pd->dequant;
const uint8_t * band_translate;
assert((!type && !plane) || (type && plane));
- dqcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16);
- qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
+ dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
+ qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
switch (tx_size) {
default:
case TX_4X4:
@@ -200,7 +199,7 @@ static void optimize_b(MACROBLOCK *mb,
/* Now set up a Viterbi trellis to evaluate alternative roundings. */
rdmult = mb->rdmult * err_mult;
- if (mb->e_mbd.mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
+ if (mb->e_mbd.mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME)
rdmult = (rdmult * 9) >> 4;
rddiv = mb->rddiv;
/* Initialize the sentinel node of the trellis. */
@@ -371,59 +370,48 @@ static void optimize_b(MACROBLOCK *mb,
*a = *l = (final_eob > 0);
}
-void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, MACROBLOCK *mb,
- struct optimize_ctx *ctx) {
- MACROBLOCKD *const xd = &mb->e_mbd;
+void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) {
int x, y;
-
- // find current entropy context
- txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y);
-
- optimize_b(mb, plane, block, bsize,
- &ctx->ta[plane][x], &ctx->tl[plane][y], ss_txfrm_size / 2);
-}
-
-static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg) {
- const struct encode_b_args* const args = arg;
- vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->x, args->ctx);
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
+ optimize_b(mb, plane, block, plane_bsize,
+ &ctx->ta[plane][x], &ctx->tl[plane][y], tx_size);
}
-void optimize_init_b(int plane, BLOCK_SIZE_TYPE bsize, void *arg) {
- const struct encode_b_args* const args = arg;
+static void optimize_init_b(int plane, BLOCK_SIZE bsize,
+ struct encode_b_args *args) {
const MACROBLOCKD *xd = &args->x->e_mbd;
const struct macroblockd_plane* const pd = &xd->plane[plane];
- const int bwl = b_width_log2(bsize) - pd->subsampling_x;
- const int bhl = b_height_log2(bsize) - pd->subsampling_y;
- const int bw = 1 << bwl, bh = 1 << bhl;
- const MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
- const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->txfm_size;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
int i;
switch (tx_size) {
case TX_4X4:
vpx_memcpy(args->ctx->ta[plane], pd->above_context,
- sizeof(ENTROPY_CONTEXT) * bw);
+ sizeof(ENTROPY_CONTEXT) * num_4x4_w);
vpx_memcpy(args->ctx->tl[plane], pd->left_context,
- sizeof(ENTROPY_CONTEXT) * bh);
+ sizeof(ENTROPY_CONTEXT) * num_4x4_h);
break;
case TX_8X8:
- for (i = 0; i < bw; i += 2)
+ for (i = 0; i < num_4x4_w; i += 2)
args->ctx->ta[plane][i] = !!*(uint16_t *)&pd->above_context[i];
- for (i = 0; i < bh; i += 2)
+ for (i = 0; i < num_4x4_h; i += 2)
args->ctx->tl[plane][i] = !!*(uint16_t *)&pd->left_context[i];
break;
case TX_16X16:
- for (i = 0; i < bw; i += 4)
+ for (i = 0; i < num_4x4_w; i += 4)
args->ctx->ta[plane][i] = !!*(uint32_t *)&pd->above_context[i];
- for (i = 0; i < bh; i += 4)
+ for (i = 0; i < num_4x4_h; i += 4)
args->ctx->tl[plane][i] = !!*(uint32_t *)&pd->left_context[i];
break;
case TX_32X32:
- for (i = 0; i < bw; i += 8)
+ for (i = 0; i < num_4x4_w; i += 8)
args->ctx->ta[plane][i] = !!*(uint64_t *)&pd->above_context[i];
- for (i = 0; i < bh; i += 8)
+ for (i = 0; i < num_4x4_h; i += 8)
args->ctx->tl[plane][i] = !!*(uint64_t *)&pd->left_context[i];
break;
default:
@@ -431,38 +419,19 @@ void optimize_init_b(int plane, BLOCK_SIZE_TYPE bsize, void *arg) {
}
}
-void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
- struct optimize_ctx ctx;
- struct encode_b_args arg = {cm, x, &ctx};
- optimize_init_b(0, bsize, &arg);
- foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, optimize_block, &arg);
-}
-
-void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize) {
- struct optimize_ctx ctx;
- struct encode_b_args arg = {cm, x, &ctx};
- int i;
- for (i = 1; i < MAX_MB_PLANE; ++i)
- optimize_init_b(i, bsize, &arg);
-
- foreach_transformed_block_uv(&x->e_mbd, bsize, optimize_block, &arg);
-}
-
-void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg) {
+void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK* const x = args->x;
MACROBLOCKD* const xd = &x->e_mbd;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16);
- int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16);
- int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
- const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
+ int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
+ int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
+ int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const int16_t *scan, *iscan;
uint16_t *eob = &pd->eobs[block];
- const int bwl = plane_block_width_log2by4(bsize, pd), bw = 1 << bwl;
+ const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl;
const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
int xoff, yoff;
int16_t *src_diff;
@@ -475,7 +444,7 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
xoff = 32 * (block & twmask);
yoff = 32 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
- if (x->rd_search)
+ if (x->use_lp32x32fdct)
vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
else
vp9_short_fdct32x32(src_diff, coeff, bw * 8);
@@ -523,29 +492,27 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
}
}
-static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg) {
+static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
struct encode_b_args *const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
- const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
- block, ss_txfrm_size);
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
- uint8_t *const dst = raster_block_offset_uint8(xd, bsize, plane,
- raster_block,
+ const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
+ block);
+
+ int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
pd->dst.buf, pd->dst.stride);
- xform_quant(plane, block, bsize, ss_txfrm_size, arg);
+ vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
if (x->optimize)
- vp9_optimize_b(plane, block, bsize, ss_txfrm_size, x, args->ctx);
+ vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
- if (x->skip_encode)
- return;
- if (pd->eobs[block] == 0)
+ if (x->skip_encode || pd->eobs[block] == 0)
return;
- switch (ss_txfrm_size / 2) {
+ switch (tx_size) {
case TX_32X32:
vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
break;
@@ -564,28 +531,15 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff,
dst, pd->dst.stride);
break;
+ default:
+ assert(!"Invalid transform size");
}
}
-void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
- MACROBLOCKD* const xd = &x->e_mbd;
- struct encode_b_args arg = {cm, x, NULL};
-
- foreach_transformed_block_in_plane(xd, bsize, 0, xform_quant, &arg);
-}
-
-void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize) {
- MACROBLOCKD* const xd = &x->e_mbd;
- struct encode_b_args arg = {cm, x, NULL};
-
- foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
-}
-
-void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
- struct encode_b_args arg = {cm, x, &ctx};
+ struct encode_b_args arg = {x, &ctx};
vp9_subtract_sby(x, bsize);
if (x->optimize)
@@ -594,25 +548,10 @@ void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
}
-void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
- MACROBLOCKD *const xd = &x->e_mbd;
- struct optimize_ctx ctx;
- struct encode_b_args arg = {cm, x, &ctx};
-
- vp9_subtract_sbuv(x, bsize);
- if (x->optimize) {
- int i;
- for (i = 1; i < MAX_MB_PLANE; ++i)
- optimize_init_b(i, bsize, &arg);
- }
-
- foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
-}
-
-void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
- struct encode_b_args arg = {cm, x, &ctx};
+ struct encode_b_args arg = {x, &ctx};
vp9_subtract_sb(x, bsize);
@@ -625,35 +564,32 @@ void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
foreach_transformed_block(xd, bsize, encode_block, &arg);
}
-void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg) {
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
- const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
+ MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16);
- int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16);
- int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
+ int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
+ int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
+ int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const int16_t *scan, *iscan;
TX_TYPE tx_type;
MB_PREDICTION_MODE mode;
- const int bwl = b_width_log2(bsize) - pd->subsampling_x, bw = 1 << bwl;
+ const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl;
const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
int xoff, yoff;
uint8_t *src, *dst;
int16_t *src_diff;
uint16_t *eob = &pd->eobs[block];
- if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
- extend_for_intra(xd, plane, block, bsize, ss_txfrm_size);
- }
+ if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0)
+ extend_for_intra(xd, plane_bsize, plane, block, tx_size);
// if (x->optimize)
- // vp9_optimize_b(plane, block, bsize, ss_txfrm_size,
- // x, args->ctx);
+ // vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
switch (tx_size) {
case TX_32X32:
@@ -670,7 +606,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
dst, pd->dst.stride, dst, pd->dst.stride);
vp9_subtract_block(32, 32, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
- if (x->rd_search)
+ if (x->use_lp32x32fdct)
vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
else
vp9_short_fdct32x32(src_diff, coeff, bw * 8);
@@ -699,8 +635,8 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type);
else
x->fwd_txm16x16(src_diff, coeff, bw * 8);
- vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff,
+ vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
if (!x->skip_encode && *eob) {
if (tx_type == DCT_DCT)
@@ -743,7 +679,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
scan = get_scan_4x4(tx_type);
iscan = get_iscan_4x4(tx_type);
if (mbmi->sb_type < BLOCK_8X8 && plane == 0)
- mode = xd->mode_info_context->bmi[block].as_mode;
+ mode = xd->this_mi->bmi[block].as_mode;
else
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
@@ -778,20 +714,18 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
}
}
-void vp9_encode_intra_block_y(VP9_COMMON *cm, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize) {
+void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
struct optimize_ctx ctx;
- struct encode_b_args arg = {cm, x, &ctx};
+ struct encode_b_args arg = {x, &ctx};
- foreach_transformed_block_in_plane(xd, bsize, 0,
- encode_block_intra, &arg);
+ foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra,
+ &arg);
}
-void vp9_encode_intra_block_uv(VP9_COMMON *cm, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize) {
+void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
struct optimize_ctx ctx;
- struct encode_b_args arg = {cm, x, &ctx};
- foreach_transformed_block_uv(xd, bsize, encode_block_intra, &arg);
+ struct encode_b_args arg = {x, &ctx};
+ foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg);
}
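
optimize_init_b() above seeds the trellis contexts by collapsing the per-4x4
above/left ENTROPY_CONTEXT bytes spanned by a larger transform: a TX_8X8
block covers two 4x4 slots, so two bytes are read as one uint16_t and reduced
to 0/1 with !!, and likewise four or eight bytes for TX_16X16 and TX_32X32.
A portable sketch of the same reduction, using memcpy instead of the
type-punning cast (which the codec can afford because it controls the
buffers' alignment):

#include <stdint.h>
#include <string.h>

typedef int8_t ENTROPY_CONTEXT;

/* n_bytes is 2, 4 or 8 for TX_8X8/TX_16X16/TX_32X32 (TX_4X4 copies the
 * bytes through unchanged); any nonzero covered byte yields context 1. */
static int collapse_ctx(const ENTROPY_CONTEXT *ctx, int n_bytes) {
  uint64_t v = 0;
  memcpy(&v, ctx, (size_t)n_bytes);
  return !!v;
}
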
diff --git a/libvpx/vp9/encoder/vp9_encodemb.h b/libvpx/vp9/encoder/vp9_encodemb.h
index f647fd9..54e69fd 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.h
+++ b/libvpx/vp9/encoder/vp9_encodemb.h
@@ -16,8 +16,28 @@
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/common/vp9_onyxc_int.h"
+typedef enum {
+ RD_DC_PRED = DC_PRED,
+ RD_V_PRED = V_PRED,
+ RD_H_PRED = H_PRED,
+ RD_D45_PRED = D45_PRED,
+ RD_D135_PRED = D135_PRED,
+ RD_D117_PRED = D117_PRED,
+ RD_D153_PRED = D153_PRED,
+ RD_D207_PRED = D207_PRED,
+ RD_D63_PRED = D63_PRED,
+ RD_TM_PRED = TM_PRED,
+ RD_NEARESTMV = NEARESTMV,
+ RD_NEARMV = NEARMV,
+ RD_ZEROMV = ZEROMV,
+ RD_NEWMV = NEWMV,
+ RD_I4X4_PRED,
+ RD_SPLITMV,
+ RD_MODE_COUNT
+} RD_PREDICTION_MODE;
+
typedef struct {
- MB_PREDICTION_MODE mode;
+ RD_PREDICTION_MODE mode;
MV_REFERENCE_FRAME ref_frame;
MV_REFERENCE_FRAME second_ref_frame;
} MODE_DEFINITION;
@@ -28,28 +48,22 @@ struct optimize_ctx {
};
struct encode_b_args {
- VP9_COMMON *cm;
MACROBLOCK *x;
struct optimize_ctx *ctx;
};
-void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, MACROBLOCK *x,
- struct optimize_ctx *ctx);
-void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_optimize_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize);
+
+void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
-void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
-void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg);
-void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize);
#endif // VP9_ENCODER_VP9_ENCODEMB_H_
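
The RD_PREDICTION_MODE enum added above deliberately pins each RD_* value to
its bitstream counterpart, so the rate-distortion loop can keep the two
encoder-only pseudo-modes (RD_I4X4_PRED, RD_SPLITMV) in the same
MODE_DEFINITION table and still recover an MB_PREDICTION_MODE with a plain
cast. A sketch of that invariant, with the enums stubbed down to a few
entries:

#include <assert.h>

/* Stub enums; only the pinning relationship from the header above matters. */
typedef enum { DC_PRED, V_PRED, H_PRED, /* ... */ NEWMV = 13 } MB_PREDICTION_MODE;

typedef enum {
  RD_DC_PRED = DC_PRED,
  RD_V_PRED = V_PRED,
  RD_H_PRED = H_PRED,
  /* ... */
  RD_NEWMV = NEWMV,
  RD_I4X4_PRED,  /* encoder-only: per-4x4 intra, no bitstream counterpart */
  RD_SPLITMV     /* encoder-only: split inter, no bitstream counterpart   */
} RD_PREDICTION_MODE;

static MB_PREDICTION_MODE to_bitstream_mode(RD_PREDICTION_MODE rd_mode) {
  assert(rd_mode < RD_I4X4_PRED);      /* pseudo-modes must be handled first */
  return (MB_PREDICTION_MODE)rd_mode;  /* safe while the values stay pinned  */
}
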
diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c
index 1c6fa3a..ed3a2bb 100644
--- a/libvpx/vp9/encoder/vp9_encodemv.c
+++ b/libvpx/vp9/encoder/vp9_encodemv.c
@@ -20,10 +20,6 @@
extern unsigned int active_section;
#endif
-#ifdef NMV_STATS
-nmv_context_counts tnmvcounts;
-#endif
-
static void encode_mv_component(vp9_writer* w, int comp,
const nmv_component* mvcomp, int usehp) {
int offset;
@@ -159,7 +155,6 @@ static void counts_to_nmv_context(
unsigned int (*branch_ct_class0_hp)[2],
unsigned int (*branch_ct_hp)[2]) {
int i, j, k;
- vp9_counts_process(nmv_count, usehp);
vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
prob->joints,
branch_ct_joint,
@@ -218,152 +213,6 @@ static void counts_to_nmv_context(
}
}
-#ifdef NMV_STATS
-void init_nmvstats() {
- vp9_zero(tnmvcounts);
-}
-
-void print_nmvstats() {
- nmv_context prob;
- unsigned int branch_ct_joint[MV_JOINTS - 1][2];
- unsigned int branch_ct_sign[2][2];
- unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
- unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
- unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
- unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
- unsigned int branch_ct_fp[2][4 - 1][2];
- unsigned int branch_ct_class0_hp[2][2];
- unsigned int branch_ct_hp[2][2];
- int i, j, k;
- counts_to_nmv_context(&tnmvcounts, &prob, 1,
- branch_ct_joint, branch_ct_sign, branch_ct_classes,
- branch_ct_class0, branch_ct_bits,
- branch_ct_class0_fp, branch_ct_fp,
- branch_ct_class0_hp, branch_ct_hp);
-
- printf("\nCounts =\n { ");
- for (j = 0; j < MV_JOINTS; ++j)
- printf("%d, ", tnmvcounts.joints[j]);
- printf("},\n");
- for (i = 0; i < 2; ++i) {
- printf(" {\n");
- printf(" %d/%d,\n", tnmvcounts.comps[i].sign[0],
- tnmvcounts.comps[i].sign[1]);
- printf(" { ");
- for (j = 0; j < MV_CLASSES; ++j)
- printf("%d, ", tnmvcounts.comps[i].classes[j]);
- printf("},\n");
- printf(" { ");
- for (j = 0; j < CLASS0_SIZE; ++j)
- printf("%d, ", tnmvcounts.comps[i].class0[j]);
- printf("},\n");
- printf(" { ");
- for (j = 0; j < MV_OFFSET_BITS; ++j)
- printf("%d/%d, ", tnmvcounts.comps[i].bits[j][0],
- tnmvcounts.comps[i].bits[j][1]);
- printf("},\n");
-
- printf(" {");
- for (j = 0; j < CLASS0_SIZE; ++j) {
- printf("{");
- for (k = 0; k < 4; ++k)
- printf("%d, ", tnmvcounts.comps[i].class0_fp[j][k]);
- printf("}, ");
- }
- printf("},\n");
-
- printf(" { ");
- for (j = 0; j < 4; ++j)
- printf("%d, ", tnmvcounts.comps[i].fp[j]);
- printf("},\n");
-
- printf(" %d/%d,\n",
- tnmvcounts.comps[i].class0_hp[0],
- tnmvcounts.comps[i].class0_hp[1]);
- printf(" %d/%d,\n",
- tnmvcounts.comps[i].hp[0],
- tnmvcounts.comps[i].hp[1]);
- printf(" },\n");
- }
-
- printf("\nProbs =\n { ");
- for (j = 0; j < MV_JOINTS - 1; ++j)
- printf("%d, ", prob.joints[j]);
- printf("},\n");
- for (i=0; i< 2; ++i) {
- printf(" {\n");
- printf(" %d,\n", prob.comps[i].sign);
- printf(" { ");
- for (j = 0; j < MV_CLASSES - 1; ++j)
- printf("%d, ", prob.comps[i].classes[j]);
- printf("},\n");
- printf(" { ");
- for (j = 0; j < CLASS0_SIZE - 1; ++j)
- printf("%d, ", prob.comps[i].class0[j]);
- printf("},\n");
- printf(" { ");
- for (j = 0; j < MV_OFFSET_BITS; ++j)
- printf("%d, ", prob.comps[i].bits[j]);
- printf("},\n");
- printf(" { ");
- for (j = 0; j < CLASS0_SIZE; ++j) {
- printf("{");
- for (k = 0; k < 3; ++k)
- printf("%d, ", prob.comps[i].class0_fp[j][k]);
- printf("}, ");
- }
- printf("},\n");
- printf(" { ");
- for (j = 0; j < 3; ++j)
- printf("%d, ", prob.comps[i].fp[j]);
- printf("},\n");
-
- printf(" %d,\n", prob.comps[i].class0_hp);
- printf(" %d,\n", prob.comps[i].hp);
- printf(" },\n");
- }
-}
-
-static void add_nmvcount(nmv_context_counts* const dst,
- const nmv_context_counts* const src) {
- int i, j, k;
- for (j = 0; j < MV_JOINTS; ++j) {
- dst->joints[j] += src->joints[j];
- }
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < MV_VALS; ++j) {
- dst->comps[i].mvcount[j] += src->comps[i].mvcount[j];
- }
- dst->comps[i].sign[0] += src->comps[i].sign[0];
- dst->comps[i].sign[1] += src->comps[i].sign[1];
- for (j = 0; j < MV_CLASSES; ++j) {
- dst->comps[i].classes[j] += src->comps[i].classes[j];
- }
- for (j = 0; j < CLASS0_SIZE; ++j) {
- dst->comps[i].class0[j] += src->comps[i].class0[j];
- }
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- dst->comps[i].bits[j][0] += src->comps[i].bits[j][0];
- dst->comps[i].bits[j][1] += src->comps[i].bits[j][1];
- }
- }
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < CLASS0_SIZE; ++j) {
- for (k = 0; k < 4; ++k) {
- dst->comps[i].class0_fp[j][k] += src->comps[i].class0_fp[j][k];
- }
- }
- for (j = 0; j < 4; ++j) {
- dst->comps[i].fp[j] += src->comps[i].fp[j];
- }
- dst->comps[i].class0_hp[0] += src->comps[i].class0_hp[0];
- dst->comps[i].class0_hp[1] += src->comps[i].class0_hp[1];
- dst->comps[i].hp[0] += src->comps[i].hp[0];
- dst->comps[i].hp[1] += src->comps[i].hp[1];
- }
-}
-#endif
-
void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
int i, j;
nmv_context prob;
@@ -378,10 +227,6 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
unsigned int branch_ct_hp[2][2];
nmv_context *mvc = &cpi->common.fc.nmvc;
-#ifdef NMV_STATS
- if (!cpi->dummy_packing)
- add_nmvcount(&tnmvcounts, &cpi->NMVcount);
-#endif
counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
branch_ct_joint, branch_ct_sign, branch_ct_classes,
branch_ct_class0, branch_ct_bits,
@@ -390,22 +235,22 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
for (j = 0; j < MV_JOINTS - 1; ++j)
update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j],
- VP9_NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
for (i = 0; i < 2; ++i) {
update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign,
- prob.comps[i].sign, VP9_NMV_UPDATE_PROB);
+ prob.comps[i].sign, NMV_UPDATE_PROB);
for (j = 0; j < MV_CLASSES - 1; ++j)
update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j],
- prob.comps[i].classes[j], VP9_NMV_UPDATE_PROB);
+ prob.comps[i].classes[j], NMV_UPDATE_PROB);
for (j = 0; j < CLASS0_SIZE - 1; ++j)
update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j],
- prob.comps[i].class0[j], VP9_NMV_UPDATE_PROB);
+ prob.comps[i].class0[j], NMV_UPDATE_PROB);
for (j = 0; j < MV_OFFSET_BITS; ++j)
update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j],
- prob.comps[i].bits[j], VP9_NMV_UPDATE_PROB);
+ prob.comps[i].bits[j], NMV_UPDATE_PROB);
}
for (i = 0; i < 2; ++i) {
@@ -414,20 +259,20 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
for (k = 0; k < 3; ++k)
update_mv(bc, branch_ct_class0_fp[i][j][k],
&mvc->comps[i].class0_fp[j][k],
- prob.comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB);
+ prob.comps[i].class0_fp[j][k], NMV_UPDATE_PROB);
}
for (j = 0; j < 3; ++j)
update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j],
- prob.comps[i].fp[j], VP9_NMV_UPDATE_PROB);
+ prob.comps[i].fp[j], NMV_UPDATE_PROB);
}
if (usehp) {
for (i = 0; i < 2; ++i) {
update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp,
- prob.comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
+ prob.comps[i].class0_hp, NMV_UPDATE_PROB);
update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp,
- prob.comps[i].hp, VP9_NMV_UPDATE_PROB);
+ prob.comps[i].hp, NMV_UPDATE_PROB);
}
}
}
@@ -471,7 +316,7 @@ void vp9_build_nmv_cost_table(int *mvjoint,
void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x,
int_mv *best_ref_mv, int_mv *second_best_ref_mv) {
- MODE_INFO *mi = x->e_mbd.mode_info_context;
+ MODE_INFO *mi = x->e_mbd.mi_8x8[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
MV diff;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
@@ -488,7 +333,7 @@ void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x,
diff.col = mi->bmi[i].as_mv[0].as_mv.col - best_ref_mv->as_mv.col;
vp9_inc_mv(&diff, &cpi->NMVcount);
- if (x->e_mbd.mode_info_context->mbmi.ref_frame[1] > INTRA_FRAME) {
+ if (mi->mbmi.ref_frame[1] > INTRA_FRAME) {
diff.row = mi->bmi[i].as_mv[1].as_mv.row -
second_best_ref_mv->as_mv.row;
diff.col = mi->bmi[i].as_mv[1].as_mv.col -
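
vp9_write_nmv_probs() above funnels each branch count pair through
update_mv() against the renamed NMV_UPDATE_PROB threshold. The
count-to-probability step it builds on is the usual libvpx binary mapping; a
hedged sketch of that shape (the exact clamping and rounding live in the
common tree-coder headers, not in this diff, and may differ in detail):

#include <stdint.h>

typedef uint8_t vp9_prob;

static vp9_prob prob_from_counts(const unsigned int ct[2]) {
  const unsigned int den = ct[0] + ct[1];
  unsigned int p;
  if (den == 0)
    return 128;                          /* no data: stay at the midpoint */
  p = (ct[0] * 256 + (den >> 1)) / den;  /* rounded zero-branch fraction  */
  return (vp9_prob)(p < 1 ? 1 : (p > 255 ? 255 : p));
}
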
diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c
index 6ba2a4f..9cf7b83 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/libvpx/vp9/encoder/vp9_firstpass.c
@@ -346,7 +346,7 @@ static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *r
// Set up pointers for this macro block recon buffer
xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset;
- switch (xd->mode_info_context->mbmi.sb_type) {
+ switch (xd->this_mi->mbmi.sb_type) {
case BLOCK_8X8:
vp9_mse8x8(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
@@ -385,7 +385,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
int n;
vp9_variance_fn_ptr_t v_fn_ptr =
- cpi->fn_ptr[xd->mode_info_context->mbmi.sb_type];
+ cpi->fn_ptr[xd->this_mi->mbmi.sb_type];
int new_mv_mode_penalty = 256;
int sr = 0;
@@ -402,7 +402,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
further_steps -= sr;
// override the default variance function to use MSE
- switch (xd->mode_info_context->mbmi.sb_type) {
+ switch (xd->this_mi->mbmi.sb_type) {
case BLOCK_8X8:
v_fn_ptr.vf = vp9_mse8x8;
break;
@@ -505,8 +505,11 @@ void vp9_first_pass(VP9_COMP *cpi) {
setup_dst_planes(xd, new_yv12, 0, 0);
x->partition_info = x->pi;
-
- xd->mode_info_context = cm->mi;
+ xd->mi_8x8 = cm->mi_grid_visible;
+ // required for vp9_frame_init_quantizer
+ xd->this_mi =
+ xd->mi_8x8[0] = cm->mi;
+ xd->mic_stream_ptr = cm->mi;
setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
@@ -549,26 +552,26 @@ void vp9_first_pass(VP9_COMP *cpi) {
if (mb_col * 2 + 1 < cm->mi_cols) {
if (mb_row * 2 + 1 < cm->mi_rows) {
- xd->mode_info_context->mbmi.sb_type = BLOCK_16X16;
+ xd->this_mi->mbmi.sb_type = BLOCK_16X16;
} else {
- xd->mode_info_context->mbmi.sb_type = BLOCK_16X8;
+ xd->this_mi->mbmi.sb_type = BLOCK_16X8;
}
} else {
if (mb_row * 2 + 1 < cm->mi_rows) {
- xd->mode_info_context->mbmi.sb_type = BLOCK_8X16;
+ xd->this_mi->mbmi.sb_type = BLOCK_8X16;
} else {
- xd->mode_info_context->mbmi.sb_type = BLOCK_8X8;
+ xd->this_mi->mbmi.sb_type = BLOCK_8X8;
}
}
- xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME;
+ xd->this_mi->mbmi.ref_frame[0] = INTRA_FRAME;
set_mi_row_col(cm, xd,
mb_row << 1,
- 1 << mi_height_log2(xd->mode_info_context->mbmi.sb_type),
+ 1 << mi_height_log2(xd->this_mi->mbmi.sb_type),
mb_col << 1,
- 1 << mi_height_log2(xd->mode_info_context->mbmi.sb_type));
+ 1 << mi_height_log2(xd->this_mi->mbmi.sb_type));
// do intra 16x16 prediction
- this_error = vp9_encode_intra(cpi, x, use_dc_pred);
+ this_error = vp9_encode_intra(x, use_dc_pred);
// "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
// We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
@@ -661,13 +664,13 @@ void vp9_first_pass(VP9_COMP *cpi) {
mv.as_mv.col <<= 3;
this_error = motion_error;
vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
- xd->mode_info_context->mbmi.txfm_size = TX_4X4;
- xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME;
- xd->mode_info_context->mbmi.ref_frame[1] = NONE;
+ xd->this_mi->mbmi.tx_size = TX_4X4;
+ xd->this_mi->mbmi.ref_frame[0] = LAST_FRAME;
+ xd->this_mi->mbmi.ref_frame[1] = NONE;
vp9_build_inter_predictors_sby(xd, mb_row << 1,
mb_col << 1,
- xd->mode_info_context->mbmi.sb_type);
- vp9_encode_sby(cm, x, xd->mode_info_context->mbmi.sb_type);
+ xd->this_mi->mbmi.sb_type);
+ vp9_encode_sby(x, xd->this_mi->mbmi.sb_type);
sum_mvr += mv.as_mv.row;
sum_mvr_abs += abs(mv.as_mv.row);
sum_mvc += mv.as_mv.col;
@@ -1092,7 +1095,6 @@ static int estimate_cq(VP9_COMP *cpi,
return q;
}
-
extern void vp9_new_framerate(VP9_COMP *cpi, double framerate);
void vp9_init_second_pass(VP9_COMP *cpi) {
@@ -1580,7 +1582,7 @@ void define_fixed_arf_period(VP9_COMP *cpi) {
// Analyse and define a gf/arf group.
static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
- FIRSTPASS_STATS next_frame;
+ FIRSTPASS_STATS next_frame = { 0 };
FIRSTPASS_STATS *start_pos;
int i;
double boost_score = 0.0;
@@ -1616,8 +1618,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
start_pos = cpi->twopass.stats_in;
- vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
-
// Load stats for the current frame.
mod_frame_err = calculate_modified_err(cpi, this_frame);
@@ -1720,6 +1720,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
old_boost_score = boost_score;
}
+ cpi->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
+
// Don't allow a gf too near the next kf
if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) {
while (i < cpi->twopass.frames_to_key) {
@@ -2081,63 +2083,71 @@ void vp9_second_pass(VP9_COMP *cpi) {
vp9_clear_system_state();
- // Special case code for first frame.
- if (cpi->common.current_video_frame == 0) {
- cpi->twopass.est_max_qcorrection_factor = 1.0;
-
- // Set a cq_level in constrained quality mode.
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
- int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
- (int)(cpi->twopass.bits_left / frames_left));
-
- cpi->cq_target_quality = cpi->oxcf.cq_level;
- if (est_cq > cpi->cq_target_quality)
- cpi->cq_target_quality = est_cq;
- }
+ if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+ cpi->active_worst_quality = cpi->oxcf.cq_level;
+ } else {
+ // Special case code for first frame.
+ if (cpi->common.current_video_frame == 0) {
+ int section_target_bandwidth =
+ (int)(cpi->twopass.bits_left / frames_left);
+ cpi->twopass.est_max_qcorrection_factor = 1.0;
+
+ // Set a cq_level in constrained quality mode.
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+ int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
+ section_target_bandwidth);
+
+ cpi->cq_target_quality = cpi->oxcf.cq_level;
+ if (est_cq > cpi->cq_target_quality)
+ cpi->cq_target_quality = est_cq;
+ }
- // guess at maxq needed in 2nd pass
- cpi->twopass.maxq_max_limit = cpi->worst_quality;
- cpi->twopass.maxq_min_limit = cpi->best_quality;
+ // guess at maxq needed in 2nd pass
+ cpi->twopass.maxq_max_limit = cpi->worst_quality;
+ cpi->twopass.maxq_min_limit = cpi->best_quality;
- tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
- (int)(cpi->twopass.bits_left / frames_left));
+ tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
+ section_target_bandwidth);
- cpi->active_worst_quality = tmp_q;
- cpi->ni_av_qi = tmp_q;
- cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
+ cpi->active_worst_quality = tmp_q;
+ cpi->ni_av_qi = tmp_q;
+ cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
#ifndef ONE_SHOT_Q_ESTIMATE
- // Limit the maxq value returned subsequently.
- // This increases the risk of overspend or underspend if the initial
- // estimate for the clip is bad, but helps prevent excessive
- // variation in Q, especially near the end of a clip
- // where for example a small overspend may cause Q to crash
- adjust_maxq_qrange(cpi);
+ // Limit the maxq value returned subsequently.
+ // This increases the risk of overspend or underspend if the initial
+ // estimate for the clip is bad, but helps prevent excessive
+ // variation in Q, especially near the end of a clip
+ // where for example a small overspend may cause Q to crash
+ adjust_maxq_qrange(cpi);
#endif
- }
+ }
#ifndef ONE_SHOT_Q_ESTIMATE
-  // The last few frames of a clip almost always have too few or too many
-  // bits and for the sake of overly exact rate control we don't want to make
- // radical adjustments to the allowed quantizer range just to use up a
- // few surplus bits or get beneath the target rate.
- else if ((cpi->common.current_video_frame <
- (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) &&
- ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
- (unsigned int)cpi->twopass.total_stats.count)) {
- if (frames_left < 1)
- frames_left = 1;
-
- tmp_q = estimate_max_q(
- cpi,
- &cpi->twopass.total_left_stats,
- (int)(cpi->twopass.bits_left / frames_left));
-
- // Make a damped adjustment to active max Q
- cpi->active_worst_quality =
- adjust_active_maxq(cpi->active_worst_quality, tmp_q);
- }
+    // The last few frames of a clip almost always have too few or too many
+    // bits and for the sake of overly exact rate control we don't want to make
+ // radical adjustments to the allowed quantizer range just to use up a
+ // few surplus bits or get beneath the target rate.
+ else if ((cpi->common.current_video_frame <
+ (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) &&
+ ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
+ (unsigned int)cpi->twopass.total_stats.count)) {
+ int section_target_bandwidth =
+ (int)(cpi->twopass.bits_left / frames_left);
+ if (frames_left < 1)
+ frames_left = 1;
+
+ tmp_q = estimate_max_q(
+ cpi,
+ &cpi->twopass.total_left_stats,
+ section_target_bandwidth);
+
+ // Make a damped adjustment to active max Q
+ cpi->active_worst_quality =
+ adjust_active_maxq(cpi->active_worst_quality, tmp_q);
+ }
#endif
+ }
vp9_zero(this_frame);
if (EOF == input_stats(cpi, &this_frame))
return;
@@ -2157,6 +2167,8 @@ void vp9_second_pass(VP9_COMP *cpi) {
// Define next gf group and assign bits to it
this_frame_copy = this_frame;
+ cpi->gf_zeromotion_pct = 0;
+
#if CONFIG_MULTIPLE_ARF
if (cpi->multi_arf_enabled) {
define_fixed_arf_period(cpi);
@@ -2167,6 +2179,15 @@ void vp9_second_pass(VP9_COMP *cpi) {
}
#endif
+ if (cpi->gf_zeromotion_pct > 995) {
+ // As long as max_thresh for encode breakout is small enough, it is ok
+    // to enable it for a no-show frame, i.e. set enable_encode_breakout to 2.
+ if (!cpi->common.show_frame)
+ cpi->enable_encode_breakout = 0;
+ else
+ cpi->enable_encode_breakout = 2;
+ }
+
// If we are going to code an altref frame at the end of the group
// and the current frame is not a key frame....
// If the previous group used an arf this frame has already benefited
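
The zero-motion bookkeeping added above stores the group's minimum
zero-motion fraction in per-mille (an accumulator of 0.998 becomes 998), and
near-static groups above 99.5% switch the encode breakout on: mode 2 for
shown frames, disabled for no-show frames in this revision. A compact sketch
of that gate, folding the define_gf_group() and vp9_second_pass() sites into
one helper and stubbing the surrounding struct:

struct enc_ctx {
  int gf_zeromotion_pct;      /* per-mille of zero motion over the group     */
  int enable_encode_breakout; /* 0 = off; 2 = on with capped max_thresh,
                                 per the comment in the hunk above           */
  int show_frame;
};

static void maybe_enable_breakout(struct enc_ctx *c, double zero_motion_min) {
  c->gf_zeromotion_pct = (int)(zero_motion_min * 1000.0);
  if (c->gf_zeromotion_pct > 995)
    c->enable_encode_breakout = c->show_frame ? 2 : 0;
}
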
diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c
index 154d31a..5a671f2 100644
--- a/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -40,14 +40,15 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
(cpi->speed < 8 ? (cpi->speed > 5 ? 1 : 0) : 2);
step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2));
- vp9_clamp_mv_min_max(x, ref_mv);
+ vp9_clamp_mv_min_max(x, &ref_mv->as_mv);
ref_full.as_mv.col = ref_mv->as_mv.col >> 3;
ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
/*cpi->sf.search_method == HEX*/
- best_err = vp9_hex_search(x, &ref_full, dst_mv, step_param, x->errorperbit,
- &v_fn_ptr, NULL, NULL, NULL, NULL, ref_mv);
+ best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit,
+ 0, &v_fn_ptr,
+ 0, ref_mv, dst_mv);
// Try sub-pixel MC
// if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -58,7 +59,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
x,
dst_mv, ref_mv,
x->errorperbit, &v_fn_ptr,
- NULL, NULL,
+ 0, cpi->sf.subpel_iters_per_step, NULL, NULL,
& distortion, &sse);
}
@@ -144,7 +145,7 @@ static int find_best_16x16_intra(VP9_COMP *cpi,
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
unsigned int err;
- xd->mode_info_context->mbmi.mode = mode;
+ xd->this_mi->mbmi.mode = mode;
vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode,
x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].dst.buf, xd->plane[0].dst.stride);
@@ -240,9 +241,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
int mb_col, mb_row, offset = 0;
int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
int_mv arf_top_mv, gld_top_mv;
- MODE_INFO mi_local;
-
- vp9_zero(mi_local);
+ MODE_INFO mi_local = { { 0 } };
   // Set up limit values for motion vectors to prevent them from extending outside the UMV borders
arf_top_mv.as_int = 0;
@@ -254,7 +253,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
xd->plane[0].dst.stride = buf->y_stride;
xd->plane[0].pre[0].stride = buf->y_stride;
xd->plane[1].dst.stride = buf->uv_stride;
- xd->mode_info_context = &mi_local;
+ xd->this_mi = &mi_local;
mi_local.mbmi.sb_type = BLOCK_16X16;
mi_local.mbmi.ref_frame[0] = LAST_FRAME;
mi_local.mbmi.ref_frame[1] = NONE;
@@ -308,7 +307,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
static void separate_arf_mbs(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
int mb_col, mb_row, offset, i;
- int ncnt[4];
+ int ncnt[4] = { 0 };
int n_frames = cpi->mbgraph_n_frames;
int *arf_not_zz;
@@ -344,7 +343,6 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
}
}
- vpx_memset(ncnt, 0, sizeof(ncnt));
for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
offset += cm->mb_cols, mb_row++) {
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c
index 88beee7..1360088 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/libvpx/vp9/encoder/vp9_mcomp.c
@@ -8,28 +8,30 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <stdio.h>
#include <limits.h>
#include <math.h>
+#include <stdio.h>
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vpx_mem/vpx_mem.h"
#include "./vpx_config.h"
+
+#include "vpx_mem/vpx_mem.h"
+
#include "vp9/common/vp9_findnearmv.h"
#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_mcomp.h"
+
// #define NEW_DIAMOND_SEARCH
-void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {
- int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL +
- ((ref_mv->as_mv.col & 7) ? 1 : 0);
- int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL +
- ((ref_mv->as_mv.row & 7) ? 1 : 0);
- int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL;
- int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL;
+void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv) {
+ const int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
+ const int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
+ const int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
+ const int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;
- /* Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. */
+ // Get intersection of UMV window and valid MV window to reduce # of checks
+ // in diamond search.
if (x->mv_col_min < col_min)
x->mv_col_min = col_min;
if (x->mv_col_max > col_max)
@@ -245,52 +247,112 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
}, \
v = INT_MAX;)
-int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x,
- int_mv *bestmv, int_mv *ref_mv,
- int error_per_bit,
- const vp9_variance_fn_ptr_t *vfp,
- int *mvjcost, int *mvcost[2],
- int *distortion,
- unsigned int *sse1) {
+#define FIRST_LEVEL_CHECKS \
+ { \
+ unsigned int left, right, up, down, diag; \
+ CHECK_BETTER(left, tr, tc - hstep); \
+ CHECK_BETTER(right, tr, tc + hstep); \
+ CHECK_BETTER(up, tr - hstep, tc); \
+ CHECK_BETTER(down, tr + hstep, tc); \
+ whichdir = (left < right ? 0 : 1) + \
+ (up < down ? 0 : 2); \
+ switch (whichdir) { \
+ case 0: \
+ CHECK_BETTER(diag, tr - hstep, tc - hstep); \
+ break; \
+ case 1: \
+ CHECK_BETTER(diag, tr - hstep, tc + hstep); \
+ break; \
+ case 2: \
+ CHECK_BETTER(diag, tr + hstep, tc - hstep); \
+ break; \
+ case 3: \
+ CHECK_BETTER(diag, tr + hstep, tc + hstep); \
+ break; \
+ } \
+ }
+
+#define SECOND_LEVEL_CHECKS \
+ { \
+ int kr, kc; \
+ unsigned int second; \
+ if (tr != br && tc != bc) { \
+ kr = br - tr; \
+ kc = bc - tc; \
+ CHECK_BETTER(second, tr + kr, tc + 2 * kc); \
+ CHECK_BETTER(second, tr + 2 * kr, tc + kc); \
+ } else if (tr == br && tc != bc) { \
+ kc = bc - tc; \
+ CHECK_BETTER(second, tr + hstep, tc + 2 * kc); \
+ CHECK_BETTER(second, tr - hstep, tc + 2 * kc); \
+ switch (whichdir) { \
+ case 0: \
+ case 1: \
+ CHECK_BETTER(second, tr + hstep, tc + kc); \
+ break; \
+ case 2: \
+ case 3: \
+ CHECK_BETTER(second, tr - hstep, tc + kc); \
+ break; \
+ } \
+ } else if (tr != br && tc == bc) { \
+ kr = br - tr; \
+ CHECK_BETTER(second, tr + 2 * kr, tc + hstep); \
+ CHECK_BETTER(second, tr + 2 * kr, tc - hstep); \
+ switch (whichdir) { \
+ case 0: \
+ case 2: \
+ CHECK_BETTER(second, tr + kr, tc + hstep); \
+ break; \
+ case 1: \
+ case 3: \
+ CHECK_BETTER(second, tr + kr, tc - hstep); \
+ break; \
+ } \
+ } \
+ }
+
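Both macros key off the same two-bit direction code computed from the four axis probes. A standalone illustration of the encoding (the cost values are arbitrary):

#include <stdio.h>

int main(void) {
  /* Pretend costs from the left/right/up/down probes. */
  unsigned left = 10, right = 12, up = 9, down = 11;
  /* Bit 0 records which horizontal side won, bit 1 which vertical side:
   * 0 = up-left, 1 = up-right, 2 = down-left, 3 = down-right, i.e. the
   * diagonal between the two winning axis probes. */
  unsigned whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
  printf("whichdir = %u\n", whichdir);  /* 0: the up-left diagonal is tried */
  return 0;
}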
+int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ int forced_stop,
+ int iters_per_step,
+ int *mvjcost, int *mvcost[2],
+ int *distortion,
+ unsigned int *sse1) {
uint8_t *z = x->plane[0].src.buf;
int src_stride = x->plane[0].src.stride;
MACROBLOCKD *xd = &x->e_mbd;
- int rr, rc, br, bc, hstep;
- int tr, tc;
unsigned int besterr = INT_MAX;
- unsigned int left, right, up, down, diag;
unsigned int sse;
unsigned int whichdir;
- unsigned int halfiters = 4;
- unsigned int quarteriters = 4;
- unsigned int eighthiters = 4;
+ unsigned int halfiters = iters_per_step;
+ unsigned int quarteriters = iters_per_step;
+ unsigned int eighthiters = iters_per_step;
int thismse;
- int maxc, minc, maxr, minr;
- int y_stride;
- int offset;
uint8_t *y = xd->plane[0].pre[0].buf +
(bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
bestmv->as_mv.col;
- y_stride = xd->plane[0].pre[0].stride;
-
- rr = ref_mv->as_mv.row;
- rc = ref_mv->as_mv.col;
- br = bestmv->as_mv.row << 3;
- bc = bestmv->as_mv.col << 3;
- hstep = 4;
- minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1));
- maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1));
- minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1));
- maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1));
+ const int y_stride = xd->plane[0].pre[0].stride;
- tr = br;
- tc = bc;
+ int rr = ref_mv->as_mv.row;
+ int rc = ref_mv->as_mv.col;
+ int br = bestmv->as_mv.row << 3;
+ int bc = bestmv->as_mv.col << 3;
+ int hstep = 4;
+ const int minc = MAX(x->mv_col_min << 3, ref_mv->as_mv.col - MV_MAX);
+ const int maxc = MIN(x->mv_col_max << 3, ref_mv->as_mv.col + MV_MAX);
+ const int minr = MAX(x->mv_row_min << 3, ref_mv->as_mv.row - MV_MAX);
+ const int maxr = MIN(x->mv_row_max << 3, ref_mv->as_mv.row + MV_MAX);
+ int tr = br;
+ int tc = bc;
- offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+ const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
// central mv
bestmv->as_mv.row <<= 3;
@@ -303,105 +365,45 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x,
  // TODO: Each subsequent iteration checks at least one point in common with
  // the last iteration; it could be two if the diagonal was selected.
- while (--halfiters) {
+ while (halfiters--) {
// 1/2 pel
- CHECK_BETTER(left, tr, tc - hstep);
- CHECK_BETTER(right, tr, tc + hstep);
- CHECK_BETTER(up, tr - hstep, tc);
- CHECK_BETTER(down, tr + hstep, tc);
-
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
- switch (whichdir) {
- case 0:
- CHECK_BETTER(diag, tr - hstep, tc - hstep);
- break;
- case 1:
- CHECK_BETTER(diag, tr - hstep, tc + hstep);
- break;
- case 2:
- CHECK_BETTER(diag, tr + hstep, tc - hstep);
- break;
- case 3:
- CHECK_BETTER(diag, tr + hstep, tc + hstep);
- break;
- }
-
+ FIRST_LEVEL_CHECKS;
// no reason to check the same one again.
if (tr == br && tc == bc)
break;
-
tr = br;
tc = bc;
}
  // TODO: Each subsequent iteration checks at least one point in common with
  // the last iteration (could be two if the diagonal was selected). 1/4 pel
- hstep >>= 1;
- while (--quarteriters) {
- CHECK_BETTER(left, tr, tc - hstep);
- CHECK_BETTER(right, tr, tc + hstep);
- CHECK_BETTER(up, tr - hstep, tc);
- CHECK_BETTER(down, tr + hstep, tc);
-
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
- switch (whichdir) {
- case 0:
- CHECK_BETTER(diag, tr - hstep, tc - hstep);
- break;
- case 1:
- CHECK_BETTER(diag, tr - hstep, tc + hstep);
- break;
- case 2:
- CHECK_BETTER(diag, tr + hstep, tc - hstep);
- break;
- case 3:
- CHECK_BETTER(diag, tr + hstep, tc + hstep);
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ while (quarteriters--) {
+ FIRST_LEVEL_CHECKS;
+ // no reason to check the same one again.
+ if (tr == br && tc == bc)
break;
+ tr = br;
+ tc = bc;
}
-
- // no reason to check the same one again.
- if (tr == br && tc == bc)
- break;
-
- tr = br;
- tc = bc;
}
- if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv)) {
+ if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
+ forced_stop == 0) {
hstep >>= 1;
- while (--eighthiters) {
- CHECK_BETTER(left, tr, tc - hstep);
- CHECK_BETTER(right, tr, tc + hstep);
- CHECK_BETTER(up, tr - hstep, tc);
- CHECK_BETTER(down, tr + hstep, tc);
-
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
- switch (whichdir) {
- case 0:
- CHECK_BETTER(diag, tr - hstep, tc - hstep);
- break;
- case 1:
- CHECK_BETTER(diag, tr - hstep, tc + hstep);
- break;
- case 2:
- CHECK_BETTER(diag, tr + hstep, tc - hstep);
- break;
- case 3:
- CHECK_BETTER(diag, tr + hstep, tc + hstep);
- break;
- }
-
+ while (eighthiters--) {
+ FIRST_LEVEL_CHECKS;
// no reason to check the same one again.
if (tr == br && tc == bc)
break;
-
tr = br;
tc = bc;
}
}
+
bestmv->as_mv.row = br;
bestmv->as_mv.col = bc;
@@ -412,39 +414,31 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x,
return besterr;
}
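The new forced_stop and iters_per_step parameters bound the descent through the half/quarter/eighth-pel levels. A hedged standalone sketch of that schedule (printf stands in for the CHECK_BETTER passes):

#include <stdio.h>

/* forced_stop: 0 - refine to 1/8 pel, 1 - stop after 1/4 pel,
 * 2 - stop after 1/2 pel, mirroring the comment in the code above. */
static void schedule(int forced_stop, int allow_hp, int iters_per_step) {
  int hstep = 4;  /* step size in 1/8-pel units: 4 = half pel */
  printf("half pel: up to %d iters, hstep=%d\n", iters_per_step, hstep);
  if (forced_stop != 2) {
    hstep >>= 1;
    printf("quarter pel: up to %d iters, hstep=%d\n", iters_per_step, hstep);
  }
  if (allow_hp && forced_stop == 0) {
    hstep >>= 1;
    printf("eighth pel: up to %d iters, hstep=%d\n", iters_per_step, hstep);
  }
}

int main(void) {
  schedule(0, 1, 2);  /* full refinement, high-precision MVs allowed */
  return 0;
}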
-#undef DIST
-/* returns subpixel variance error function */
-#define DIST(r, c) \
- vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \
- z, src_stride, &sse, second_pred)
-
-int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
+ int forced_stop,
+ int iters_per_step,
int *mvjcost, int *mvcost[2],
int *distortion,
- unsigned int *sse1,
- const uint8_t *second_pred, int w, int h) {
+ unsigned int *sse1) {
uint8_t *z = x->plane[0].src.buf;
int src_stride = x->plane[0].src.stride;
MACROBLOCKD *xd = &x->e_mbd;
-
int rr, rc, br, bc, hstep;
int tr, tc;
unsigned int besterr = INT_MAX;
- unsigned int left, right, up, down, diag;
unsigned int sse;
unsigned int whichdir;
- unsigned int halfiters = 4;
- unsigned int quarteriters = 4;
- unsigned int eighthiters = 4;
int thismse;
int maxc, minc, maxr, minr;
int y_stride;
int offset;
+ unsigned int halfiters = iters_per_step;
+ unsigned int quarteriters = iters_per_step;
+ unsigned int eighthiters = iters_per_step;
- DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
uint8_t *y = xd->plane[0].pre[0].buf +
(bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
bestmv->as_mv.col;
@@ -456,19 +450,18 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
br = bestmv->as_mv.row << 3;
bc = bestmv->as_mv.col << 3;
hstep = 4;
- minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
- ((1 << MV_MAX_BITS) - 1));
- maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
- ((1 << MV_MAX_BITS) - 1));
- minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
- ((1 << MV_MAX_BITS) - 1));
- maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
- ((1 << MV_MAX_BITS) - 1));
+ minc = MAX(x->mv_col_min << 3,
+ (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1));
+ maxc = MIN(x->mv_col_max << 3,
+ (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1));
+ minr = MAX(x->mv_row_min << 3,
+ (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1));
+ maxr = MIN(x->mv_row_max << 3,
+ (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1));
tr = br;
tc = bc;
-
offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
// central mv
@@ -476,114 +469,40 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
bestmv->as_mv.col <<= 3;
// calculate central point error
- // TODO(yunqingwang): central pointer error was already calculated in full-
- // pixel search, and can be passed in this function.
- comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
- besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+ besterr = vfp->vf(y, y_stride, z, src_stride, sse1);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
- // Each subsequent iteration checks at least one point in
- // common with the last iteration could be 2 ( if diag selected)
- while (--halfiters) {
- // 1/2 pel
- CHECK_BETTER(left, tr, tc - hstep);
- CHECK_BETTER(right, tr, tc + hstep);
- CHECK_BETTER(up, tr - hstep, tc);
- CHECK_BETTER(down, tr + hstep, tc);
-
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+ // 1/2 pel
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
- switch (whichdir) {
- case 0:
- CHECK_BETTER(diag, tr - hstep, tc - hstep);
- break;
- case 1:
- CHECK_BETTER(diag, tr - hstep, tc + hstep);
- break;
- case 2:
- CHECK_BETTER(diag, tr + hstep, tc - hstep);
- break;
- case 3:
- CHECK_BETTER(diag, tr + hstep, tc + hstep);
- break;
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
}
-
- // no reason to check the same one again.
- if (tr == br && tc == bc)
- break;
-
tr = br;
tc = bc;
}
- // Each subsequent iteration checks at least one point in common with
- // the last iteration could be 2 ( if diag selected) 1/4 pel
- hstep >>= 1;
- while (--quarteriters) {
- CHECK_BETTER(left, tr, tc - hstep);
- CHECK_BETTER(right, tr, tc + hstep);
- CHECK_BETTER(up, tr - hstep, tc);
- CHECK_BETTER(down, tr + hstep, tc);
-
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
- switch (whichdir) {
- case 0:
- CHECK_BETTER(diag, tr - hstep, tc - hstep);
- break;
- case 1:
- CHECK_BETTER(diag, tr - hstep, tc + hstep);
- break;
- case 2:
- CHECK_BETTER(diag, tr + hstep, tc - hstep);
- break;
- case 3:
- CHECK_BETTER(diag, tr + hstep, tc + hstep);
- break;
+ if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
+ forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
}
-
- // no reason to check the same one again.
- if (tr == br && tc == bc)
- break;
-
tr = br;
tc = bc;
}
- if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv)) {
- hstep >>= 1;
- while (--eighthiters) {
- CHECK_BETTER(left, tr, tc - hstep);
- CHECK_BETTER(right, tr, tc + hstep);
- CHECK_BETTER(up, tr - hstep, tc);
- CHECK_BETTER(down, tr + hstep, tc);
-
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
- switch (whichdir) {
- case 0:
- CHECK_BETTER(diag, tr - hstep, tc - hstep);
- break;
- case 1:
- CHECK_BETTER(diag, tr - hstep, tc + hstep);
- break;
- case 2:
- CHECK_BETTER(diag, tr + hstep, tc - hstep);
- break;
- case 3:
- CHECK_BETTER(diag, tr + hstep, tc + hstep);
- break;
- }
-
- // no reason to check the same one again.
- if (tr == br && tc == bc)
- break;
-
- tr = br;
- tc = bc;
- }
- }
bestmv->as_mv.row = br;
bestmv->as_mv.col = bc;
@@ -594,636 +513,236 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
return besterr;
}
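Unlike the _iterative variant, the _tree search above runs a fixed two-pass probe per precision level instead of looping to convergence. In sketch form (probe() stands in for a batch of CHECK_BETTER calls):

#include <stdio.h>

static void probe(const char *what) { printf("probe: %s\n", what); }

/* One precision level of the tree-style search: one first-level pass over
 * the four axis points plus a diagonal, and one optional second-level pass
 * between the old and new best points. Work per level is fixed, so there
 * is no data-dependent convergence loop. */
static void tree_level(int iters_per_step) {
  probe("first level: 4 axis points + 1 diagonal");
  if (iters_per_step > 1)
    probe("second level: points between previous and current best");
}

int main(void) {
  tree_level(2);
  return 0;
}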
-
-#undef MVC
-#undef PRE
#undef DIST
-#undef IFMVCV
-#undef CHECK_BETTER
-#undef MIN
-#undef MAX
+/* returns subpixel variance error function */
+#define DIST(r, c) \
+ vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \
+ z, src_stride, &sse, second_pred)
-int vp9_find_best_sub_pixel_step(MACROBLOCK *x,
- int_mv *bestmv, int_mv *ref_mv,
- int error_per_bit,
- const vp9_variance_fn_ptr_t *vfp,
- int *mvjcost, int *mvcost[2], int *distortion,
- unsigned int *sse1) {
- int bestmse = INT_MAX;
- int_mv startmv;
- int_mv this_mv;
- int_mv orig_mv;
- int yrow_movedback = 0, ycol_movedback = 0;
- uint8_t *z = x->plane[0].src.buf;
- int src_stride = x->plane[0].src.stride;
- int left, right, up, down, diag;
+int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ int forced_stop,
+ int iters_per_step,
+ int *mvjcost, int *mvcost[2],
+ int *distortion,
+ unsigned int *sse1,
+ const uint8_t *second_pred,
+ int w, int h) {
+ uint8_t *const z = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ unsigned int besterr = INT_MAX;
unsigned int sse;
- int whichdir;
+ unsigned int whichdir;
+ unsigned int halfiters = iters_per_step;
+ unsigned int quarteriters = iters_per_step;
+ unsigned int eighthiters = iters_per_step;
int thismse;
- int y_stride;
- MACROBLOCKD *xd = &x->e_mbd;
- uint8_t *y = xd->plane[0].pre[0].buf +
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
+ uint8_t *const y = xd->plane[0].pre[0].buf +
(bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
bestmv->as_mv.col;
- y_stride = xd->plane[0].pre[0].stride;
-
- // central mv
- bestmv->as_mv.row <<= 3;
- bestmv->as_mv.col <<= 3;
- startmv = *bestmv;
- orig_mv = *bestmv;
-
- // calculate central point error
- bestmse = vfp->vf(y, y_stride, z, src_stride, sse1);
- *distortion = bestmse;
- bestmse += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-
- // go left then right and check error
- this_mv.as_mv.row = startmv.as_mv.row;
- this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
- thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, src_stride, &sse);
- left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (left < bestmse) {
- *bestmv = this_mv;
- bestmse = left;
- *distortion = thismse;
- *sse1 = sse;
- }
- this_mv.as_mv.col += 8;
- thismse = vfp->svf_halfpix_h(y, y_stride, z, src_stride, &sse);
- right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (right < bestmse) {
- *bestmv = this_mv;
- bestmse = right;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- // go up then down and check error
- this_mv.as_mv.col = startmv.as_mv.col;
- this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
- thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, src_stride, &sse);
- up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
-
- if (up < bestmse) {
- *bestmv = this_mv;
- bestmse = up;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- this_mv.as_mv.row += 8;
- thismse = vfp->svf_halfpix_v(y, y_stride, z, src_stride, &sse);
- down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (down < bestmse) {
- *bestmv = this_mv;
- bestmse = down;
- *distortion = thismse;
- *sse1 = sse;
- }
+ const int y_stride = xd->plane[0].pre[0].stride;
+ int rr = ref_mv->as_mv.row;
+ int rc = ref_mv->as_mv.col;
+ int br = bestmv->as_mv.row << 3;
+ int bc = bestmv->as_mv.col << 3;
+ int hstep = 4;
+ const int minc = MAX(x->mv_col_min << 3, ref_mv->as_mv.col - MV_MAX);
+ const int maxc = MIN(x->mv_col_max << 3, ref_mv->as_mv.col + MV_MAX);
+ const int minr = MAX(x->mv_row_min << 3, ref_mv->as_mv.row - MV_MAX);
+ const int maxr = MIN(x->mv_row_max << 3, ref_mv->as_mv.row + MV_MAX);
- // now check 1 more diagonal
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
- // for(whichdir =0;whichdir<4;whichdir++)
- // {
- this_mv = startmv;
+ int tr = br;
+ int tc = bc;
- switch (whichdir) {
- case 0:
- this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
- this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
- thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, src_stride,
- &sse);
- break;
- case 1:
- this_mv.as_mv.col += 4;
- this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
- thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, src_stride,
- &sse);
- break;
- case 2:
- this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
- this_mv.as_mv.row += 4;
- thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, src_stride, &sse);
- break;
- case 3:
- default:
- this_mv.as_mv.col += 4;
- this_mv.as_mv.row += 4;
- thismse = vfp->svf_halfpix_hv(y, y_stride, z, src_stride, &sse);
- break;
- }
-
- diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (diag < bestmse) {
- *bestmv = this_mv;
- bestmse = diag;
- *distortion = thismse;
- *sse1 = sse;
- }
-
-// }
-
-
- // time to check quarter pels.
- if (bestmv->as_mv.row < startmv.as_mv.row) {
- y -= y_stride;
- yrow_movedback = 1;
- }
-
- if (bestmv->as_mv.col < startmv.as_mv.col) {
- y--;
- ycol_movedback = 1;
- }
+ const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
- startmv = *bestmv;
-
-
-
- // go left then right and check error
- this_mv.as_mv.row = startmv.as_mv.row;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col = startmv.as_mv.col - 2;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
- thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
- src_stride, &sse);
- }
-
- left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (left < bestmse) {
- *bestmv = this_mv;
- bestmse = left;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- this_mv.as_mv.col += 4;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (right < bestmse) {
- *bestmv = this_mv;
- bestmse = right;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- // go up then down and check error
- this_mv.as_mv.col = startmv.as_mv.col;
-
- if (startmv.as_mv.row & 7) {
- this_mv.as_mv.row = startmv.as_mv.row - 2;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- } else {
- this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
- thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6),
- z, src_stride, &sse);
- }
-
- up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
-
- if (up < bestmse) {
- *bestmv = this_mv;
- bestmse = up;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- this_mv.as_mv.row += 4;
- thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
-
- if (down < bestmse) {
- *bestmv = this_mv;
- bestmse = down;
- *distortion = thismse;
- *sse1 = sse;
- }
-
-
- // now check 1 more diagonal
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-// for(whichdir=0;whichdir<4;whichdir++)
-// {
- this_mv = startmv;
-
- switch (whichdir) {
- case 0:
-
- if (startmv.as_mv.row & 7) {
- this_mv.as_mv.row -= 2;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col -= 2;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
- thismse = vfp->svf(y - 1, y_stride,
- SP(6), SP(this_mv.as_mv.row), z, src_stride, &sse);
- }
- } else {
- this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col -= 2;
- thismse = vfp->svf(y - y_stride, y_stride,
- SP(this_mv.as_mv.col), SP(6), z, src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
- thismse = vfp->svf(y - y_stride - 1, y_stride,
- SP(6), SP(6), z, src_stride, &sse);
- }
- }
-
- break;
- case 1:
- this_mv.as_mv.col += 2;
-
- if (startmv.as_mv.row & 7) {
- this_mv.as_mv.row -= 2;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- } else {
- this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
- thismse = vfp->svf(y - y_stride, y_stride,
- SP(this_mv.as_mv.col), SP(6), z, src_stride, &sse);
- }
+ // central mv
+ bestmv->as_mv.row <<= 3;
+ bestmv->as_mv.col <<= 3;
- break;
- case 2:
- this_mv.as_mv.row += 2;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col -= 2;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
- thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
- src_stride, &sse);
- }
+ // calculate central point error
+  // TODO(yunqingwang): the central point error was already calculated in the
+  // full-pixel search, and can be passed into this function.
+ comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+ besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (it could be two if the diagonal was selected).
+ while (halfiters--) {
+ // 1/2 pel
+ FIRST_LEVEL_CHECKS;
+ // no reason to check the same one again.
+ if (tr == br && tc == bc)
break;
- case 3:
- this_mv.as_mv.col += 2;
- this_mv.as_mv.row += 2;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- break;
- }
-
- diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (diag < bestmse) {
- *bestmv = this_mv;
- bestmse = diag;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- if (!(xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv)))
- return bestmse;
-
- /* Now do 1/8th pixel */
- if (bestmv->as_mv.row < orig_mv.as_mv.row && !yrow_movedback) {
- y -= y_stride;
- yrow_movedback = 1;
- }
-
- if (bestmv->as_mv.col < orig_mv.as_mv.col && !ycol_movedback) {
- y--;
- ycol_movedback = 1;
- }
-
- startmv = *bestmv;
-
- // go left then right and check error
- this_mv.as_mv.row = startmv.as_mv.row;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col = startmv.as_mv.col - 1;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
- thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- }
-
- left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (left < bestmse) {
- *bestmv = this_mv;
- bestmse = left;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- this_mv.as_mv.col += 2;
- thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (right < bestmse) {
- *bestmv = this_mv;
- bestmse = right;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- // go up then down and check error
- this_mv.as_mv.col = startmv.as_mv.col;
-
- if (startmv.as_mv.row & 7) {
- this_mv.as_mv.row = startmv.as_mv.row - 1;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- } else {
- this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
- thismse = vfp->svf(y - y_stride, y_stride,
- SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse);
+ tr = br;
+ tc = bc;
}
- up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
-
- if (up < bestmse) {
- *bestmv = this_mv;
- bestmse = up;
- *distortion = thismse;
- *sse1 = sse;
- }
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (could be two if the diagonal was selected). 1/4 pel
- this_mv.as_mv.row += 2;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (down < bestmse) {
- *bestmv = this_mv;
- bestmse = down;
- *distortion = thismse;
- *sse1 = sse;
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ while (quarteriters--) {
+ FIRST_LEVEL_CHECKS;
+ // no reason to check the same one again.
+ if (tr == br && tc == bc)
+ break;
+ tr = br;
+ tc = bc;
+ }
}
- // now check 1 more diagonal
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-// for(whichdir=0;whichdir<4;whichdir++)
-// {
- this_mv = startmv;
-
- switch (whichdir) {
- case 0:
-
- if (startmv.as_mv.row & 7) {
- this_mv.as_mv.row -= 1;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col -= 1;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
- thismse = vfp->svf(y - 1, y_stride,
- SP(7), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- }
- } else {
- this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col -= 1;
- thismse = vfp->svf(y - y_stride, y_stride,
- SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
- thismse = vfp->svf(y - y_stride - 1, y_stride,
- SP(7), SP(7), z, src_stride, &sse);
- }
- }
-
- break;
- case 1:
- this_mv.as_mv.col += 1;
-
- if (startmv.as_mv.row & 7) {
- this_mv.as_mv.row -= 1;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- } else {
- this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
- thismse = vfp->svf(y - y_stride, y_stride,
- SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse);
- }
-
- break;
- case 2:
- this_mv.as_mv.row += 1;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col -= 1;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
- thismse = vfp->svf(y - 1, y_stride,
- SP(7), SP(this_mv.as_mv.row), z, src_stride, &sse);
- }
-
- break;
- case 3:
- this_mv.as_mv.col += 1;
- this_mv.as_mv.row += 1;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, src_stride, &sse);
- break;
+ if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
+ forced_stop == 0) {
+ hstep >>= 1;
+ while (eighthiters--) {
+ FIRST_LEVEL_CHECKS;
+ // no reason to check the same one again.
+ if (tr == br && tc == bc)
+ break;
+ tr = br;
+ tc = bc;
+ }
}
+ bestmv->as_mv.row = br;
+ bestmv->as_mv.col = bc;
- diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (diag < bestmse) {
- *bestmv = this_mv;
- bestmse = diag;
- *distortion = thismse;
- *sse1 = sse;
- }
+ if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+ return INT_MAX;
- return bestmse;
+ return besterr;
}
-#undef SP
-
-int vp9_find_best_half_pixel_step(MACROBLOCK *x,
- int_mv *bestmv, int_mv *ref_mv,
- int error_per_bit,
- const vp9_variance_fn_ptr_t *vfp,
- int *mvjcost, int *mvcost[2],
- int *distortion,
- unsigned int *sse1) {
- int bestmse = INT_MAX;
- int_mv startmv;
- int_mv this_mv;
+int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ int forced_stop,
+ int iters_per_step,
+ int *mvjcost, int *mvcost[2],
+ int *distortion,
+ unsigned int *sse1,
+ const uint8_t *second_pred,
+ int w, int h) {
uint8_t *z = x->plane[0].src.buf;
int src_stride = x->plane[0].src.stride;
- int left, right, up, down, diag;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int rr, rc, br, bc, hstep;
+ int tr, tc;
+ unsigned int besterr = INT_MAX;
unsigned int sse;
- int whichdir;
+ unsigned int whichdir;
int thismse;
+ int maxc, minc, maxr, minr;
int y_stride;
- MACROBLOCKD *xd = &x->e_mbd;
+ int offset;
+ unsigned int halfiters = iters_per_step;
+ unsigned int quarteriters = iters_per_step;
+ unsigned int eighthiters = iters_per_step;
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
uint8_t *y = xd->plane[0].pre[0].buf +
- (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + bestmv->as_mv.col;
+ (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+ bestmv->as_mv.col;
+
y_stride = xd->plane[0].pre[0].stride;
+ rr = ref_mv->as_mv.row;
+ rc = ref_mv->as_mv.col;
+ br = bestmv->as_mv.row << 3;
+ bc = bestmv->as_mv.col << 3;
+ hstep = 4;
+ minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
+ ((1 << MV_MAX_BITS) - 1));
+ maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
+ ((1 << MV_MAX_BITS) - 1));
+ minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
+ ((1 << MV_MAX_BITS) - 1));
+ maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
+ ((1 << MV_MAX_BITS) - 1));
+
+ tr = br;
+ tc = bc;
+
+
+ offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
// central mv
bestmv->as_mv.row <<= 3;
bestmv->as_mv.col <<= 3;
- startmv = *bestmv;
// calculate central point error
- bestmse = vfp->vf(y, y_stride, z, src_stride, sse1);
- *distortion = bestmse;
- bestmse += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-
- // go left then right and check error
- this_mv.as_mv.row = startmv.as_mv.row;
- this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
- thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, src_stride, &sse);
- left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (left < bestmse) {
- *bestmv = this_mv;
- bestmse = left;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- this_mv.as_mv.col += 8;
- thismse = vfp->svf_halfpix_h(y, y_stride, z, src_stride, &sse);
- right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (right < bestmse) {
- *bestmv = this_mv;
- bestmse = right;
- *distortion = thismse;
- *sse1 = sse;
- }
+  // TODO(yunqingwang): the central point error was already calculated in the
+  // full-pixel search, and can be passed into this function.
+ comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+ besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
- // go up then down and check error
- this_mv.as_mv.col = startmv.as_mv.col;
- this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
- thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, src_stride, &sse);
- up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
-
- if (up < bestmse) {
- *bestmv = this_mv;
- bestmse = up;
- *distortion = thismse;
- *sse1 = sse;
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (could be two if the diagonal was selected).
+  // 1/2 pel
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
}
+ tr = br;
+ tc = bc;
- this_mv.as_mv.row += 8;
- thismse = vfp->svf_halfpix_v(y, y_stride, z, src_stride, &sse);
- down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (could be two if the diagonal was selected). 1/4 pel
- if (down < bestmse) {
- *bestmv = this_mv;
- bestmse = down;
- *distortion = thismse;
- *sse1 = sse;
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
}
- // now check 1 more diagonal -
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
- this_mv = startmv;
-
- switch (whichdir) {
- case 0:
- this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
- this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
- thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride,
- z, src_stride, &sse);
- break;
- case 1:
- this_mv.as_mv.col += 4;
- this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
- thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride,
- z, src_stride, &sse);
- break;
- case 2:
- this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
- this_mv.as_mv.row += 4;
- thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, src_stride, &sse);
- break;
- case 3:
- default:
- this_mv.as_mv.col += 4;
- this_mv.as_mv.row += 4;
- thismse = vfp->svf_halfpix_hv(y, y_stride, z, src_stride, &sse);
- break;
+ if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
+ forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
}
+ bestmv->as_mv.row = br;
+ bestmv->as_mv.col = bc;
- diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (diag < bestmse) {
- *bestmv = this_mv;
- bestmse = diag;
- *distortion = thismse;
- *sse1 = sse;
- }
+ if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+ return INT_MAX;
- return bestmse;
+ return besterr;
}
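For the compound variants, the error is measured against the average of the two predictors, which is what comp_avg_pred builds above. A simplified standalone sketch of that averaging step (one row only):

#include <stdio.h>

/* Average two predictors with rounding, roughly what comp_avg_pred does
 * for the compound (two-reference) sub-pixel search. */
static void avg_pred(unsigned char *comp, const unsigned char *p1,
                     const unsigned char *p2, int n) {
  int i;
  for (i = 0; i < n; ++i)
    comp[i] = (unsigned char)((p1[i] + p2[i] + 1) >> 1);
}

int main(void) {
  const unsigned char a[4] = {10, 20, 30, 40}, b[4] = {20, 20, 31, 40};
  unsigned char c[4];
  avg_pred(c, a, b, 4);
  printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]);
  return 0;
}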
+#undef MVC
+#undef PRE
+#undef DIST
+#undef IFMVCV
+#undef CHECK_BETTER
+#undef SP
+
#define CHECK_BOUNDS(range) \
{\
all_in = 1;\
@@ -1245,8 +764,10 @@ int vp9_find_best_half_pixel_step(MACROBLOCK *x,
{\
if (thissad < bestsad)\
{\
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, \
- sad_per_bit);\
+ if (use_mvcost) \
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, \
+ mvjsadcost, mvsadcost, \
+ sad_per_bit);\
if (thissad < bestsad)\
{\
bestsad = thissad;\
@@ -1255,46 +776,53 @@ int vp9_find_best_half_pixel_step(MACROBLOCK *x,
}\
}
-static const MV next_chkpts[6][3] = {
- {{ -2, 0}, { -1, -2}, {1, -2}},
- {{ -1, -2}, {1, -2}, {2, 0}},
- {{1, -2}, {2, 0}, {1, 2}},
- {{2, 0}, {1, 2}, { -1, 2}},
- {{1, 2}, { -1, 2}, { -2, 0}},
- {{ -1, 2}, { -2, 0}, { -1, -2}}
-};
-
-int vp9_hex_search
-(
- MACROBLOCK *x,
- int_mv *ref_mv,
- int_mv *best_mv,
- int search_param,
- int sad_per_bit,
- const vp9_variance_fn_ptr_t *vfp,
- int *mvjsadcost, int *mvsadcost[2],
- int *mvjcost, int *mvcost[2],
- int_mv *center_mv
-) {
+#define get_next_chkpts(list, i, n) \
+ list[0] = ((i) == 0 ? (n) - 1 : (i) - 1); \
+ list[1] = (i); \
+ list[2] = ((i) == (n) - 1 ? 0 : (i) + 1);
+
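get_next_chkpts treats each scale's candidate list as a ring: after moving to candidate i, only i-1, i, and i+1 (mod n) need re-checking. A standalone version of the macro:

#include <stdio.h>

static void get_next_chkpts(int list[3], int i, int n) {
  list[0] = (i == 0) ? n - 1 : i - 1;
  list[1] = i;
  list[2] = (i == n - 1) ? 0 : i + 1;
}

int main(void) {
  int list[3];
  get_next_chkpts(list, 0, 6);  /* wraps around the 6-point hex ring */
  printf("%d %d %d\n", list[0], list[1], list[2]);  /* prints: 5 0 1 */
  return 0;
}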
+#define MAX_PATTERN_SCALES 11
+#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale
+#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates
+
+// Generic pattern search function that searches over multiple scales.
+// Each scale can have a different number and shape of candidates, as
+// indicated by the num_candidates and candidates arrays passed into
+// this function.
+static int vp9_pattern_search(MACROBLOCK *x,
+ int_mv *ref_mv,
+ int search_param,
+ int sad_per_bit,
+ int do_init_search,
+ int do_refine,
+ const vp9_variance_fn_ptr_t *vfp,
+ int use_mvcost,
+ int_mv *center_mv, int_mv *best_mv,
+ const int num_candidates[MAX_PATTERN_SCALES],
+ const MV candidates[MAX_PATTERN_SCALES]
+ [MAX_PATTERN_CANDIDATES]) {
const MACROBLOCKD* const xd = &x->e_mbd;
- MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} };
- MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
- int i, j;
-
+ static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ };
+ int i, j, s, t;
uint8_t *what = x->plane[0].src.buf;
int what_stride = x->plane[0].src.stride;
int in_what_stride = xd->plane[0].pre[0].stride;
int br, bc;
int_mv this_mv;
- unsigned int bestsad = 0x7fffffff;
- unsigned int thissad;
+ int bestsad = INT_MAX;
+ int thissad;
uint8_t *base_offset;
uint8_t *this_offset;
int k = -1;
int all_in;
int best_site = -1;
-
int_mv fcenter_mv;
+ int best_init_s = search_param_to_steps[search_param];
+ int *mvjsadcost = x->nmvjointsadcost;
+ int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
@@ -1306,7 +834,7 @@ int vp9_hex_search
// Work out the start point for the search
base_offset = (uint8_t *)(xd->plane[0].pre[0].buf);
- this_offset = base_offset + (br * (xd->plane[0].pre[0].stride)) + bc;
+ this_offset = base_offset + (br * in_what_stride) + bc;
this_mv.as_mv.row = br;
this_mv.as_mv.col = bc;
bestsad = vfp->sdf(what, what_stride, this_offset,
@@ -1314,109 +842,310 @@ int vp9_hex_search
+ mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost,
sad_per_bit);
- // hex search
- // j=0
- CHECK_BOUNDS(2)
-
- if (all_in) {
- for (i = 0; i < 6; i++) {
- this_mv.as_mv.row = br + hex[i].row;
- this_mv.as_mv.col = bc + hex[i].col;
- this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
- CHECK_BETTER
+  // Search all possible scales up to the search param around the center
+  // point, and pick the scale of the best point as the starting scale for
+  // further steps around it.
+ if (do_init_search) {
+ s = best_init_s;
+ best_init_s = -1;
+ for (t = 0; t <= s; ++t) {
+ best_site = -1;
+ CHECK_BOUNDS((1 << t))
+ if (all_in) {
+ for (i = 0; i < num_candidates[t]; i++) {
+ this_mv.as_mv.row = br + candidates[t][i].row;
+ this_mv.as_mv.col = bc + candidates[t][i].col;
+ this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
+ this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+ bestsad);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[t]; i++) {
+ this_mv.as_mv.row = br + candidates[t][i].row;
+ this_mv.as_mv.col = bc + candidates[t][i].col;
+ CHECK_POINT
+ this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
+ this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+ bestsad);
+ CHECK_BETTER
+ }
+ }
+ if (best_site == -1) {
+ continue;
+ } else {
+ best_init_s = t;
+ k = best_site;
+ }
}
- } else {
- for (i = 0; i < 6; i++) {
- this_mv.as_mv.row = br + hex[i].row;
- this_mv.as_mv.col = bc + hex[i].col;
- CHECK_POINT
- this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
- CHECK_BETTER
+ if (best_init_s != -1) {
+ br += candidates[best_init_s][k].row;
+ bc += candidates[best_init_s][k].col;
}
}
- if (best_site == -1)
- goto cal_neighbors;
- else {
- br += hex[best_site].row;
- bc += hex[best_site].col;
- k = best_site;
- }
-
- for (j = 1; j < 127; j++) {
+ // If the center point is still the best, just skip this and move to
+ // the refinement step.
+ if (best_init_s != -1) {
+ s = best_init_s;
best_site = -1;
- CHECK_BOUNDS(2)
+ do {
+      // No need to search all the pattern points the first time around if
+      // the initial search was used.
+ if (!do_init_search || s != best_init_s) {
+ CHECK_BOUNDS((1 << s))
+ if (all_in) {
+ for (i = 0; i < num_candidates[s]; i++) {
+ this_mv.as_mv.row = br + candidates[s][i].row;
+ this_mv.as_mv.col = bc + candidates[s][i].col;
+ this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
+ this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+ bestsad);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[s]; i++) {
+ this_mv.as_mv.row = br + candidates[s][i].row;
+ this_mv.as_mv.col = bc + candidates[s][i].col;
+ CHECK_POINT
+ this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
+ this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+ bestsad);
+ CHECK_BETTER
+ }
+ }
- if (all_in) {
- for (i = 0; i < 3; i++) {
- this_mv.as_mv.row = br + next_chkpts[k][i].row;
- this_mv.as_mv.col = bc + next_chkpts[k][i].col;
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
- CHECK_BETTER
- }
- } else {
- for (i = 0; i < 3; i++) {
- this_mv.as_mv.row = br + next_chkpts[k][i].row;
- this_mv.as_mv.col = bc + next_chkpts[k][i].col;
- CHECK_POINT
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
- CHECK_BETTER
+ if (best_site == -1) {
+ continue;
+ } else {
+ br += candidates[s][best_site].row;
+ bc += candidates[s][best_site].col;
+ k = best_site;
+ }
}
- }
- if (best_site == -1)
- break;
- else {
- br += next_chkpts[k][best_site].row;
- bc += next_chkpts[k][best_site].col;
- k += 5 + best_site;
- if (k >= 12) k -= 12;
- else if (k >= 6) k -= 6;
- }
+ do {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ CHECK_BOUNDS((1 << s))
+
+ get_next_chkpts(next_chkpts_indices, k, num_candidates[s]);
+ if (all_in) {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ this_mv.as_mv.row = br +
+ candidates[s][next_chkpts_indices[i]].row;
+ this_mv.as_mv.col = bc +
+ candidates[s][next_chkpts_indices[i]].col;
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
+ this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+ bestsad);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ this_mv.as_mv.row = br +
+ candidates[s][next_chkpts_indices[i]].row;
+ this_mv.as_mv.col = bc +
+ candidates[s][next_chkpts_indices[i]].col;
+ CHECK_POINT
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
+ this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+ bestsad);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += candidates[s][k].row;
+ bc += candidates[s][k].col;
+ }
+ } while (best_site != -1);
+ } while (s--);
}
- // check 4 1-away neighbors
-cal_neighbors:
- for (j = 0; j < 32; j++) {
- best_site = -1;
- CHECK_BOUNDS(1)
+ // Check 4 1-away neighbors if do_refine is true.
+ // For most well-designed schemes do_refine will not be necessary.
+ if (do_refine) {
+ static const MV neighbors[4] = {
+ {0, -1}, { -1, 0}, {1, 0}, {0, 1},
+ };
+ for (j = 0; j < 16; j++) {
+ best_site = -1;
+ CHECK_BOUNDS(1)
+ if (all_in) {
+ for (i = 0; i < 4; i++) {
+ this_mv.as_mv.row = br + neighbors[i].row;
+ this_mv.as_mv.col = bc + neighbors[i].col;
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
+ this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+ bestsad);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ this_mv.as_mv.row = br + neighbors[i].row;
+ this_mv.as_mv.col = bc + neighbors[i].col;
+ CHECK_POINT
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
+ this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+ bestsad);
+ CHECK_BETTER
+ }
+ }
- if (all_in) {
- for (i = 0; i < 4; i++) {
- this_mv.as_mv.row = br + neighbors[i].row;
- this_mv.as_mv.col = bc + neighbors[i].col;
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
- CHECK_BETTER
- }
- } else {
- for (i = 0; i < 4; i++) {
- this_mv.as_mv.row = br + neighbors[i].row;
- this_mv.as_mv.col = bc + neighbors[i].col;
- CHECK_POINT
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
- CHECK_BETTER
+ if (best_site == -1) {
+ break;
+ } else {
+ br += neighbors[best_site].row;
+ bc += neighbors[best_site].col;
}
}
-
- if (best_site == -1)
- break;
- else {
- br += neighbors[best_site].row;
- bc += neighbors[best_site].col;
- }
}
best_mv->as_mv.row = br;
best_mv->as_mv.col = bc;
- return bestsad;
+ this_offset = base_offset + (best_mv->as_mv.row * (in_what_stride)) +
+ best_mv->as_mv.col;
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+ if (bestsad == INT_MAX)
+ return INT_MAX;
+  // Note: the ternary must be parenthesized; '+' binds tighter than '?:',
+  // so without parentheses the variance would be swallowed by the condition.
+  return vfp->vf(what, what_stride, this_offset, in_what_stride,
+                 (unsigned int *)(&bestsad)) +
+         (use_mvcost ? mv_err_cost(&this_mv, center_mv, x->nmvjointcost,
+                                   x->mvcost, x->errorperbit)
+                     : 0);
+}
+
+
+int vp9_hex_search(MACROBLOCK *x,
+ int_mv *ref_mv,
+ int search_param,
+ int sad_per_bit,
+ int do_init_search,
+ const vp9_variance_fn_ptr_t *vfp,
+ int use_mvcost,
+ int_mv *center_mv, int_mv *best_mv) {
+  // The first scale has the 8 closest points; the rest have 6 points in a
+  // hex shape at increasing scales.
+ static const int hex_num_candidates[MAX_PATTERN_SCALES] = {
+ 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6
+ };
+ // Note that the largest candidate step at each scale is 2^scale
+ static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ {{-1, -1}, {0, -1}, {1, -1}, {1, 0}, {1, 1}, { 0, 1}, { -1, 1}, {-1, 0}},
+ {{-1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0}},
+ {{-2, -4}, {2, -4}, {4, 0}, {2, 4}, { -2, 4}, { -4, 0}},
+ {{-4, -8}, {4, -8}, {8, 0}, {4, 8}, { -4, 8}, { -8, 0}},
+ {{-8, -16}, {8, -16}, {16, 0}, {8, 16}, { -8, 16}, { -16, 0}},
+ {{-16, -32}, {16, -32}, {32, 0}, {16, 32}, { -16, 32}, { -32, 0}},
+ {{-32, -64}, {32, -64}, {64, 0}, {32, 64}, { -32, 64}, { -64, 0}},
+ {{-64, -128}, {64, -128}, {128, 0}, {64, 128}, { -64, 128}, { -128, 0}},
+ {{-128, -256}, {128, -256}, {256, 0}, {128, 256}, { -128, 256}, { -256, 0}},
+ {{-256, -512}, {256, -512}, {512, 0}, {256, 512}, { -256, 512}, { -512, 0}},
+ {{-512, -1024}, {512, -1024}, {1024, 0}, {512, 1024}, { -512, 1024},
+ { -1024, 0}},
+ };
+ return
+ vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
+ do_init_search, 0, vfp, use_mvcost,
+ center_mv, best_mv,
+ hex_num_candidates, hex_candidates);
+}
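The tables above encode one shape per search, doubled at every scale; only scale 0 (the 8 closest points) is hand-written. A standalone check of the 2^scale relationship for the hex shape:

#include <stdio.h>

typedef struct { int row, col; } MV;

int main(void) {
  static const MV hex1[6] = {{-1, -2}, {1, -2}, {2, 0},
                             {1, 2}, {-1, 2}, {-2, 0}};
  int s, i;
  /* Scale s (s >= 1) is the scale-1 hex shifted left by s - 1, so the
   * largest candidate step at scale s is 2^s, matching the tables. */
  for (s = 1; s <= 3; ++s) {
    printf("scale %d:", s);
    for (i = 0; i < 6; ++i)
      printf(" {%d,%d}", hex1[i].row << (s - 1), hex1[i].col << (s - 1));
    printf("\n");
  }
  return 0;
}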
+
+int vp9_bigdia_search(MACROBLOCK *x,
+ int_mv *ref_mv,
+ int search_param,
+ int sad_per_bit,
+ int do_init_search,
+ const vp9_variance_fn_ptr_t *vfp,
+ int use_mvcost,
+ int_mv *center_mv,
+ int_mv *best_mv) {
+  // The first scale has the 4 closest points; the rest have 8 points in a
+  // diamond shape at increasing scales.
+ static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
+ 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+ // Note that the largest candidate step at each scale is 2^scale
+ static const MV bigdia_candidates[MAX_PATTERN_SCALES]
+ [MAX_PATTERN_CANDIDATES] = {
+ {{0, -1}, {1, 0}, { 0, 1}, {-1, 0}},
+ {{-1, -1}, {0, -2}, {1, -1}, {2, 0}, {1, 1}, {0, 2}, {-1, 1}, {-2, 0}},
+ {{-2, -2}, {0, -4}, {2, -2}, {4, 0}, {2, 2}, {0, 4}, {-2, 2}, {-4, 0}},
+ {{-4, -4}, {0, -8}, {4, -4}, {8, 0}, {4, 4}, {0, 8}, {-4, 4}, {-8, 0}},
+ {{-8, -8}, {0, -16}, {8, -8}, {16, 0}, {8, 8}, {0, 16}, {-8, 8}, {-16, 0}},
+ {{-16, -16}, {0, -32}, {16, -16}, {32, 0}, {16, 16}, {0, 32},
+ {-16, 16}, {-32, 0}},
+ {{-32, -32}, {0, -64}, {32, -32}, {64, 0}, {32, 32}, {0, 64},
+ {-32, 32}, {-64, 0}},
+ {{-64, -64}, {0, -128}, {64, -64}, {128, 0}, {64, 64}, {0, 128},
+ {-64, 64}, {-128, 0}},
+ {{-128, -128}, {0, -256}, {128, -128}, {256, 0}, {128, 128}, {0, 256},
+ {-128, 128}, {-256, 0}},
+ {{-256, -256}, {0, -512}, {256, -256}, {512, 0}, {256, 256}, {0, 512},
+ {-256, 256}, {-512, 0}},
+ {{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024},
+ {-512, 512}, {-1024, 0}},
+ };
+ return
+ vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
+ do_init_search, 0, vfp, use_mvcost,
+ center_mv, best_mv,
+ bigdia_num_candidates, bigdia_candidates);
}
+
+int vp9_square_search(MACROBLOCK *x,
+ int_mv *ref_mv,
+ int search_param,
+ int sad_per_bit,
+ int do_init_search,
+ const vp9_variance_fn_ptr_t *vfp,
+ int use_mvcost,
+ int_mv *center_mv,
+ int_mv *best_mv) {
+  // All scales have the 8 closest points in a square shape.
+ static const int square_num_candidates[MAX_PATTERN_SCALES] = {
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+ // Note that the largest candidate step at each scale is 2^scale
+ static const MV square_candidates[MAX_PATTERN_SCALES]
+ [MAX_PATTERN_CANDIDATES] = {
+ {{-1, -1}, {0, -1}, {1, -1}, {1, 0}, {1, 1}, {0, 1}, {-1, 1}, {-1, 0}},
+ {{-2, -2}, {0, -2}, {2, -2}, {2, 0}, {2, 2}, {0, 2}, {-2, 2}, {-2, 0}},
+ {{-4, -4}, {0, -4}, {4, -4}, {4, 0}, {4, 4}, {0, 4}, {-4, 4}, {-4, 0}},
+ {{-8, -8}, {0, -8}, {8, -8}, {8, 0}, {8, 8}, {0, 8}, {-8, 8}, {-8, 0}},
+ {{-16, -16}, {0, -16}, {16, -16}, {16, 0}, {16, 16}, {0, 16},
+ {-16, 16}, {-16, 0}},
+ {{-32, -32}, {0, -32}, {32, -32}, {32, 0}, {32, 32}, {0, 32},
+ {-32, 32}, {-32, 0}},
+ {{-64, -64}, {0, -64}, {64, -64}, {64, 0}, {64, 64}, {0, 64},
+ {-64, 64}, {-64, 0}},
+ {{-128, -128}, {0, -128}, {128, -128}, {128, 0}, {128, 128}, {0, 128},
+ {-128, 128}, {-128, 0}},
+ {{-256, -256}, {0, -256}, {256, -256}, {256, 0}, {256, 256}, {0, 256},
+ {-256, 256}, {-256, 0}},
+ {{-512, -512}, {0, -512}, {512, -512}, {512, 0}, {512, 512}, {0, 512},
+ {-512, 512}, {-512, 0}},
+ {{-1024, -1024}, {0, -1024}, {1024, -1024}, {1024, 0}, {1024, 1024},
+ {0, 1024}, {-1024, 1024}, {-1024, 0}},
+ };
+ return
+ vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
+ do_init_search, 0, vfp, use_mvcost,
+ center_mv, best_mv,
+ square_num_candidates, square_candidates);
+}
+
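All three wrappers share vp9_pattern_search's contract, and the mbgraph hunk earlier in this change shows a typical call. The same call, with the two new flag arguments annotated:

best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit,
                          0,        /* do_init_search: skip the scale scan */
                          &v_fn_ptr,
                          0,        /* use_mvcost: no MV rate term here */
                          ref_mv, dst_mv);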
#undef CHECK_BOUNDS
#undef CHECK_POINT
#undef CHECK_BETTER
@@ -1808,7 +1537,7 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
int in_what_stride = xd->plane[0].pre[0].stride;
int mv_stride = xd->plane[0].pre[0].stride;
uint8_t *bestaddress;
- int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0];
+ int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0];
int_mv this_mv;
int bestsad = INT_MAX;
int r, c;
@@ -1844,18 +1573,12 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
+ mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost,
sad_per_bit);
- // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
- if (col_min < x->mv_col_min)
- col_min = x->mv_col_min;
-
- if (col_max > x->mv_col_max)
- col_max = x->mv_col_max;
-
- if (row_min < x->mv_row_min)
- row_min = x->mv_row_min;
-
- if (row_max > x->mv_row_max)
- row_max = x->mv_row_max;
+  // Apply further limits to prevent us from using vectors that stretch
+  // beyond the UMV border.
+ col_min = MAX(col_min, x->mv_col_min);
+ col_max = MIN(col_max, x->mv_col_max);
+ row_min = MAX(row_min, x->mv_row_min);
+ row_max = MIN(row_max, x->mv_row_max);
for (r = row_min; r < row_max; r++) {
this_mv.as_mv.row = r;
@@ -1902,7 +1625,7 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
int in_what_stride = xd->plane[0].pre[0].stride;
int mv_stride = xd->plane[0].pre[0].stride;
uint8_t *bestaddress;
- int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0];
+ int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0];
int_mv this_mv;
unsigned int bestsad = INT_MAX;
int r, c;
@@ -1940,18 +1663,12 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
+ mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost,
sad_per_bit);
- // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
- if (col_min < x->mv_col_min)
- col_min = x->mv_col_min;
-
- if (col_max > x->mv_col_max)
- col_max = x->mv_col_max;
-
- if (row_min < x->mv_row_min)
- row_min = x->mv_row_min;
-
- if (row_max > x->mv_row_max)
- row_max = x->mv_row_max;
+  // Apply further limits to prevent us from using vectors that stretch
+  // beyond the UMV border.
+ col_min = MAX(col_min, x->mv_col_min);
+ col_max = MIN(col_max, x->mv_col_max);
+ row_min = MAX(row_min, x->mv_row_min);
+ row_max = MIN(row_max, x->mv_row_max);
for (r = row_min; r < row_max; r++) {
this_mv.as_mv.row = r;
@@ -2030,7 +1747,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
int in_what_stride = xd->plane[0].pre[0].stride;
int mv_stride = xd->plane[0].pre[0].stride;
uint8_t *bestaddress;
- int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0];
+ int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0];
int_mv this_mv;
unsigned int bestsad = INT_MAX;
int r, c;
@@ -2069,18 +1786,12 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
+ mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost,
sad_per_bit);
- // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
- if (col_min < x->mv_col_min)
- col_min = x->mv_col_min;
-
- if (col_max > x->mv_col_max)
- col_max = x->mv_col_max;
-
- if (row_min < x->mv_row_min)
- row_min = x->mv_row_min;
-
- if (row_max > x->mv_row_max)
- row_max = x->mv_row_max;
+  // Apply further limits to prevent us from using vectors that stretch
+  // beyond the UMV border.
+ col_min = MAX(col_min, x->mv_col_min);
+ col_max = MIN(col_max, x->mv_col_max);
+ row_min = MAX(row_min, x->mv_row_min);
+ row_max = MIN(row_max, x->mv_row_max);
for (r = row_min; r < row_max; r++) {
this_mv.as_mv.row = r;
@@ -2113,7 +1824,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
}
}
- while ((c + 2) < col_max) {
+ while ((c + 2) < col_max && fn_ptr->sdx3f != NULL) {
int i;
fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
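The new fn_ptr->sdx3f != NULL guard lets targets without the batched three-point SAD kernel skip that loop and fall through to the per-point path. A minimal standalone version of the pattern (names are stand-ins, not libvpx's):

#include <stddef.h>
#include <stdio.h>

typedef int (*sad_fn)(const unsigned char *a, const unsigned char *b, int n);

static int sad_c(const unsigned char *a, const unsigned char *b, int n) {
  int i, s = 0;
  for (i = 0; i < n; ++i)
    s += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
  return s;
}

int main(void) {
  const unsigned char a[4] = {1, 2, 3, 4}, b[4] = {4, 3, 2, 1};
  sad_fn batched = NULL;  /* optimized kernel absent on this target */
  /* Guard the optional pointer, as the while-loop condition above does. */
  const int s = (batched != NULL) ? batched(a, b, 4) : sad_c(a, b, 4);
  printf("sad = %d\n", s);
  return 0;
}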
diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h
index 097d33c..3598fa0 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/libvpx/vp9/encoder/vp9_mcomp.h
@@ -23,7 +23,7 @@
// Maximum size of the first step in full pel units
#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
-void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);
+void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv);
int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost,
int *mvcost[2], int weight);
void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
@@ -40,19 +40,61 @@ int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x,
int_mv *ref_mv, int_mv *dst_mv);
int vp9_hex_search(MACROBLOCK *x,
- int_mv *ref_mv, int_mv *best_mv,
- int search_param, int error_per_bit,
+ int_mv *ref_mv,
+ int search_param,
+ int error_per_bit,
+ int do_init_search,
const vp9_variance_fn_ptr_t *vf,
- int *mvjsadcost, int *mvsadcost[2],
- int *mvjcost, int *mvcost[2],
- int_mv *center_mv);
+ int use_mvcost,
+ int_mv *center_mv,
+ int_mv *best_mv);
+int vp9_bigdia_search(MACROBLOCK *x,
+ int_mv *ref_mv,
+ int search_param,
+ int error_per_bit,
+ int do_init_search,
+ const vp9_variance_fn_ptr_t *vf,
+ int use_mvcost,
+ int_mv *center_mv,
+ int_mv *best_mv);
+int vp9_square_search(MACROBLOCK *x,
+ int_mv *ref_mv,
+ int search_param,
+ int error_per_bit,
+ int do_init_search,
+ const vp9_variance_fn_ptr_t *vf,
+ int use_mvcost,
+ int_mv *center_mv,
+ int_mv *best_mv);
-typedef int (fractional_mv_step_fp) (MACROBLOCK *x, int_mv
- *bestmv, int_mv *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp,
- int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse);
-extern fractional_mv_step_fp vp9_find_best_sub_pixel_step_iteratively;
-extern fractional_mv_step_fp vp9_find_best_sub_pixel_step;
-extern fractional_mv_step_fp vp9_find_best_half_pixel_step;
+typedef int (fractional_mv_step_fp) (
+ MACROBLOCK *x,
+ int_mv *bestmv,
+ int_mv *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
+ int iters_per_step,
+ int *mvjcost,
+ int *mvcost[2],
+ int *distortion,
+ unsigned int *sse);
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_iterative;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
+
+typedef int (fractional_mv_step_comp_fp) (
+ MACROBLOCK *x,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
+ int iters_per_step,
+ int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1,
+ const uint8_t *second_pred,
+ int w, int h);
+extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_iterative;
+extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree;
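The fractional_mv_step_fp and fractional_mv_step_comp_fp typedefs exist so a speed feature can choose between the iterative and tree routines once, at setup time. A standalone sketch of that dispatch with stub signatures (the selector below is an assumption, not the encoder's actual field names):

#include <stdio.h>

typedef int (*frac_step_fn)(int forced_stop, int iters_per_step);

static int step_iterative(int fs, int ips) { return fs + ips; }  /* stub */
static int step_tree(int fs, int ips) { return fs * ips; }       /* stub */

int main(void) {
  const int use_tree = 1;  /* stand-in for a speed-feature setting */
  frac_step_fn find_fractional_mv_step = use_tree ? step_tree
                                                  : step_iterative;
  /* Call sites never change; only the pointer assignment does. */
  printf("%d\n", find_fractional_mv_step(0, 2));
  return 0;
}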
typedef int (*vp9_full_search_fn_t)(MACROBLOCK *x,
int_mv *ref_mv, int sad_per_bit,
@@ -75,15 +117,6 @@ typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x,
int *mvjcost, int *mvcost[2],
int_mv *center_mv);
-int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
- int_mv *bestmv, int_mv *ref_mv,
- int error_per_bit,
- const vp9_variance_fn_ptr_t *vfp,
- int *mvjcost, int *mvcost[2],
- int *distortion, unsigned int *sse1,
- const uint8_t *second_pred,
- int w, int h);
-
int vp9_refining_search_8p_c(MACROBLOCK *x,
int_mv *ref_mv, int error_per_bit,
int search_range, vp9_variance_fn_ptr_t *fn_ptr,
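
The widened fractional_mv_step_fp signature threads two new controls through every sub-pel searcher: forced_stop (per the comment in the typedef, 0 = full refinement, 1 = stop after quarter-pel, 2 = stop after half-pel) and iters_per_step. A hedged sketch of a call site, assuming this header is in scope and the caller has set up the cost tables and variance fn_ptr as usual (refine_subpel itself is an illustrative wrapper, not libvpx API):

    static int refine_subpel(MACROBLOCK *x, int_mv *bestmv, int_mv *ref_mv,
                             int error_per_bit,
                             const vp9_variance_fn_ptr_t *vfp,
                             int iters_per_step,
                             int *mvjcost, int *mvcost[2]) {
      int distortion;
      unsigned int sse;
      fractional_mv_step_fp *step = vp9_find_best_sub_pixel_tree;
      /* iters_per_step comes from sf->subpel_iters_per_step: 2 by default,
       * 1 at the higher speed levels set in vp9_onyx_if.c below. */
      return step(x, bestmv, ref_mv, error_per_bit, vfp,
                  0 /* forced_stop: full refinement */, iters_per_step,
                  mvjcost, mvcost, &distortion, &sse);
    }
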
diff --git a/libvpx/vp9/encoder/vp9_modecosts.c b/libvpx/vp9/encoder/vp9_modecosts.c
index 993aba7..a5dfaed 100644
--- a/libvpx/vp9/encoder/vp9_modecosts.c
+++ b/libvpx/vp9/encoder/vp9_modecosts.c
@@ -16,28 +16,28 @@
void vp9_init_mode_costs(VP9_COMP *c) {
- VP9_COMMON *x = &c->common;
+ VP9_COMMON *const cm = &c->common;
const vp9_tree_p KT = vp9_intra_mode_tree;
int i, j;
- for (i = 0; i < VP9_INTRA_MODES; i++) {
- for (j = 0; j < VP9_INTRA_MODES; j++) {
+ for (i = 0; i < INTRA_MODES; i++) {
+ for (j = 0; j < INTRA_MODES; j++) {
vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
KT);
}
}
// TODO(rbultje) separate tables for superblock costing?
- vp9_cost_tokens(c->mb.mbmode_cost, x->fc.y_mode_prob[1],
+ vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1],
vp9_intra_mode_tree);
vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
- x->fc.uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree);
+ cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
- vp9_kf_uv_mode_prob[VP9_INTRA_MODES - 1],
+ vp9_kf_uv_mode_prob[INTRA_MODES - 1],
vp9_intra_mode_tree);
- for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
+ for (i = 0; i <= SWITCHABLE_FILTERS; ++i)
vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
- x->fc.switchable_interp_prob[i],
+ cm->fc.switchable_interp_prob[i],
vp9_switchable_interp_tree);
}
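
For context, vp9_cost_tokens walks the given tree and turns the per-branch probabilities into per-token bit costs, so the rd loop pays a table lookup instead of a logarithm. Conceptually the per-branch cost is -log2 of the branch probability; the sketch below assumes costs are kept in 1/256-bit units (the real vp9_prob_cost table is precomputed rather than evaluated per call):

    #include <math.h>

    /* Cost of taking the zero branch of a binary node whose probability is
     * p/256; a token's total cost is the sum of the branch costs along its
     * path through the tree. */
    static int prob_cost(int p /* 1..255 */) {
      return (int)(-256.0 * log2(p / 256.0) + 0.5);
    }
    /* e.g. prob_cost(128) == 256 (exactly one bit); prob_cost(230) is
     * about 40, i.e. a highly probable branch is nearly free. */
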
diff --git a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_onyx_if.c
index db03995..883b31e 100644
--- a/libvpx/vp9/encoder/vp9_onyx_if.c
+++ b/libvpx/vp9/encoder/vp9_onyx_if.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -28,7 +28,7 @@
#include "vp9/encoder/vp9_segmentation.h"
#include "./vp9_rtcd.h"
#include "./vpx_scale_rtcd.h"
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
#include "vp9/common/vp9_postproc.h"
#endif
#include "vpx_mem/vpx_mem.h"
@@ -49,14 +49,10 @@
extern void print_tree_update_probs();
-static void set_default_lf_deltas(VP9_COMP *cpi);
+static void set_default_lf_deltas(struct loopfilter *lf);
#define DEFAULT_INTERP_FILTER SWITCHABLE
-#define SEARCH_BEST_FILTER 0 /* to search exhaustively for
- best filter */
-#define RESET_FOREACH_FILTER 0 /* whether to reset the encoder state
- before trying each new filter */
#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */
#define ALTREF_HIGH_PRECISION_MV 1 /* whether to use high precision mv
@@ -98,15 +94,11 @@ FILE *keyfile;
#ifdef ENTROPY_STATS
-extern int intra_mode_stats[VP9_INTRA_MODES]
- [VP9_INTRA_MODES]
- [VP9_INTRA_MODES];
+extern int intra_mode_stats[INTRA_MODES]
+ [INTRA_MODES]
+ [INTRA_MODES];
#endif
-#ifdef NMV_STATS
-extern void init_nmvstats();
-extern void print_nmvstats();
-#endif
#ifdef MODE_STATS
extern void init_tx_count_stats();
extern void write_tx_count_stats();
@@ -241,10 +233,9 @@ void vp9_initialize_enc() {
}
}
-static void setup_features(VP9_COMP *cpi) {
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
- struct loopfilter *const lf = &xd->lf;
- struct segmentation *const seg = &xd->seg;
+static void setup_features(VP9_COMMON *cm) {
+ struct loopfilter *const lf = &cm->lf;
+ struct segmentation *const seg = &cm->seg;
// Set up default state for MB feature flags
seg->enabled = 0;
@@ -262,7 +253,7 @@ static void setup_features(VP9_COMP *cpi) {
vp9_zero(lf->last_ref_deltas);
vp9_zero(lf->last_mode_deltas);
- set_default_lf_deltas(cpi);
+ set_default_lf_deltas(lf);
}
static void dealloc_compressor_data(VP9_COMP *cpi) {
@@ -324,8 +315,7 @@ static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
static void configure_static_seg_features(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
- struct segmentation *seg = &xd->seg;
+ struct segmentation *seg = &cm->seg;
int high_q = (int)(cpi->avg_q > 48.0);
int qi_delta;
@@ -450,9 +440,9 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
void vp9_update_mode_context_stats(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
int i, j;
- unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] =
+ unsigned int (*inter_mode_counts)[INTER_MODES - 1][2] =
cm->fc.inter_mode_counts;
- int64_t (*mv_ref_stats)[VP9_INTER_MODES - 1][2] = cpi->mv_ref_stats;
+ int64_t (*mv_ref_stats)[INTER_MODES - 1][2] = cpi->mv_ref_stats;
FILE *f;
// Read the past stats counters
@@ -466,7 +456,7 @@ void vp9_update_mode_context_stats(VP9_COMP *cpi) {
// Add in the values for this frame
for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
- for (j = 0; j < VP9_INTER_MODES - 1; j++) {
+ for (j = 0; j < INTER_MODES - 1; j++) {
mv_ref_stats[i][j][0] += (int64_t)inter_mode_counts[i][j][0];
mv_ref_stats[i][j][1] += (int64_t)inter_mode_counts[i][j][1];
}
@@ -485,12 +475,12 @@ void print_mode_context(VP9_COMP *cpi) {
fprintf(f, "#include \"vp9_entropy.h\"\n");
fprintf(
f,
- "const int inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1] =");
+ "const int inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1] =");
fprintf(f, "{\n");
for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
fprintf(f, " {/* %d */ ", j);
fprintf(f, " ");
- for (i = 0; i < VP9_INTER_MODES - 1; i++) {
+ for (i = 0; i < INTER_MODES - 1; i++) {
int this_prob;
int64_t count = cpi->mv_ref_stats[j][i][0] + cpi->mv_ref_stats[j][i][1];
if (count)
@@ -533,22 +523,20 @@ static void print_seg_map(VP9_COMP *cpi) {
static void update_reference_segmentation_map(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
int row, col;
- MODE_INFO *mi, *mi_ptr = cm->mi;
+ MODE_INFO **mi_8x8, **mi_8x8_ptr = cm->mi_grid_visible;
uint8_t *cache_ptr = cm->last_frame_seg_map, *cache;
for (row = 0; row < cm->mi_rows; row++) {
- mi = mi_ptr;
+ mi_8x8 = mi_8x8_ptr;
cache = cache_ptr;
- for (col = 0; col < cm->mi_cols; col++, mi++, cache++)
- cache[0] = mi->mbmi.segment_id;
- mi_ptr += cm->mode_info_stride;
+ for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++)
+ cache[0] = mi_8x8[0]->mbmi.segment_id;
+ mi_8x8_ptr += cm->mode_info_stride;
cache_ptr += cm->mi_cols;
}
}
-static void set_default_lf_deltas(VP9_COMP *cpi) {
- struct loopfilter *lf = &cpi->mb.e_mbd.lf;
-
+static void set_default_lf_deltas(struct loopfilter *lf) {
lf->mode_ref_delta_enabled = 1;
lf->mode_ref_delta_update = 1;
@@ -565,9 +553,8 @@ static void set_default_lf_deltas(VP9_COMP *cpi) {
lf->mode_deltas[1] = 0; // New mv
}
-static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) {
+static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
SPEED_FEATURES *sf = &cpi->sf;
- int speed_multiplier = speed + 1;
int i;
// Set baseline threshold values
@@ -578,46 +565,46 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) {
sf->thresh_mult[THR_NEARESTG] = 0;
sf->thresh_mult[THR_NEARESTA] = 0;
- sf->thresh_mult[THR_NEWMV] += speed_multiplier * 1000;
- sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1000;
- sf->thresh_mult[THR_NEARMV] += speed_multiplier * 1000;
- sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1000;
-
- sf->thresh_mult[THR_DC] += speed_multiplier * 1000;
-
- sf->thresh_mult[THR_NEWG] += speed_multiplier * 1000;
- sf->thresh_mult[THR_NEWA] += speed_multiplier * 1000;
- sf->thresh_mult[THR_NEARA] += speed_multiplier * 1000;
-
- sf->thresh_mult[THR_TM] += speed_multiplier * 1000;
-
- sf->thresh_mult[THR_COMP_NEARLA] += speed_multiplier * 1500;
- sf->thresh_mult[THR_COMP_NEWLA] += speed_multiplier * 2000;
- sf->thresh_mult[THR_NEARG] += speed_multiplier * 1000;
- sf->thresh_mult[THR_COMP_NEARGA] += speed_multiplier * 1500;
- sf->thresh_mult[THR_COMP_NEWGA] += speed_multiplier * 2000;
-
- sf->thresh_mult[THR_SPLITMV] += speed_multiplier * 2500;
- sf->thresh_mult[THR_SPLITG] += speed_multiplier * 2500;
- sf->thresh_mult[THR_SPLITA] += speed_multiplier * 2500;
- sf->thresh_mult[THR_COMP_SPLITLA] += speed_multiplier * 4500;
- sf->thresh_mult[THR_COMP_SPLITGA] += speed_multiplier * 4500;
-
- sf->thresh_mult[THR_ZEROMV] += speed_multiplier * 2000;
- sf->thresh_mult[THR_ZEROG] += speed_multiplier * 2000;
- sf->thresh_mult[THR_ZEROA] += speed_multiplier * 2000;
- sf->thresh_mult[THR_COMP_ZEROLA] += speed_multiplier * 2500;
- sf->thresh_mult[THR_COMP_ZEROGA] += speed_multiplier * 2500;
-
- sf->thresh_mult[THR_B_PRED] += speed_multiplier * 2500;
- sf->thresh_mult[THR_H_PRED] += speed_multiplier * 2000;
- sf->thresh_mult[THR_V_PRED] += speed_multiplier * 2000;
- sf->thresh_mult[THR_D45_PRED ] += speed_multiplier * 2500;
- sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 2500;
- sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 2500;
- sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 2500;
- sf->thresh_mult[THR_D27_PRED] += speed_multiplier * 2500;
- sf->thresh_mult[THR_D63_PRED] += speed_multiplier * 2500;
+ sf->thresh_mult[THR_NEWMV] += 1000;
+ sf->thresh_mult[THR_COMP_NEARESTLA] += 1000;
+ sf->thresh_mult[THR_NEARMV] += 1000;
+ sf->thresh_mult[THR_COMP_NEARESTGA] += 1000;
+
+ sf->thresh_mult[THR_DC] += 1000;
+
+ sf->thresh_mult[THR_NEWG] += 1000;
+ sf->thresh_mult[THR_NEWA] += 1000;
+ sf->thresh_mult[THR_NEARA] += 1000;
+
+ sf->thresh_mult[THR_TM] += 1000;
+
+ sf->thresh_mult[THR_COMP_NEARLA] += 1500;
+ sf->thresh_mult[THR_COMP_NEWLA] += 2000;
+ sf->thresh_mult[THR_NEARG] += 1000;
+ sf->thresh_mult[THR_COMP_NEARGA] += 1500;
+ sf->thresh_mult[THR_COMP_NEWGA] += 2000;
+
+ sf->thresh_mult[THR_SPLITMV] += 2500;
+ sf->thresh_mult[THR_SPLITG] += 2500;
+ sf->thresh_mult[THR_SPLITA] += 2500;
+ sf->thresh_mult[THR_COMP_SPLITLA] += 4500;
+ sf->thresh_mult[THR_COMP_SPLITGA] += 4500;
+
+ sf->thresh_mult[THR_ZEROMV] += 2000;
+ sf->thresh_mult[THR_ZEROG] += 2000;
+ sf->thresh_mult[THR_ZEROA] += 2000;
+ sf->thresh_mult[THR_COMP_ZEROLA] += 2500;
+ sf->thresh_mult[THR_COMP_ZEROGA] += 2500;
+
+ sf->thresh_mult[THR_B_PRED] += 2500;
+ sf->thresh_mult[THR_H_PRED] += 2000;
+ sf->thresh_mult[THR_V_PRED] += 2000;
+ sf->thresh_mult[THR_D45_PRED ] += 2500;
+ sf->thresh_mult[THR_D135_PRED] += 2500;
+ sf->thresh_mult[THR_D117_PRED] += 2500;
+ sf->thresh_mult[THR_D153_PRED] += 2500;
+ sf->thresh_mult[THR_D207_PRED] += 2500;
+ sf->thresh_mult[THR_D63_PRED] += 2500;
if (cpi->sf.skip_lots_of_modes) {
for (i = 0; i < MAX_MODES; ++i)
@@ -713,9 +700,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->search_method = NSTEP;
sf->auto_filter = 1;
sf->recode_loop = 1;
- sf->quarter_pixel_search = 1;
- sf->half_pixel_search = 1;
- sf->iterative_sub_pixel = 1;
+ sf->subpel_search_method = SUBPEL_TREE;
+ sf->subpel_iters_per_step = 2;
sf->optimize_coefficients = !cpi->oxcf.lossless;
sf->reduce_first_step_size = 0;
sf->auto_mv_step_size = 0;
@@ -724,11 +710,11 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->adaptive_rd_thresh = 0;
sf->use_lastframe_partitioning = 0;
sf->tx_size_search_method = USE_FULL_RD;
- sf->use_8tap_always = 0;
+ sf->use_lp32x32fdct = 0;
+ sf->adaptive_motion_search = 0;
sf->use_avoid_tested_higherror = 0;
sf->reference_masking = 0;
sf->skip_lots_of_modes = 0;
- sf->adjust_thresholds_by_speed = 0;
sf->partition_by_variance = 0;
sf->use_one_partition_size_always = 0;
sf->less_rectangular_check = 0;
@@ -736,22 +722,23 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->auto_min_max_partition_size = 0;
sf->auto_min_max_partition_interval = 0;
sf->auto_min_max_partition_count = 0;
- // sf->use_max_partition_size = 0;
sf->max_partition_size = BLOCK_64X64;
- // sf->use_min_partition_size = 0;
sf->min_partition_size = BLOCK_4X4;
sf->adjust_partitioning_from_last_frame = 0;
sf->last_partitioning_redo_frequency = 4;
sf->disable_splitmv = 0;
sf->mode_search_skip_flags = 0;
- sf->last_chroma_intra_mode = TM_PRED;
+ sf->disable_split_var_thresh = 0;
+ sf->disable_filter_search_var_thresh = 0;
+ sf->intra_y_mode_mask = ALL_INTRA_MODES;
+ sf->intra_uv_mode_mask = ALL_INTRA_MODES;
sf->use_rd_breakout = 0;
sf->skip_encode_sb = 0;
sf->use_uv_intra_rd_estimate = 0;
+ sf->use_fast_lpf_pick = 0;
+ sf->use_fast_coef_updates = 0;
sf->using_small_partition_info = 0;
- // Skip any mode not chosen at size < X for all sizes > X
- // Hence BLOCK_64X64 (skip is off)
- sf->unused_mode_skip_lvl = BLOCK_64X64;
+  sf->mode_skip_start = MAX_MODES;  // Mode index from which the mode skip mask is applied

#if CONFIG_MULTIPLE_ARF
// Switch segmentation off.
@@ -762,7 +749,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
switch (mode) {
case 0: // best quality mode
- sf->search_best_filter = SEARCH_BEST_FILTER;
break;
case 1:
@@ -773,9 +759,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->static_segmentation = 0;
#endif
sf->use_avoid_tested_higherror = 1;
- sf->adaptive_rd_thresh = 1;
+ sf->adaptive_rd_thresh = MIN((speed + 1), 4);
+
if (speed == 1) {
- sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->less_rectangular_check = 1;
sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
cpi->common.intra_only ||
@@ -787,7 +774,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
cpi->common.show_frame == 0);
sf->disable_splitmv =
(MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
- sf->unused_mode_skip_lvl = BLOCK_32X32;
sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
@@ -795,22 +781,30 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->use_uv_intra_rd_estimate = 1;
sf->use_rd_breakout = 1;
sf->skip_encode_sb = 1;
+ sf->use_lp32x32fdct = 1;
+ sf->adaptive_motion_search = 1;
sf->auto_mv_step_size = 1;
sf->auto_min_max_partition_size = 1;
- // sf->use_max_partition_size = 1;
- // sf->use_min_partition_size = 1;
sf->auto_min_max_partition_interval = 1;
+      // FIXME(jingning): disable_split_var_thresh is temporarily turned off
+      // during the refactoring process and will be restored once the main
+      // partition search framework is finished.
+ sf->disable_split_var_thresh = 0;
+ sf->disable_filter_search_var_thresh = 16;
+
+ sf->intra_y_mode_mask = INTRA_DC_TM_H_V;
+ sf->intra_uv_mode_mask = INTRA_DC_TM_H_V;
+ sf->use_fast_coef_updates = 1;
+ sf->mode_skip_start = 9;
}
if (speed == 2) {
- sf->adjust_thresholds_by_speed = 1;
sf->less_rectangular_check = 1;
sf->use_square_partition_only = 1;
- sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->use_lastframe_partitioning = 1;
sf->adjust_partitioning_from_last_frame = 1;
sf->last_partitioning_redo_frequency = 3;
- sf->unused_mode_skip_lvl = BLOCK_32X32;
sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
cpi->common.intra_only ||
cpi->common.show_frame == 0) ?
@@ -822,17 +816,29 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
FLAG_SKIP_COMP_REFMISMATCH |
FLAG_SKIP_INTRA_LOWVAR |
FLAG_EARLY_TERMINATE;
- sf->last_chroma_intra_mode = DC_PRED;
+ sf->intra_y_mode_mask = INTRA_DC_TM;
+ sf->intra_uv_mode_mask = INTRA_DC_TM;
sf->use_uv_intra_rd_estimate = 1;
sf->use_rd_breakout = 1;
sf->skip_encode_sb = 1;
- sf->using_small_partition_info = 1;
+ sf->use_lp32x32fdct = 1;
+ sf->adaptive_motion_search = 1;
+ sf->using_small_partition_info = 0;
sf->disable_splitmv =
(MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
sf->auto_mv_step_size = 1;
+ sf->search_method = SQUARE;
+ sf->subpel_iters_per_step = 1;
+ sf->use_fast_lpf_pick = 1;
+ sf->auto_min_max_partition_size = 1;
+ sf->auto_min_max_partition_interval = 2;
+ sf->disable_split_var_thresh = 32;
+ sf->disable_filter_search_var_thresh = 32;
+ sf->use_fast_coef_updates = 2;
+ sf->mode_skip_start = 9;
}
if (speed == 3) {
- sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->partition_by_variance = 1;
sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
cpi->common.intra_only ||
@@ -847,11 +853,20 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
FLAG_EARLY_TERMINATE;
sf->use_rd_breakout = 1;
sf->skip_encode_sb = 1;
+ sf->use_lp32x32fdct = 1;
sf->disable_splitmv = 1;
sf->auto_mv_step_size = 1;
+ sf->search_method = BIGDIA;
+ sf->subpel_iters_per_step = 1;
+ sf->disable_split_var_thresh = 64;
+ sf->disable_filter_search_var_thresh = 64;
+ sf->intra_y_mode_mask = INTRA_DC_ONLY;
+ sf->intra_uv_mode_mask = INTRA_DC_ONLY;
+ sf->use_fast_coef_updates = 2;
+ sf->mode_skip_start = 9;
}
if (speed == 4) {
- sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->use_one_partition_size_always = 1;
sf->always_this_block_size = BLOCK_16X16;
sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
@@ -866,37 +881,28 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
FLAG_SKIP_INTRA_LOWVAR |
FLAG_EARLY_TERMINATE;
sf->use_rd_breakout = 1;
+ sf->use_lp32x32fdct = 1;
sf->optimize_coefficients = 0;
sf->auto_mv_step_size = 1;
// sf->reduce_first_step_size = 1;
// sf->reference_masking = 1;
sf->disable_splitmv = 1;
+ sf->search_method = HEX;
+ sf->subpel_iters_per_step = 1;
+ sf->disable_split_var_thresh = 64;
+ sf->disable_filter_search_var_thresh = 96;
+ sf->intra_y_mode_mask = INTRA_DC_ONLY;
+ sf->intra_uv_mode_mask = INTRA_DC_ONLY;
+ sf->use_fast_coef_updates = 2;
+ sf->mode_skip_start = 9;
}
- /*
- if (speed == 2) {
- sf->first_step = 0;
- sf->comp_inter_joint_search_thresh = BLOCK_8X8;
- sf->use_max_partition_size = 1;
- sf->max_partition_size = BLOCK_16X16;
- }
- if (speed == 3) {
- sf->first_step = 0;
- sf->comp_inter_joint_search_thresh = BLOCK_B8X8;
- sf->use_min_partition_size = 1;
- sf->min_partition_size = BLOCK_8X8;
- }
- */
-
break;
}; /* switch */
// Set rd thresholds based on mode and speed setting
- if (cpi->sf.adjust_thresholds_by_speed)
- set_rd_speed_thresholds(cpi, mode, speed);
- else
- set_rd_speed_thresholds(cpi, mode, 0);
+ set_rd_speed_thresholds(cpi, mode);
// Slow quant, dct and trellis not worthwhile for first pass
// so make sure they are always turned off.
@@ -915,12 +921,12 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
cpi->mb.quantize_b_4x4 = vp9_regular_quantize_b_4x4;
- if (cpi->sf.iterative_sub_pixel == 1) {
- cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step_iteratively;
- } else if (cpi->sf.quarter_pixel_search) {
- cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step;
- } else if (cpi->sf.half_pixel_search) {
- cpi->find_fractional_mv_step = vp9_find_best_half_pixel_step;
+ if (cpi->sf.subpel_search_method == SUBPEL_ITERATIVE) {
+ cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_iterative;
+ cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_iterative;
+ } else if (cpi->sf.subpel_search_method == SUBPEL_TREE) {
+ cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
+ cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_tree;
}
cpi->mb.optimize = cpi->sf.optimize_coefficients == 1 && cpi->pass != 1;
@@ -1163,6 +1169,9 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
cpi->gld_fb_idx = 1;
cpi->alt_fb_idx = 2;
+ cpi->current_layer = 0;
+ cpi->use_svc = 0;
+
set_tile_limits(cpi);
cpi->fixed_divide[0] = 0;
@@ -1227,7 +1236,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
cm->refresh_frame_context = 1;
cm->reset_frame_context = 0;
- setup_features(cpi);
+ setup_features(cm);
cpi->mb.e_mbd.allow_high_precision_mv = 0; // Default mv precision adaptation
set_mvcost(&cpi->mb);
@@ -1297,7 +1306,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
// VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
cpi->oxcf.Sharpness = MIN(7, cpi->oxcf.Sharpness);
- cpi->mb.e_mbd.lf.sharpness_level = cpi->oxcf.Sharpness;
+ cpi->common.lf.sharpness_level = cpi->oxcf.Sharpness;
if (cpi->initial_width) {
// Increasing the size of the frame beyond the first seen frame, or some
@@ -1382,7 +1391,7 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
}
VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
- int i;
+ int i, j;
volatile union {
VP9_COMP *cpi;
VP9_PTR ptr;
@@ -1433,14 +1442,13 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
cpi->alt_is_last = 0;
cpi->gold_is_alt = 0;
+ // Spatial scalability
+ cpi->number_spatial_layers = oxcf->ss_number_layers;
+
// Create the encoder segmentation map and set all entries to 0
CHECK_MEM_ERROR(cm, cpi->segmentation_map,
vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
- // And a copy in common for temporal coding
- CHECK_MEM_ERROR(cm, cm->last_frame_seg_map,
- vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
-
// And a place holder structure is the coding context
// for use if we want to save and restore it
CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
@@ -1462,9 +1470,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
init_context_counters();
#endif
-#ifdef NMV_STATS
- init_nmvstats();
-#endif
#ifdef MODE_STATS
init_tx_count_stats();
init_switchable_interp_stats();
@@ -1576,6 +1581,8 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
cpi->output_pkt_list = oxcf->output_pkt_list;
+ cpi->enable_encode_breakout = 1;
+
if (cpi->pass == 1) {
vp9_init_first_pass(cpi);
} else if (cpi->pass == 2) {
@@ -1591,9 +1598,10 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
vp9_set_speed_features(cpi);
- // Set starting values of RD threshold multipliers (128 = *1)
- for (i = 0; i < MAX_MODES; i++)
- cpi->rd_thresh_mult[i] = 128;
+ // Default rd threshold factors for mode selection
+ for (i = 0; i < BLOCK_SIZES; ++i)
+ for (j = 0; j < MAX_MODES; ++j)
+ cpi->rd_thresh_freq_fact[i][j] = 32;
#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
SDX3F, SDX8F, SDX4DF)\
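
The new rd_thresh_freq_fact table replaces the flat rd_thresh_mult initialization, and 32 is unity here: the factor is applied in a fixed-point scheme with a shift of 5. A sketch of the scaling (the exact expression in vp9_rdopt.c may differ; effective_rd_thresh is an illustrative name):

    #include <stdint.h>

    /* freq_fact == 32 (1 << 5) leaves the baseline threshold unchanged;
     * larger values make a mode progressively harder to test, which is how
     * the adaptive_rd_thresh speed feature can demote modes that keep
     * losing the rd competition. */
    static int effective_rd_thresh(int base_thresh, int freq_fact) {
      return (int)(((int64_t)base_thresh * freq_fact) >> 5);
    }
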
@@ -1700,12 +1708,16 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
*/
vp9_init_quantizer(cpi);
- vp9_loop_filter_init(cm, &cpi->mb.e_mbd.lf);
+ vp9_loop_filter_init(cm);
cpi->common.error.setjmp = 0;
vp9_zero(cpi->y_uv_mode_count)
+#ifdef MODE_TEST_HIT_STATS
+ vp9_zero(cpi->mode_test_hits)
+#endif
+
return (VP9_PTR) cpi;
}
@@ -1728,10 +1740,7 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
print_mode_context(cpi);
}
#endif
-#ifdef NMV_STATS
- if (cpi->pass != 1)
- print_nmvstats();
-#endif
+
#ifdef MODE_STATS
if (cpi->pass != 1) {
write_tx_count_stats();
@@ -1790,6 +1799,34 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
#endif
+#ifdef MODE_TEST_HIT_STATS
+ if (cpi->pass != 1) {
+ double norm_per_pixel_mode_tests = 0;
+ double norm_counts[BLOCK_SIZES];
+ int i;
+ int sb64_per_frame;
+ int norm_factors[BLOCK_SIZES] =
+ {256, 128, 128, 64, 32, 32, 16, 8, 8, 4, 2, 2, 1};
+ FILE *f = fopen("mode_hit_stats.stt", "a");
+
+ // On average, how many mode tests do we do
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ norm_counts[i] = (double)cpi->mode_test_hits[i] /
+ (double)norm_factors[i];
+ norm_per_pixel_mode_tests += norm_counts[i];
+ }
+ // Convert to a number per 64x64 and per frame
+ sb64_per_frame = ((cpi->common.height + 63) / 64) *
+ ((cpi->common.width + 63) / 64);
+ norm_per_pixel_mode_tests =
+ norm_per_pixel_mode_tests /
+ (double)(cpi->common.current_video_frame * sb64_per_frame);
+
+ fprintf(f, "%6.4f\n", norm_per_pixel_mode_tests);
+ fclose(f);
+ }
+#endif
+
#ifdef ENTROPY_STATS
{
int i, j, k;
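
To make the normalization above concrete: norm_factors holds how many blocks of each size tile one 64x64 superblock (256 4x4s down to a single 64x64), and sb64_per_frame rounds the frame dimensions up to whole superblocks. For a 1920x1080 clip, sb64_per_frame = ((1080 + 63) / 64) * ((1920 + 63) / 64) = 17 * 30 = 510, so 1,040,400 normalized mode tests accumulated over 100 frames would print as 1,040,400 / (100 * 510) = 20.4 tests per superblock per frame.
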
@@ -1797,18 +1834,18 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
fprintf(fmode, "\n#include \"vp9_entropymode.h\"\n\n");
fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
- fprintf(fmode, "[VP9_INTRA_MODES][VP9_INTRA_MODES]"
- "[VP9_INTRA_MODES] =\n{\n");
+ fprintf(fmode, "[INTRA_MODES][INTRA_MODES]"
+ "[INTRA_MODES] =\n{\n");
- for (i = 0; i < VP9_INTRA_MODES; i++) {
+ for (i = 0; i < INTRA_MODES; i++) {
fprintf(fmode, " { // Above Mode : %d\n", i);
- for (j = 0; j < VP9_INTRA_MODES; j++) {
+ for (j = 0; j < INTRA_MODES; j++) {
fprintf(fmode, " {");
- for (k = 0; k < VP9_INTRA_MODES; k++) {
+ for (k = 0; k < INTRA_MODES; k++) {
if (!intra_mode_stats[i][j][k])
fprintf(fmode, " %5d, ", 1);
else
@@ -2214,6 +2251,12 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) {
cpi->source_alt_ref_pending = 1;
cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+
+  // TODO(ivan): for the SVC encoder, automatic GF updates are disabled by
+  // using a large GF interval.

+ if (cpi->use_svc) {
+ cpi->frames_till_gf_update_due = INT_MAX;
+ }
}
if (!cpi->source_alt_ref_pending)
@@ -2379,7 +2422,8 @@ static void update_reference_frames(VP9_COMP * const cpi) {
else if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame &&
!cpi->refresh_alt_ref_frame) {
#else
- else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+ else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame &&
+ !cpi->use_svc) {
#endif
/* Preserve the previously existing golden frame and update the frame in
* the alt ref slot instead. This is highly specific to the current use of
@@ -2424,7 +2468,7 @@ static void update_reference_frames(VP9_COMP * const cpi) {
static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
MACROBLOCKD *xd = &cpi->mb.e_mbd;
- struct loopfilter *lf = &xd->lf;
+ struct loopfilter *lf = &cm->lf;
if (xd->lossless) {
lf->filter_level = 0;
} else {
@@ -2434,7 +2478,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
vpx_usec_timer_start(&timer);
- vp9_pick_filter_level(cpi->Source, cpi);
+ vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.use_fast_lpf_pick);
vpx_usec_timer_mark(&timer);
cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
@@ -2442,7 +2486,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
if (lf->filter_level > 0) {
vp9_set_alt_lf_level(cpi, lf->filter_level);
- vp9_loop_filter_frame(cm, xd, lf->filter_level, 0);
+ vp9_loop_filter_frame(cm, xd, lf->filter_level, 0, 0);
}
vp9_extend_frame_inner_borders(cm->frame_to_show,
@@ -2452,9 +2496,11 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
static void scale_references(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
int i;
+ int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx,
+ cpi->alt_fb_idx};
for (i = 0; i < 3; i++) {
- YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[i]];
+ YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[refs[i]]];
if (ref->y_crop_width != cm->width ||
ref->y_crop_height != cm->height) {
@@ -2467,8 +2513,8 @@ static void scale_references(VP9_COMP *cpi) {
scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]);
cpi->scaled_ref_idx[i] = new_fb;
} else {
- cpi->scaled_ref_idx[i] = cm->ref_frame_map[i];
- cm->fb_idx_ref_cnt[cm->ref_frame_map[i]]++;
+ cpi->scaled_ref_idx[i] = cm->ref_frame_map[refs[i]];
+ cm->fb_idx_ref_cnt[cm->ref_frame_map[refs[i]]]++;
}
}
}
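
This hunk fixes a latent indexing assumption: the old code looked up ref_frame_map[i] for i = 0..2, which is only the LAST/GOLDEN/ALTREF mapping while lst/gld/alt_fb_idx still hold their init_config defaults of 0, 1, 2; the new SVC layer switching (vp9_switch_layer, later in this file) deliberately breaks that invariant. A standalone illustration with made-up slot values:

    #include <stdio.h>

    /* A reference frame is a double indirection:
     * buffer = yv12_fb[ref_frame_map[ref_fb_idx]]. */
    int main(void) {
      int ref_frame_map[8] = { 4, 7, 2, 0, 1, 3, 5, 6 };
      /* After an SVC layer switch these need not be 0, 1, 2: */
      const int refs[3] = { 1 /* lst */, 2 /* gld */, 0 /* alt */ };
      int i;
      for (i = 0; i < 3; i++)
        printf("ref %d -> slot %d -> frame buffer %d\n",
               i, refs[i], ref_frame_map[refs[i]]);
      return 0;
    }
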
@@ -2532,25 +2578,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
SPEED_FEATURES *sf = &cpi->sf;
unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
- struct segmentation *seg = &xd->seg;
-#if RESET_FOREACH_FILTER
- int q_low0;
- int q_high0;
- int Q0;
- int active_best_quality0;
- int active_worst_quality0;
- double rate_correction_factor0;
- double gf_rate_correction_factor0;
-#endif
-
- /* list of filters to search over */
- int mcomp_filters_to_search[] = {
- EIGHTTAP, EIGHTTAP_SHARP, EIGHTTAP_SMOOTH, SWITCHABLE
- };
- int mcomp_filters = sizeof(mcomp_filters_to_search) /
- sizeof(*mcomp_filters_to_search);
- int mcomp_filter_index = 0;
- int64_t mcomp_filter_cost[4];
+ struct segmentation *seg = &cm->seg;
/* Scale the source buffer, if required */
if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
@@ -2603,7 +2631,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
}
// Set default state for segment based loop filter update flags
- xd->lf.mode_ref_delta_update = 0;
+ cm->lf.mode_ref_delta_update = 0;
// Initialize cpi->mv_step_param to default based on max resolution
cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def);
@@ -2626,10 +2654,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// Set various flags etc to special state if it is a key frame
if (cm->frame_type == KEY_FRAME) {
- int i;
-
// Reset the loop filter deltas and segmentation map
- setup_features(cpi);
+ setup_features(cm);
// If segmentation is enabled force a map update for key frames
if (seg->enabled) {
@@ -2640,10 +2666,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// The alternate reference frame cannot be active for a key frame
cpi->source_alt_ref_active = 0;
- // Reset the RD threshold multipliers to default of * 1 (128)
- for (i = 0; i < MAX_MODES; i++)
- cpi->rd_thresh_mult[i] = 128;
-
cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
cm->frame_parallel_decoding_mode =
(cpi->oxcf.frame_parallel_decoding_mode != 0);
@@ -2672,9 +2694,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
if (cm->frame_type == KEY_FRAME) {
#if !CONFIG_MULTIPLE_ARF
- // Special case for key frames forced because we have reached
- // the maximum key frame interval. Here force the Q to a range
- // based on the ambient Q to reduce the risk of popping
+ // Special case for key frames forced because we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping
if (cpi->this_key_frame_forced) {
int delta_qindex;
int qindex = cpi->last_boosted_qindex;
@@ -2683,7 +2705,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
delta_qindex = compute_qdelta(cpi, last_boosted_q,
(last_boosted_q * 0.75));
- cpi->active_best_quality = MAX(qindex + delta_qindex, cpi->best_quality);
+ cpi->active_best_quality = MAX(qindex + delta_qindex,
+ cpi->best_quality);
} else {
int high = 5000;
int low = 400;
@@ -2704,7 +2727,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cpi->active_best_quality = kf_low_motion_minq[q] + adjustment;
}
-
// Allow somewhat lower kf minq with small image formats.
if ((cm->width * cm->height) <= (352 * 288)) {
q_adj_factor -= 0.25;
@@ -2713,14 +2735,14 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// Make a further adjustment based on the kf zero motion measure.
q_adj_factor += 0.05 - (0.001 * (double)cpi->kf_zeromotion_pct);
- // Convert the adjustment factor to a qindex delta on active_best_quality.
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
q_val = vp9_convert_qindex_to_q(cpi->active_best_quality);
cpi->active_best_quality +=
- compute_qdelta(cpi, q_val, (q_val * q_adj_factor));
+ compute_qdelta(cpi, q_val, (q_val * q_adj_factor));
}
#else
double current_q;
-
// Force the KF quantizer to be 30% of the active_worst_quality.
current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality);
cpi->active_best_quality = cpi->active_worst_quality
@@ -2737,13 +2759,11 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cpi->avg_frame_qindex < cpi->active_worst_quality) {
q = cpi->avg_frame_qindex;
}
-
// For constrained quality dont allow Q less than the cq level
if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
q < cpi->cq_target_quality) {
q = cpi->cq_target_quality;
}
-
if (cpi->gfu_boost > high) {
cpi->active_best_quality = gf_low_motion_minq[q];
} else if (cpi->gfu_boost < low) {
@@ -2760,28 +2780,54 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// Constrained quality use slightly lower active best.
if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
cpi->active_best_quality = cpi->active_best_quality * 15 / 16;
+
+ // TODO(debargha): Refine the logic below
+ if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+ if (!cpi->refresh_alt_ref_frame) {
+ cpi->active_best_quality = cpi->cq_target_quality;
+ } else {
+ if (cpi->frames_since_key > 1) {
+ if (cpi->gfu_boost > high) {
+ cpi->active_best_quality = cpi->cq_target_quality * 6 / 16;
+ } else if (cpi->gfu_boost < low) {
+ cpi->active_best_quality = cpi->cq_target_quality * 11 / 16;
+ } else {
+ const int gap = high - low;
+ const int offset = high - cpi->gfu_boost;
+ const int qdiff = cpi->cq_target_quality * 5 / 16;
+ const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+ cpi->active_best_quality = cpi->cq_target_quality * 6 / 16
+ + adjustment;
+ }
+ }
+ }
+ }
} else {
+ if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+ cpi->active_best_quality = cpi->cq_target_quality;
+ } else {
#ifdef ONE_SHOT_Q_ESTIMATE
#ifdef STRICT_ONE_SHOT_Q
- cpi->active_best_quality = q;
+ cpi->active_best_quality = q;
#else
- cpi->active_best_quality = inter_minq[q];
+ cpi->active_best_quality = inter_minq[q];
#endif
#else
- cpi->active_best_quality = inter_minq[q];
+ cpi->active_best_quality = inter_minq[q];
#endif
- // For the constant/constrained quality mode we don't want
- // q to fall below the cq level.
- if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
- (cpi->active_best_quality < cpi->cq_target_quality)) {
- // If we are strongly undershooting the target rate in the last
- // frames then use the user passed in cq value not the auto
- // cq value.
- if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth)
- cpi->active_best_quality = cpi->oxcf.cq_level;
- else
- cpi->active_best_quality = cpi->cq_target_quality;
+ // For the constant/constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ (cpi->active_best_quality < cpi->cq_target_quality)) {
+ // If we are strongly undershooting the target rate in the last
+ // frames then use the user passed in cq value not the auto
+ // cq value.
+ if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth)
+ cpi->active_best_quality = cpi->oxcf.cq_level;
+ else
+ cpi->active_best_quality = cpi->cq_target_quality;
+ }
}
}
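
The constant-quality branch above interpolates active_best_quality for alt-ref frames between 6/16 and 11/16 of cq_target_quality as gfu_boost falls from the high bound to the low one, with (gap >> 1) providing round-to-nearest. A sketch with a worked case (the bounds here are illustrative; the real ones come from the surrounding rate-control code):

    static int cq_active_best(int cq_target, int gfu_boost,
                              int low, int high) {
      if (gfu_boost > high) return cq_target * 6 / 16;
      if (gfu_boost < low)  return cq_target * 11 / 16;
      {
        const int gap = high - low;
        const int offset = high - gfu_boost;
        const int qdiff = cq_target * 5 / 16;
        const int adjustment = (offset * qdiff + (gap >> 1)) / gap;
        return cq_target * 6 / 16 + adjustment;
      }
    }
    /* e.g. cq_active_best(48, 1200, 400, 2000): gap 1600, offset 800,
     * qdiff 15, adjustment (800*15 + 800)/1600 = 8, result 18 + 8 = 26. */
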
@@ -2799,7 +2845,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cpi->active_worst_quality = cpi->active_best_quality;
// Special case code to try and match quality with forced key frames
- if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
+ if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+ q = cpi->active_best_quality;
+ } else if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
q = cpi->last_boosted_qindex;
} else {
// Determine initial Q to try
@@ -2811,7 +2859,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
#if CONFIG_MULTIPLE_ARF
// Force the quantizer determined by the coding order pattern.
- if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME)) {
+ if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
+ cpi->oxcf.end_usage != USAGE_CONSTANT_QUALITY) {
double new_q;
double current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality);
int level = cpi->this_frame_weight;
@@ -2841,19 +2890,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
vp9_zero(cpi->rd_tx_select_threshes);
if (cm->frame_type != KEY_FRAME) {
- /* TODO: Decide this more intelligently */
- if (sf->search_best_filter) {
- cm->mcomp_filter_type = mcomp_filters_to_search[0];
- mcomp_filter_index = 0;
- } else {
- cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
- }
+ cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
/* TODO: Decide this more intelligently */
xd->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH;
set_mvcost(&cpi->mb);
}
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
if (cpi->oxcf.noise_sensitivity > 0) {
int l = 0;
@@ -2886,17 +2929,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
vp9_write_yuv_frame(cpi->Source);
#endif
-#if RESET_FOREACH_FILTER
- if (sf->search_best_filter) {
- q_low0 = q_low;
- q_high0 = q_high;
- Q0 = Q;
- rate_correction_factor0 = cpi->rate_correction_factor;
- gf_rate_correction_factor0 = cpi->gf_rate_correction_factor;
- active_best_quality0 = cpi->active_best_quality;
- active_worst_quality0 = cpi->active_worst_quality;
- }
-#endif
do {
vp9_clear_system_state(); // __asm emms;
@@ -2946,178 +2978,135 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
active_worst_qchanged = 0;
// Special case handling for forced key frames
- if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
- int last_q = q;
- int kf_err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
-
- int high_err_target = cpi->ambient_err;
- int low_err_target = cpi->ambient_err >> 1;
-
- // Prevent possible divide by zero error below for perfect KF
- kf_err += !kf_err;
-
- // The key frame is not good enough or we can afford
- // to make it better without undue risk of popping.
- if ((kf_err > high_err_target &&
- cpi->projected_frame_size <= frame_over_shoot_limit) ||
- (kf_err > low_err_target &&
- cpi->projected_frame_size <= frame_under_shoot_limit)) {
- // Lower q_high
- q_high = q > q_low ? q - 1 : q_low;
-
- // Adjust Q
- q = (q * high_err_target) / kf_err;
- q = MIN(q, (q_high + q_low) >> 1);
- } else if (kf_err < low_err_target &&
- cpi->projected_frame_size >= frame_under_shoot_limit) {
- // The key frame is much better than the previous frame
- // Raise q_low
- q_low = q < q_high ? q + 1 : q_high;
-
- // Adjust Q
- q = (q * low_err_target) / kf_err;
- q = MIN(q, (q_high + q_low + 1) >> 1);
- }
-
- // Clamp Q to upper and lower limits:
- q = clamp(q, q_low, q_high);
-
- loop = q != last_q;
- }
-
- // Is the projected frame size out of range and are we allowed to attempt to recode.
- else if (recode_loop_test(cpi,
- frame_over_shoot_limit, frame_under_shoot_limit,
- q, top_index, bottom_index)) {
- int last_q = q;
- int retries = 0;
+ if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+ loop = 0;
+ } else {
+ if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
+ int last_q = q;
+ int kf_err = vp9_calc_ss_err(cpi->Source,
+ &cm->yv12_fb[cm->new_fb_idx]);
+
+ int high_err_target = cpi->ambient_err;
+ int low_err_target = cpi->ambient_err >> 1;
+
+ // Prevent possible divide by zero error below for perfect KF
+ kf_err += !kf_err;
+
+ // The key frame is not good enough or we can afford
+ // to make it better without undue risk of popping.
+ if ((kf_err > high_err_target &&
+ cpi->projected_frame_size <= frame_over_shoot_limit) ||
+ (kf_err > low_err_target &&
+ cpi->projected_frame_size <= frame_under_shoot_limit)) {
+ // Lower q_high
+ q_high = q > q_low ? q - 1 : q_low;
+
+ // Adjust Q
+ q = (q * high_err_target) / kf_err;
+ q = MIN(q, (q_high + q_low) >> 1);
+ } else if (kf_err < low_err_target &&
+ cpi->projected_frame_size >= frame_under_shoot_limit) {
+ // The key frame is much better than the previous frame
+ // Raise q_low
+ q_low = q < q_high ? q + 1 : q_high;
+
+ // Adjust Q
+ q = (q * low_err_target) / kf_err;
+ q = MIN(q, (q_high + q_low + 1) >> 1);
+ }
- // Frame size out of permitted range:
- // Update correction factor & compute new Q to try...
+ // Clamp Q to upper and lower limits:
+ q = clamp(q, q_low, q_high);
+
+ loop = q != last_q;
+ } else if (recode_loop_test(
+ cpi, frame_over_shoot_limit, frame_under_shoot_limit,
+ q, top_index, bottom_index)) {
+ // Is the projected frame size out of range and are we allowed
+ // to attempt to recode.
+ int last_q = q;
+ int retries = 0;
+
+ // Frame size out of permitted range:
+ // Update correction factor & compute new Q to try...
+
+ // Frame is too large
+ if (cpi->projected_frame_size > cpi->this_frame_target) {
+ // Raise Qlow as to at least the current value
+ q_low = q < q_high ? q + 1 : q_high;
+
+ if (undershoot_seen || loop_count > 1) {
+ // Update rate_correction_factor unless
+ // cpi->active_worst_quality has changed.
+ if (!active_worst_qchanged)
+ vp9_update_rate_correction_factors(cpi, 1);
+
+ q = (q_high + q_low + 1) / 2;
+ } else {
+ // Update rate_correction_factor unless
+ // cpi->active_worst_quality has changed.
+ if (!active_worst_qchanged)
+ vp9_update_rate_correction_factors(cpi, 0);
- // Frame is too large
- if (cpi->projected_frame_size > cpi->this_frame_target) {
- // Raise Qlow as to at least the current value
- q_low = q < q_high ? q + 1 : q_high;
+ q = vp9_regulate_q(cpi, cpi->this_frame_target);
- if (undershoot_seen || loop_count > 1) {
- // Update rate_correction_factor unless cpi->active_worst_quality
- // has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 1);
+ while (q < q_low && retries < 10) {
+ vp9_update_rate_correction_factors(cpi, 0);
+ q = vp9_regulate_q(cpi, cpi->this_frame_target);
+ retries++;
+ }
+ }
- q = (q_high + q_low + 1) / 2;
+ overshoot_seen = 1;
} else {
- // Update rate_correction_factor unless cpi->active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 0);
-
- q = vp9_regulate_q(cpi, cpi->this_frame_target);
+ // Frame is too small
+ q_high = q > q_low ? q - 1 : q_low;
+
+ if (overshoot_seen || loop_count > 1) {
+ // Update rate_correction_factor unless
+ // cpi->active_worst_quality has changed.
+ if (!active_worst_qchanged)
+ vp9_update_rate_correction_factors(cpi, 1);
+
+ q = (q_high + q_low) / 2;
+ } else {
+ // Update rate_correction_factor unless
+ // cpi->active_worst_quality has changed.
+ if (!active_worst_qchanged)
+ vp9_update_rate_correction_factors(cpi, 0);
- while (q < q_low && retries < 10) {
- vp9_update_rate_correction_factors(cpi, 0);
q = vp9_regulate_q(cpi, cpi->this_frame_target);
- retries++;
- }
- }
- overshoot_seen = 1;
- } else {
- // Frame is too small
- q_high = q > q_low ? q - 1 : q_low;
-
- if (overshoot_seen || loop_count > 1) {
- // Update rate_correction_factor unless cpi->active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 1);
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+          // the user passed in value.
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) {
+ q_low = q;
+ }
- q = (q_high + q_low) / 2;
- } else {
- // Update rate_correction_factor unless cpi->active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 0);
-
- q = vp9_regulate_q(cpi, cpi->this_frame_target);
-
- // Special case reset for qlow for constrained quality.
- // This should only trigger where there is very substantial
- // undershoot on a frame and the auto cq level is above
- // the user passsed in value.
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) {
- q_low = q;
+ while (q > q_high && retries < 10) {
+ vp9_update_rate_correction_factors(cpi, 0);
+ q = vp9_regulate_q(cpi, cpi->this_frame_target);
+ retries++;
+ }
}
- while (q > q_high && retries < 10) {
- vp9_update_rate_correction_factors(cpi, 0);
- q = vp9_regulate_q(cpi, cpi->this_frame_target);
- retries++;
- }
+ undershoot_seen = 1;
}
- undershoot_seen = 1;
- }
+ // Clamp Q to upper and lower limits:
+ q = clamp(q, q_low, q_high);
- // Clamp Q to upper and lower limits:
- q = clamp(q, q_low, q_high);
-
- loop = q != last_q;
- } else {
- loop = 0;
+ loop = q != last_q;
+ } else {
+ loop = 0;
+ }
}
if (cpi->is_src_frame_alt_ref)
loop = 0;
- if (!loop && cm->frame_type != KEY_FRAME && sf->search_best_filter) {
- if (mcomp_filter_index < mcomp_filters) {
- int64_t err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
- int64_t rate = cpi->projected_frame_size << 8;
- mcomp_filter_cost[mcomp_filter_index] =
- (RDCOST(cpi->RDMULT, cpi->RDDIV, rate, err));
- mcomp_filter_index++;
- if (mcomp_filter_index < mcomp_filters) {
- cm->mcomp_filter_type = mcomp_filters_to_search[mcomp_filter_index];
- loop_count = -1;
- loop = 1;
- } else {
- int f;
- int64_t best_cost = mcomp_filter_cost[0];
- int mcomp_best_filter = mcomp_filters_to_search[0];
- for (f = 1; f < mcomp_filters; f++) {
- if (mcomp_filter_cost[f] < best_cost) {
- mcomp_best_filter = mcomp_filters_to_search[f];
- best_cost = mcomp_filter_cost[f];
- }
- }
- if (mcomp_best_filter != mcomp_filters_to_search[mcomp_filters - 1]) {
- loop_count = -1;
- loop = 1;
- cm->mcomp_filter_type = mcomp_best_filter;
- }
- /*
- printf(" best filter = %d, ( ", mcomp_best_filter);
- for (f=0;f<mcomp_filters; f++) printf("%d ", mcomp_filter_cost[f]);
- printf(")\n");
- */
- }
-#if RESET_FOREACH_FILTER
- if (loop) {
- overshoot_seen = 0;
- undershoot_seen = 0;
- q_low = q_low0;
- q_high = q_high0;
- q = Q0;
- cpi->rate_correction_factor = rate_correction_factor0;
- cpi->gf_rate_correction_factor = gf_rate_correction_factor0;
- cpi->active_best_quality = active_best_quality0;
- cpi->active_worst_quality = active_worst_quality0;
- }
-#endif
- }
- }
-
if (loop) {
loop_count++;
@@ -3165,7 +3154,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cpi->dummy_packing = 0;
vp9_pack_bitstream(cpi, dest, size);
- if (xd->seg.update_map)
+ if (cm->seg.update_map)
update_reference_segmentation_map(cpi);
release_scaled_references(cpi);
@@ -3296,9 +3285,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// in this frame.
// update_base_skip_probs(cpi);
-#if 0 && CONFIG_INTERNAL_STATS
+#if CONFIG_INTERNAL_STATS
{
- FILE *f = fopen("tmp.stt", "a");
+ FILE *f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
int recon_err;
vp9_clear_system_state(); // __asm emms;
@@ -3307,7 +3296,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
&cm->yv12_fb[cm->new_fb_idx]);
if (cpi->twopass.total_left_stats.coded_error != 0.0)
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
"%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
"%6d %6d %5d %5d %5d %8.2f %10d %10.3f"
"%10.3f %8d %10d %10d %10d\n",
@@ -3317,6 +3306,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
(int)cpi->total_target_vs_actual,
(int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
(int)cpi->total_actual_bits,
+ cm->base_qindex,
vp9_convert_qindex_to_q(cm->base_qindex),
(double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
vp9_convert_qindex_to_q(cpi->active_best_quality),
@@ -3335,7 +3325,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cpi->tot_recode_hits, recon_err, cpi->kf_boost,
cpi->kf_zeromotion_pct);
else
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
"%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
"%5d %5d %5d %8d %8d %8.2f %10d %10.3f"
"%8d %10d %10d %10d\n",
@@ -3346,6 +3336,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
(int)cpi->total_target_vs_actual,
(int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
(int)cpi->total_actual_bits,
+ cm->base_qindex,
vp9_convert_qindex_to_q(cm->base_qindex),
(double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
vp9_convert_qindex_to_q(cpi->active_best_quality),
@@ -3473,9 +3464,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
}
// Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
- xd->seg.update_map = 0;
- xd->seg.update_data = 0;
- xd->lf.mode_ref_delta_update = 0;
+ cm->seg.update_map = 0;
+ cm->seg.update_data = 0;
+ cm->lf.mode_ref_delta_update = 0;
// keep track of the last coded dimensions
cm->last_width = cm->width;
@@ -3486,11 +3477,15 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
if (cm->show_frame) {
// current mip will be the prev_mip for the next frame
MODE_INFO *temp = cm->prev_mip;
+ MODE_INFO **temp2 = cm->prev_mi_grid_base;
cm->prev_mip = cm->mip;
cm->mip = temp;
+ cm->prev_mi_grid_base = cm->mi_grid_base;
+ cm->mi_grid_base = temp2;
// update the upper left visible macroblock ptrs
cm->mi = cm->mip + cm->mode_info_stride + 1;
+ cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1;
// Don't increment frame counters if this was an altref buffer
// update not a real frame
@@ -3499,8 +3494,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
}
// restore prev_mi
cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
+ cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1;
-#if 0
+ #if 0
{
char filename[512];
FILE *recon_file;
@@ -3521,6 +3517,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
unsigned char *dest, unsigned int *frame_flags) {
+ cpi->enable_encode_breakout = 1;
+
if (!cpi->refresh_alt_ref_frame)
vp9_second_pass(cpi);
@@ -3544,24 +3542,28 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
}
}
-
-int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
- YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
- int64_t end_time) {
- VP9_COMP *cpi = (VP9_COMP *) ptr;
+static void check_initial_width(VP9_COMP *cpi, YV12_BUFFER_CONFIG *sd) {
VP9_COMMON *cm = &cpi->common;
- struct vpx_usec_timer timer;
- int res = 0;
-
if (!cpi->initial_width) {
// TODO(jkoleszar): Support 1/4 subsampling?
- cm->subsampling_x = sd->uv_width < sd->y_width;
- cm->subsampling_y = sd->uv_height < sd->y_height;
+ cm->subsampling_x = (sd != NULL) && sd->uv_width < sd->y_width;
+ cm->subsampling_y = (sd != NULL) && sd->uv_height < sd->y_height;
alloc_raw_frame_buffers(cpi);
cpi->initial_width = cm->width;
cpi->initial_height = cm->height;
}
+}
+
+
+int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time) {
+ VP9_COMP *cpi = (VP9_COMP *) ptr;
+ struct vpx_usec_timer timer;
+ int res = 0;
+
+ check_initial_width(cpi, sd);
vpx_usec_timer_start(&timer);
if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
cpi->active_map_enabled ? cpi->active_map : NULL))
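
The refactored check_initial_width keeps the subsampling inference but now tolerates a NULL buffer, so vp9_set_size_literal (added later in this file) can call it before any raw frame arrives. The inference itself is purely geometric, as this standalone sketch shows (struct buf is illustrative):

    /* A UV plane narrower than Y implies horizontal chroma subsampling,
     * and likewise for height: 4:2:0 halves both, 4:2:2 only the width,
     * 4:4:4 neither. */
    struct buf { int y_width, y_height, uv_width, uv_height; };

    static void infer_subsampling(const struct buf *sd,
                                  int *sub_x, int *sub_y) {
      *sub_x = (sd != NULL) && sd->uv_width < sd->y_width;
      *sub_y = (sd != NULL) && sd->uv_height < sd->y_height;
    }
    /* A 640x480 frame with 320x240 chroma yields sub_x = sub_y = 1. */
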
@@ -3575,16 +3577,15 @@ int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
static int frame_is_reference(const VP9_COMP *cpi) {
const VP9_COMMON *cm = &cpi->common;
- const MACROBLOCKD *mb = &cpi->mb.e_mbd;
return cm->frame_type == KEY_FRAME ||
cpi->refresh_last_frame ||
cpi->refresh_golden_frame ||
cpi->refresh_alt_ref_frame ||
cm->refresh_frame_context ||
- mb->lf.mode_ref_delta_update ||
- mb->seg.update_map ||
- mb->seg.update_data;
+ cm->lf.mode_ref_delta_update ||
+ cm->seg.update_map ||
+ cm->seg.update_data;
}
#if CONFIG_MULTIPLE_ARF
@@ -3644,6 +3645,8 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
configure_arnr_filter(cpi, cm->current_video_frame + frames_to_arf,
cpi->gfu_boost);
vp9_temporal_filter_prepare(cpi, frames_to_arf);
+ vp9_extend_frame_borders(&cpi->alt_ref_buffer,
+ cm->subsampling_x, cm->subsampling_y);
force_src_buffer = &cpi->alt_ref_buffer;
}
@@ -3911,9 +3914,9 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
{
double frame_psnr2, frame_ssim2 = 0;
double weight = 0;
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
- cpi->mb.e_mbd.lf.filter_level * 10 / 6);
+ cm->lf.filter_level * 10 / 6);
#endif
vp9_clear_system_state();
@@ -3987,8 +3990,8 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
return -1;
else {
int ret;
-#if CONFIG_POSTPROC
- ret = vp9_post_proc_frame(&cpi->common, &cpi->mb.e_mbd.lf, dest, flags);
+#if CONFIG_VP9_POSTPROC
+ ret = vp9_post_proc_frame(&cpi->common, dest, flags);
#else
if (cpi->common.frame_to_show) {
@@ -4001,7 +4004,7 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
ret = -1;
}
-#endif // !CONFIG_POSTPROC
+#endif // !CONFIG_VP9_POSTPROC
vp9_clear_system_state();
return ret;
}
@@ -4013,7 +4016,7 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
unsigned int threshold[MAX_SEGMENTS]) {
VP9_COMP *cpi = (VP9_COMP *) comp;
signed char feature_data[SEG_LVL_MAX][MAX_SEGMENTS];
- struct segmentation *seg = &cpi->mb.e_mbd.seg;
+ struct segmentation *seg = &cpi->common.seg;
int i;
if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
@@ -4030,7 +4033,7 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
// Activate segmentation.
vp9_enable_segmentation((VP9_PTR)cpi);
- // Set up the quan, LF and breakout threshold segment data
+ // Set up the quant, LF and breakout threshold segment data
for (i = 0; i < MAX_SEGMENTS; i++) {
feature_data[SEG_LVL_ALT_Q][i] = delta_q[i];
feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i];
@@ -4050,7 +4053,7 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF);
}
- // Initialise the feature data structure
+ // Initialize the feature data structure
// SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1
vp9_set_segment_data((VP9_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
@@ -4098,7 +4101,76 @@ int vp9_set_internal_size(VP9_PTR comp,
return 0;
}
+int vp9_set_size_literal(VP9_PTR comp, unsigned int width,
+ unsigned int height) {
+ VP9_COMP *cpi = (VP9_COMP *)comp;
+ VP9_COMMON *cm = &cpi->common;
+
+ check_initial_width(cpi, NULL);
+
+ if (width) {
+ cm->width = width;
+ if (cm->width * 5 < cpi->initial_width) {
+ cm->width = cpi->initial_width / 5 + 1;
+ printf("Warning: Desired width too small, changed to %d \n", cm->width);
+ }
+ if (cm->width > cpi->initial_width) {
+ cm->width = cpi->initial_width;
+ printf("Warning: Desired width too large, changed to %d \n", cm->width);
+ }
+ }
+
+ if (height) {
+ cm->height = height;
+ if (cm->height * 5 < cpi->initial_height) {
+ cm->height = cpi->initial_height / 5 + 1;
+ printf("Warning: Desired height too small, changed to %d \n", cm->height);
+ }
+ if (cm->height > cpi->initial_height) {
+ cm->height = cpi->initial_height;
+ printf("Warning: Desired height too large, changed to %d \n", cm->height);
+ }
+ }
+
+ assert(cm->width <= cpi->initial_width);
+ assert(cm->height <= cpi->initial_height);
+ update_frame_size(cpi);
+ return 0;
+}
+
+int vp9_switch_layer(VP9_PTR comp, int layer) {
+ VP9_COMP *cpi = (VP9_COMP *)comp;
+
+ if (cpi->use_svc) {
+ cpi->current_layer = layer;
+
+ // Use buffer i for layer i LST
+ cpi->lst_fb_idx = layer;
+ // Use buffer i-1 for layer i Alt (Inter-layer prediction)
+ if (layer != 0) cpi->alt_fb_idx = layer - 1;
+
+ // Use the rest for Golden
+ if (layer < 2 * cpi->number_spatial_layers - NUM_REF_FRAMES)
+ cpi->gld_fb_idx = cpi->lst_fb_idx;
+ else
+ cpi->gld_fb_idx = 2 * cpi->number_spatial_layers - 1 - layer;
+
+ printf("Switching to layer %d:\n", layer);
+ printf("Using references: LST/GLD/ALT [%d|%d|%d]\n", cpi->lst_fb_idx,
+ cpi->gld_fb_idx, cpi->alt_fb_idx);
+ } else {
+ printf("Switching layer not supported. Enable SVC first \n");
+ }
+ return 0;
+}
+
+void vp9_set_svc(VP9_PTR comp, int use_svc) {
+ VP9_COMP *cpi = (VP9_COMP *)comp;
+ cpi->use_svc = use_svc;
+ if (cpi->use_svc) printf("Enabled SVC encoder \n");
+ return;
+}
int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) {
int i, j;
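
Taken together, the new SVC entry points imply a driving sequence: enable SVC once with vp9_set_svc(enc, 1) at setup, then re-point the reference slots before each layer's encode. With two spatial layers, vp9_switch_layer gives layer 0 LST/GLD = 0/3 (ALT keeps its prior value) and layer 1 LST/GLD/ALT = 1/2/0. A hypothetical per-superframe driver, assuming a configured VP9_PTR handle enc (real integrations would reach these through the codec-control layer rather than calling them directly):

    static void encode_superframe(VP9_PTR enc) {
      int layer;
      for (layer = 0; layer < 2; ++layer) {
        vp9_switch_layer(enc, layer);  /* re-points lst/gld/alt_fb_idx */
        /* ...encode this layer's frame via vp9_get_compressed_data()... */
      }
    }
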
diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_onyx_int.h
index c258829..3e5796f 100644
--- a/libvpx/vp9/encoder/vp9_onyx_int.h
+++ b/libvpx/vp9/encoder/vp9_onyx_int.h
@@ -36,6 +36,8 @@
#define DISABLE_RC_LONG_TERM_MEM 0
#endif
+// #define MODE_TEST_HIT_STATS
+
// #define SPEEDSTATS 1
#if CONFIG_MULTIPLE_ARF
// Set MIN_GF_INTERVAL to 1 for the full decomposition.
@@ -79,15 +81,15 @@ typedef struct {
vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
- vp9_prob y_mode_prob[4][VP9_INTRA_MODES - 1];
- vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
+ vp9_prob y_mode_prob[4][INTRA_MODES - 1];
+ vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
vp9_prob partition_prob[2][NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
- vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
- [VP9_SWITCHABLE_FILTERS - 1];
+ vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1]
+ [SWITCHABLE_FILTERS - 1];
- int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2];
- vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
+ int inter_mode_counts[INTER_MODE_CONTEXTS][INTER_MODES - 1][2];
+ vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
struct tx_probs tx_probs;
vp9_prob mbskip_probs[MBSKIP_CONTEXTS];
@@ -145,18 +147,19 @@ typedef struct {
// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
typedef enum {
THR_NEARESTMV,
- THR_DC,
-
THR_NEARESTA,
THR_NEARESTG,
- THR_NEWMV,
- THR_COMP_NEARESTLA,
- THR_NEARMV,
- THR_COMP_NEARESTGA,
- THR_NEWG,
+ THR_DC,
+
+ THR_NEWMV,
THR_NEWA,
+ THR_NEWG,
+
+ THR_NEARMV,
THR_NEARA,
+ THR_COMP_NEARESTLA,
+ THR_COMP_NEARESTGA,
THR_TM,
@@ -182,7 +185,7 @@ typedef enum {
THR_H_PRED,
THR_V_PRED,
THR_D135_PRED,
- THR_D27_PRED,
+ THR_D207_PRED,
THR_D153_PRED,
THR_D63_PRED,
THR_D117_PRED,
@@ -192,7 +195,9 @@ typedef enum {
typedef enum {
DIAMOND = 0,
NSTEP = 1,
- HEX = 2
+ HEX = 2,
+ BIGDIA = 3,
+ SQUARE = 4
} SEARCH_METHODS;
typedef enum {
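
BIGDIA and SQUARE pair with the vp9_bigdia_search and vp9_square_search declarations added to vp9_mcomp.h earlier in this patch; since all three pattern searches now share one signature, dispatch can be table-driven. A sketch, assuming the mcomp declarations are in scope (pick_search and the typedef are illustrative):

    typedef int (*pattern_search_fn)(MACROBLOCK *x, int_mv *ref_mv,
                                     int search_param, int error_per_bit,
                                     int do_init_search,
                                     const vp9_variance_fn_ptr_t *vf,
                                     int use_mvcost, int_mv *center_mv,
                                     int_mv *best_mv);

    static pattern_search_fn pick_search(SEARCH_METHODS m) {
      switch (m) {
        case HEX:    return vp9_hex_search;
        case BIGDIA: return vp9_bigdia_search;
        case SQUARE: return vp9_square_search;
        default:     return NULL;  /* DIAMOND/NSTEP take the diamond path */
      }
    }
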
@@ -230,20 +235,29 @@ typedef enum {
FLAG_SKIP_INTRA_LOWVAR = 32,
} MODE_SEARCH_SKIP_LOGIC;
+typedef enum {
+ SUBPEL_ITERATIVE = 0,
+ SUBPEL_TREE = 1,
+ // Other methods to come
+} SUBPEL_SEARCH_METHODS;
+
+#define ALL_INTRA_MODES 0x3FF
+#define INTRA_DC_ONLY 0x01
+#define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED))
+#define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED))
+
typedef struct {
int RD;
SEARCH_METHODS search_method;
int auto_filter;
int recode_loop;
- int iterative_sub_pixel;
- int half_pixel_search;
- int quarter_pixel_search;
+ SUBPEL_SEARCH_METHODS subpel_search_method;
+ int subpel_iters_per_step;
int thresh_mult[MAX_MODES];
int max_step_search_steps;
int reduce_first_step_size;
int auto_mv_step_size;
int optimize_coefficients;
- int search_best_filter;
int static_segmentation;
int comp_inter_joint_search_thresh;
int adaptive_rd_thresh;
@@ -251,36 +265,43 @@ typedef struct {
int skip_encode_frame;
int use_lastframe_partitioning;
TX_SIZE_SEARCH_METHOD tx_size_search_method;
- int use_8tap_always;
+ int use_lp32x32fdct;
int use_avoid_tested_higherror;
int skip_lots_of_modes;
- int adjust_thresholds_by_speed;
int partition_by_variance;
int use_one_partition_size_always;
int less_rectangular_check;
int use_square_partition_only;
- int unused_mode_skip_lvl;
+ int mode_skip_start;
int reference_masking;
- BLOCK_SIZE_TYPE always_this_block_size;
+ BLOCK_SIZE always_this_block_size;
int auto_min_max_partition_size;
int auto_min_max_partition_interval;
int auto_min_max_partition_count;
- BLOCK_SIZE_TYPE min_partition_size;
- BLOCK_SIZE_TYPE max_partition_size;
- // int use_min_partition_size; // not used in code
- // int use_max_partition_size;
+ BLOCK_SIZE min_partition_size;
+ BLOCK_SIZE max_partition_size;
int adjust_partitioning_from_last_frame;
int last_partitioning_redo_frequency;
int disable_splitmv;
int using_small_partition_info;
+ // TODO(jingning): combine the related motion search speed features
+ int adaptive_motion_search;
// Implements various heuristics to skip searching modes
// The heuristics selected are based on flags
  // defined in the MODE_SEARCH_SKIP_LOGIC enum
unsigned int mode_search_skip_flags;
- MB_PREDICTION_MODE last_chroma_intra_mode;
+ // A source variance threshold below which the split mode is disabled
+ unsigned int disable_split_var_thresh;
+ // A source variance threshold below which filter search is disabled
+ // Choose a very large value (UINT_MAX) to use 8-tap always
+ unsigned int disable_filter_search_var_thresh;
+ int intra_y_mode_mask;
+ int intra_uv_mode_mask;
int use_rd_breakout;
int use_uv_intra_rd_estimate;
+ int use_fast_lpf_pick;
+ int use_fast_coef_updates; // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
} SPEED_FEATURES;
typedef struct VP9_COMP {
@@ -331,6 +352,10 @@ typedef struct VP9_COMP {
int lst_fb_idx;
int gld_fb_idx;
int alt_fb_idx;
+
+ int current_layer;
+ int use_svc;
+
#if CONFIG_MULTIPLE_ARF
int alt_ref_fb_idx[NUM_REF_FRAMES - 3];
#endif
@@ -360,14 +385,12 @@ typedef struct VP9_COMP {
unsigned int mode_check_freq[MAX_MODES];
unsigned int mode_test_hit_counts[MAX_MODES];
unsigned int mode_chosen_counts[MAX_MODES];
- int64_t unused_mode_skip_mask;
+ int64_t mode_skip_mask;
int ref_frame_mask;
int set_ref_frame_mask;
- int rd_thresh_mult[MAX_MODES];
- int rd_baseline_thresh[BLOCK_SIZE_TYPES][MAX_MODES];
- int rd_threshes[BLOCK_SIZE_TYPES][MAX_MODES];
- int rd_thresh_freq_fact[BLOCK_SIZE_TYPES][MAX_MODES];
+ int rd_threshes[BLOCK_SIZES][MAX_MODES];
+ int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
// FIXME(rbultje) int64_t?
@@ -381,9 +404,9 @@ typedef struct VP9_COMP {
// FIXME(rbultje) can this overflow?
int rd_tx_select_threshes[4][TX_MODES];
- int64_t rd_filter_diff[VP9_SWITCHABLE_FILTERS + 1];
- int64_t rd_filter_threshes[4][VP9_SWITCHABLE_FILTERS + 1];
- int64_t rd_filter_cache[VP9_SWITCHABLE_FILTERS + 1];
+ int64_t rd_filter_diff[SWITCHABLE_FILTERS + 1];
+ int64_t rd_filter_threshes[4][SWITCHABLE_FILTERS + 1];
+ int64_t rd_filter_cache[SWITCHABLE_FILTERS + 1];
int RDMULT;
int RDDIV;
@@ -458,8 +481,8 @@ typedef struct VP9_COMP {
int cq_target_quality;
- int y_mode_count[4][VP9_INTRA_MODES];
- int y_uv_mode_count[VP9_INTRA_MODES][VP9_INTRA_MODES];
+ int y_mode_count[4][INTRA_MODES];
+ int y_uv_mode_count[INTRA_MODES][INTRA_MODES];
unsigned int partition_count[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
nmv_context_counts NMVcount;
@@ -472,6 +495,7 @@ typedef struct VP9_COMP {
int last_boost;
int kf_boost;
int kf_zeromotion_pct;
+ int gf_zeromotion_pct;
int64_t target_bandwidth;
struct vpx_codec_pkt_list *output_pkt_list;
@@ -527,10 +551,11 @@ typedef struct VP9_COMP {
unsigned int active_map_enabled;
fractional_mv_step_fp *find_fractional_mv_step;
+ fractional_mv_step_comp_fp *find_fractional_mv_step_comp;
vp9_full_search_fn_t full_search_sad;
vp9_refining_search_fn_t refining_search_sad;
vp9_diamond_search_fn_t diamond_search_sad;
- vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZE_TYPES];
+ vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
uint64_t time_receive_data;
uint64_t time_compress_data;
uint64_t time_pick_lpf;
@@ -623,14 +648,18 @@ typedef struct VP9_COMP {
int dummy_packing; /* flag to indicate if packing is dummy */
- unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
- [VP9_SWITCHABLE_FILTERS];
+ unsigned int switchable_interp_count[SWITCHABLE_FILTERS + 1]
+ [SWITCHABLE_FILTERS];
unsigned int txfm_stepdown_count[TX_SIZES];
int initial_width;
int initial_height;
+ int number_spatial_layers;
+ int enable_encode_breakout; // Default value is 1. From first pass stats,
+ // encode_breakout may be disabled.
+
#if CONFIG_MULTIPLE_ARF
// ARF tracking variables.
int multi_arf_enabled;
@@ -645,7 +674,13 @@ typedef struct VP9_COMP {
#endif
#ifdef ENTROPY_STATS
- int64_t mv_ref_stats[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2];
+ int64_t mv_ref_stats[INTER_MODE_CONTEXTS][INTER_MODES - 1][2];
+#endif
+
+
+#ifdef MODE_TEST_HIT_STATS
+ // Debug / test stats
+ int64_t mode_test_hits[BLOCK_SIZES];
#endif
} VP9_COMP;
@@ -659,6 +694,17 @@ static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
}
}
+static int get_scale_ref_frame_idx(VP9_COMP *cpi,
+ MV_REFERENCE_FRAME ref_frame) {
+ if (ref_frame == LAST_FRAME) {
+ return 0;
+ } else if (ref_frame == GOLDEN_FRAME) {
+ return 1;
+ } else {
+ return 2;
+ }
+}
+
void vp9_encode_frame(VP9_COMP *cpi);
void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
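[Editor's note] The new intra_y_mode_mask / intra_uv_mode_mask speed features are bitmaps over MB_PREDICTION_MODE; the rd_pick_intra4x4block hunk later in this patch shows the luma check. A minimal sketch of the gating, assuming VP9's intra mode ordering:

#include <stdio.h>

/* Intra modes in VP9's MB_PREDICTION_MODE order (DC_PRED .. TM_PRED). */
typedef enum { DC_PRED, V_PRED, H_PRED, D45_PRED, D135_PRED, D117_PRED,
               D153_PRED, D207_PRED, D63_PRED, TM_PRED } PRED_MODE;

#define INTRA_DC_TM     ((1 << TM_PRED) | (1 << DC_PRED))
#define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED))

int main(void) {
  const unsigned mask = INTRA_DC_TM_H_V;  /* e.g. a fast speed setting */
  int mode;
  for (mode = DC_PRED; mode <= TM_PRED; ++mode)
    if (mask & (1u << mode))
      printf("mode %d searched\n", mode);  /* all other modes are pruned */
  return 0;
}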
diff --git a/libvpx/vp9/encoder/vp9_picklpf.c b/libvpx/vp9/encoder/vp9_picklpf.c
index 2b8f2cd..239fd6b 100644
--- a/libvpx/vp9/encoder/vp9_picklpf.c
+++ b/libvpx/vp9/encoder/vp9_picklpf.c
@@ -21,29 +21,15 @@
#include "./vpx_scale_rtcd.h"
void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
- YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
- uint8_t *src_y, *dst_y;
- int yheight;
- int ystride;
- int yoffset;
- int linestocopy;
+ YV12_BUFFER_CONFIG *dst_ybc, int fraction) {
+ const int height = src_ybc->y_height;
+ const int stride = src_ybc->y_stride;
+ const int offset = stride * ((height >> 5) * 16 - 8);
+ const int lines_to_copy = MAX(height >> (fraction + 4), 1) << 4;
assert(src_ybc->y_stride == dst_ybc->y_stride);
- yheight = src_ybc->y_height;
- ystride = src_ybc->y_stride;
-
- linestocopy = (yheight >> (Fraction + 4));
-
- if (linestocopy < 1)
- linestocopy = 1;
-
- linestocopy <<= 4;
-
- yoffset = ystride * ((yheight >> 5) * 16 - 8);
- src_y = src_ybc->y_buffer + yoffset;
- dst_y = dst_ybc->y_buffer + yoffset;
-
- vpx_memcpy(dst_y, src_y, ystride * (linestocopy + 16));
+ vpx_memcpy(dst_ybc->y_buffer + offset, src_ybc->y_buffer + offset,
+ stride * (lines_to_copy + 16));
}
static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
@@ -125,14 +111,14 @@ static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) {
void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) {
}
-void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
- struct loopfilter *lf = &cpi->mb.e_mbd.lf;
+void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
+ VP9_COMMON *const cm = &cpi->common;
+ struct loopfilter *const lf = &cm->lf;
int best_err = 0;
int filt_err = 0;
- int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
- int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+ const int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+ const int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
int filter_step;
int filt_high = 0;
@@ -145,33 +131,26 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
int Bias = 0; // Bias against raising loop filter and in favour of lowering it
// Make a copy of the unfiltered / processed recon buffer
- vp8_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+ vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
- if (cm->frame_type == KEY_FRAME)
- lf->sharpness_level = 0;
- else
- lf->sharpness_level = cpi->oxcf.Sharpness;
+ lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0
+ : cpi->oxcf.Sharpness;
// Start the search at the previous frame filter level unless it is now out of range.
- filt_mid = lf->filter_level;
-
- if (filt_mid < min_filter_level)
- filt_mid = min_filter_level;
- else if (filt_mid > max_filter_level)
- filt_mid = max_filter_level;
+ filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
// Define the initial step size
- filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
+ filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
// Get baseline error score
vp9_set_alt_lf_level(cpi, filt_mid);
- vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1);
+ vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1, partial);
best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
filt_best = filt_mid;
// Re-instate the unfiltered frame
- vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+ vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
while (filter_step > 0) {
Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images
@@ -190,12 +169,12 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
if ((filt_direction <= 0) && (filt_low != filt_mid)) {
// Get Low filter error score
vp9_set_alt_lf_level(cpi, filt_low);
- vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1);
+ vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1, partial);
filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
// Re-instate the unfiltered frame
- vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+ vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
// If value is close to the best so far then bias towards a lower loop filter value.
if ((filt_err - Bias) < best_err) {
@@ -210,12 +189,12 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
// Now look at filt_high
if ((filt_direction >= 0) && (filt_high != filt_mid)) {
vp9_set_alt_lf_level(cpi, filt_high);
- vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1);
+ vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, partial);
filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
// Re-instate the unfiltered frame
- vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+ vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
// Was it better than the previous best?
if (filt_err < (best_err - Bias)) {
@@ -236,3 +215,4 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
lf->filter_level = filt_best;
}
+
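[Editor's note] vp9_pick_filter_level is a coarse-to-fine probe around the previous frame's filter level with a bias against raising it; the hunks above show only fragments of the loop. A runnable sketch of the control flow, with a toy convex error curve standing in for "loop-filter the frame and measure vp9_calc_ss_err":

#include <stdio.h>

static int eval(int level) {  /* toy stand-in for filter + vp9_calc_ss_err */
  const int d = level - 23;
  return 1000 + d * d;
}

int main(void) {
  const int min_level = 0, max_level = 63, prev_level = 32;
  int mid = prev_level < min_level ? min_level
          : prev_level > max_level ? max_level : prev_level;
  int step = mid < 16 ? 4 : mid / 4;  /* initial step size, as in the patch */
  int best = mid, best_err = eval(mid), dir = 0;
  while (step > 0) {
    /* Bias against raising the loop filter level. */
    const int bias = (best_err >> (15 - mid / 8)) * step;
    if (dir <= 0 && mid - step >= min_level) {
      const int err = eval(mid - step);
      if (err - bias < best_err) { best = mid - step; best_err = err; }
    }
    if (dir >= 0 && mid + step <= max_level) {
      const int err = eval(mid + step);
      if (err < best_err - bias) { best = mid + step; best_err = err; }
    }
    if (best == mid) {
      step /= 2;          /* converged locally: halve the probe distance */
      dir = 0;
    } else {
      dir = best < mid ? -1 : 1;
      mid = best;
    }
  }
  printf("picked filter level %d (err %d)\n", best, eval(best));
  return 0;
}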
diff --git a/libvpx/vp9/encoder/vp9_picklpf.h b/libvpx/vp9/encoder/vp9_picklpf.h
index 698cb8d..9de4cf8 100644
--- a/libvpx/vp9/encoder/vp9_picklpf.h
+++ b/libvpx/vp9/encoder/vp9_picklpf.h
@@ -18,6 +18,5 @@ struct VP9_COMP;
void vp9_set_alt_lf_level(struct VP9_COMP *cpi, int filt_val);
void vp9_pick_filter_level(struct yv12_buffer_config *sd,
- struct VP9_COMP *cpi);
-
+ struct VP9_COMP *cpi, int partial);
#endif // VP9_ENCODER_VP9_PICKLPF_H_
diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c
index 525f4da..6c8b2a0 100644
--- a/libvpx/vp9/encoder/vp9_quantize.c
+++ b/libvpx/vp9/encoder/vp9_quantize.c
@@ -69,6 +69,7 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
if (x >= zbin) {
x += (round_ptr[rc != 0]);
+ x = clamp(x, INT16_MIN, INT16_MAX);
y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
quant_shift_ptr[rc != 0]) >> 16; // quantize (x)
x = (y ^ sz) - sz; // get the sign back
@@ -84,7 +85,6 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
*eob_ptr = eob + 1;
}
-// This function works well for large transform size.
void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
int16_t *zbin_ptr, int16_t *round_ptr,
@@ -94,7 +94,7 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan) {
int i, rc, eob;
- int zbins[2], nzbins[2], zbin;
+ int zbins[2], nzbins[2];
int x, y, z, sz;
int idx = 0;
int idx_arr[1024];
@@ -105,8 +105,8 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
eob = -1;
// Base ZBIN
- zbins[0] = zbin_ptr[0] + zbin_oq_value;
- zbins[1] = zbin_ptr[1] + zbin_oq_value;
+ zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1);
+ zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1);
nzbins[0] = zbins[0] * -1;
nzbins[1] = zbins[1] * -1;
@@ -114,7 +114,7 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
// Pre-scan pass
for (i = 0; i < n_coeffs; i++) {
rc = scan[i];
- z = coeff_ptr[rc] * 2;
+ z = coeff_ptr[rc];
// If the coefficient is out of the base ZBIN range, keep it for
// quantization.
@@ -127,31 +127,52 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
for (i = 0; i < idx; i++) {
rc = scan[idx_arr[i]];
- // Calculate ZBIN
- zbin = (zbins[rc != 0]);
-
- z = coeff_ptr[rc] * 2;
+ z = coeff_ptr[rc];
sz = (z >> 31); // sign of z
x = (z ^ sz) - sz; // x = abs(z)
- if (x >= zbin) {
- x += (round_ptr[rc != 0]);
- y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
- quant_shift_ptr[rc != 0]) >> 16; // quantize (x)
+ x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ x = clamp(x, INT16_MIN, INT16_MAX);
+ y = ((((x * quant_ptr[rc != 0]) >> 16) + x) *
+ quant_shift_ptr[rc != 0]) >> 15; // quantize (x)
- x = (y ^ sz) - sz; // get the sign back
- qcoeff_ptr[rc] = x; // write to destination
- dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2; // dequantized value
+ x = (y ^ sz) - sz; // get the sign back
+ qcoeff_ptr[rc] = x; // write to destination
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2; // dequantized value
- if (y) {
- eob = idx_arr[i]; // last nonzero coeffs
- }
- }
+ if (y)
+ eob = idx_arr[i]; // last nonzero coeffs
}
}
*eob_ptr = eob + 1;
}
+struct plane_block_idx {
+ int plane;
+ int block;
+};
+
+// TODO(jkoleszar): returning a struct so it can be used in a const context,
+// expect to refactor this further later.
+static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
+ int b_idx) {
+ const int v_offset = y_blocks * 5 / 4;
+ struct plane_block_idx res;
+
+ if (b_idx < y_blocks) {
+ res.plane = 0;
+ res.block = b_idx;
+ } else if (b_idx < v_offset) {
+ res.plane = 1;
+ res.block = b_idx - y_blocks;
+ } else {
+ assert(b_idx < y_blocks * 3 / 2);
+ res.plane = 2;
+ res.block = b_idx - v_offset;
+ }
+ return res;
+}
+
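[Editor's note] plane_block_idx above flattens the Y/U/V 4x4 block indices of a 4:2:0 block into one namespace. A quick self-check for a 16x16 block (y_blocks == 16, so U occupies indices 16..19 and V 20..23):

#include <assert.h>
#include <stdio.h>

struct plane_block_idx { int plane, block; };

static struct plane_block_idx plane_block_idx(int y_blocks, int b_idx) {
  const int v_offset = y_blocks * 5 / 4;
  struct plane_block_idx res;
  if (b_idx < y_blocks) {
    res.plane = 0; res.block = b_idx;
  } else if (b_idx < v_offset) {
    res.plane = 1; res.block = b_idx - y_blocks;
  } else {
    assert(b_idx < y_blocks * 3 / 2);
    res.plane = 2; res.block = b_idx - v_offset;
  }
  return res;
}

int main(void) {
  const int y_blocks = 16;  /* 16x16 luma = 16 4x4 blocks */
  int i;
  for (i = 0; i < y_blocks * 3 / 2; ++i) {
    const struct plane_block_idx pb = plane_block_idx(y_blocks, i);
    printf("b_idx %2d -> plane %d block %2d\n", i, pb.plane, pb.block);
  }
  return 0;
}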
void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
int y_blocks) {
MACROBLOCKD *const xd = &mb->e_mbd;
@@ -159,14 +180,14 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
const int16_t *scan = get_scan_4x4(tx_type);
const int16_t *iscan = get_iscan_4x4(tx_type);
- vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
+ vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block),
16, mb->skip_block,
mb->plane[pb_idx.plane].zbin,
mb->plane[pb_idx.plane].round,
mb->plane[pb_idx.plane].quant,
mb->plane[pb_idx.plane].quant_shift,
- BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
- BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
+ BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block),
+ BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block),
xd->plane[pb_idx.plane].dequant,
mb->plane[pb_idx.plane].zbin_extra,
&xd->plane[pb_idx.plane].eobs[pb_idx.block],
@@ -185,63 +206,43 @@ static void invert_quant(int16_t *quant, int16_t *shift, int d) {
}
void vp9_init_quantizer(VP9_COMP *cpi) {
- int i;
- int quant_val;
- int quant_uv_val;
-#if CONFIG_ALPHA
- int quant_alpha_val;
-#endif
- int q;
+ int i, q;
+ VP9_COMMON *const cm = &cpi->common;
for (q = 0; q < QINDEX_RANGE; q++) {
- int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 84 : 80;
- int qrounding_factor = 48;
- if (q == 0) {
- qzbin_factor = 64;
- qrounding_factor = 64;
+ const int qzbin_factor = q == 0 ? 64 : (vp9_dc_quant(q, 0) < 148 ? 84 : 80);
+ const int qrounding_factor = q == 0 ? 64 : 48;
+
+ // y
+ for (i = 0; i < 2; ++i) {
+ const int quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
+ : vp9_ac_quant(q, 0);
+ invert_quant(&cpi->y_quant[q][i], &cpi->y_quant_shift[q][i], quant);
+ cpi->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+ cpi->y_round[q][i] = (qrounding_factor * quant) >> 7;
+ cm->y_dequant[q][i] = quant;
}
- // dc values
- quant_val = vp9_dc_quant(q, cpi->common.y_dc_delta_q);
- invert_quant(cpi->y_quant[q] + 0, cpi->y_quant_shift[q] + 0, quant_val);
- cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
- cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7;
- cpi->common.y_dequant[q][0] = quant_val;
-
- quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q);
- invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val);
- cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
- cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7;
- cpi->common.uv_dequant[q][0] = quant_val;
-
-#if CONFIG_ALPHA
- quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q);
- invert_quant(cpi->a_quant[q] + 0, cpi->a_quant_shift[q] + 0, quant_val);
- cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
- cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7;
- cpi->common.a_dequant[q][0] = quant_val;
-#endif
-
- quant_val = vp9_ac_quant(q, 0);
- invert_quant(cpi->y_quant[q] + 1, cpi->y_quant_shift[q] + 1, quant_val);
- cpi->y_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
- cpi->y_round[q][1] = (qrounding_factor * quant_val) >> 7;
- cpi->common.y_dequant[q][1] = quant_val;
-
- quant_uv_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q);
- invert_quant(cpi->uv_quant[q] + 1, cpi->uv_quant_shift[q] + 1,
- quant_uv_val);
- cpi->uv_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
- cpi->uv_round[q][1] = (qrounding_factor * quant_uv_val) >> 7;
- cpi->common.uv_dequant[q][1] = quant_uv_val;
+ // uv
+ for (i = 0; i < 2; ++i) {
+ const int quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q)
+ : vp9_ac_quant(q, cm->uv_ac_delta_q);
+ invert_quant(&cpi->uv_quant[q][i], &cpi->uv_quant_shift[q][i], quant);
+ cpi->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+ cpi->uv_round[q][i] = (qrounding_factor * quant) >> 7;
+ cm->uv_dequant[q][i] = quant;
+ }
#if CONFIG_ALPHA
- quant_alpha_val = vp9_ac_quant(q, cpi->common.a_ac_delta_q);
- invert_quant(cpi->a_quant[q] + 1, cpi->a_quant_shift[q] + 1,
- quant_alpha_val);
- cpi->a_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7);
- cpi->a_round[q][1] = (qrounding_factor * quant_alpha_val) >> 7;
- cpi->common.a_dequant[q][1] = quant_alpha_val;
+ // alpha
+ for (i = 0; i < 2; ++i) {
+ const int quant = i == 0 ? vp9_dc_quant(q, cm->a_dc_delta_q)
+ : vp9_ac_quant(q, cm->a_ac_delta_q);
+ invert_quant(&cpi->a_quant[q][i], &cpi->a_quant_shift[q][i], quant);
+ cpi->a_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+ cpi->a_round[q][i] = (qrounding_factor * quant) >> 7;
+ cm->a_dequant[q][i] = quant;
+ }
#endif
for (i = 2; i < 8; i++) {
@@ -249,20 +250,20 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
cpi->y_quant_shift[q][i] = cpi->y_quant_shift[q][1];
cpi->y_zbin[q][i] = cpi->y_zbin[q][1];
cpi->y_round[q][i] = cpi->y_round[q][1];
- cpi->common.y_dequant[q][i] = cpi->common.y_dequant[q][1];
+ cm->y_dequant[q][i] = cm->y_dequant[q][1];
cpi->uv_quant[q][i] = cpi->uv_quant[q][1];
cpi->uv_quant_shift[q][i] = cpi->uv_quant_shift[q][1];
cpi->uv_zbin[q][i] = cpi->uv_zbin[q][1];
cpi->uv_round[q][i] = cpi->uv_round[q][1];
- cpi->common.uv_dequant[q][i] = cpi->common.uv_dequant[q][1];
+ cm->uv_dequant[q][i] = cm->uv_dequant[q][1];
#if CONFIG_ALPHA
cpi->a_quant[q][i] = cpi->a_quant[q][1];
cpi->a_quant_shift[q][i] = cpi->a_quant_shift[q][1];
cpi->a_zbin[q][i] = cpi->a_zbin[q][1];
cpi->a_round[q][i] = cpi->a_round[q][1];
- cpi->common.a_dequant[q][i] = cpi->common.a_dequant[q][1];
+ cm->a_dequant[q][i] = cm->a_dequant[q][1];
#endif
}
}
@@ -272,8 +273,9 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
int i;
MACROBLOCKD *xd = &x->e_mbd;
int zbin_extra;
- int segment_id = xd->mode_info_context->mbmi.segment_id;
- const int qindex = vp9_get_qindex(xd, segment_id, cpi->common.base_qindex);
+ int segment_id = xd->this_mi->mbmi.segment_id;
+ const int qindex = vp9_get_qindex(&cpi->common.seg, segment_id,
+ cpi->common.base_qindex);
// Y
zbin_extra = (cpi->common.y_dequant[qindex][1] *
@@ -308,7 +310,8 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex];
#endif
- x->skip_block = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP);
+ x->skip_block = vp9_segfeature_active(&cpi->common.seg, segment_id,
+ SEG_LVL_SKIP);
/* save this macroblock QIndex for vp9_update_zbin_extra() */
x->e_mbd.q_index = qindex;
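[Editor's note] The essential fix to vp9_quantize_b_c above is clamping x to the int16 range before the fixed-point multiply, so a coefficient pushed past 32767 by the rounding add can no longer overflow. A standalone sketch of one coefficient's path; the quant/round/shift/dequant constants are illustrative, not real VP9 table entries:

#include <stdint.h>
#include <stdio.h>

static int clamp(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

int main(void) {
  /* Illustrative values only, not real VP9 quantizer table entries. */
  const int quant = 2048, quant_shift = 32768, round = 16, dequant = 32;
  const int z = 32760;              /* input coefficient */
  const int sz = z >> 31;           /* sign of z */
  int x = (z ^ sz) - sz;            /* x = abs(z) */
  int y, q;
  x += round;
  x = clamp(x, INT16_MIN, INT16_MAX);  /* overflow guard added by this patch */
  y = (((x * quant) >> 16) + x) * quant_shift >> 16;  /* quantize (x) */
  q = (y ^ sz) - sz;                /* get the sign back */
  printf("qcoeff=%d dqcoeff=%d\n", q, q * dequant);
  return 0;
}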
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c
index d3a9529..2d12ba9 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -71,7 +71,6 @@ int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
void vp9_save_coding_context(VP9_COMP *cpi) {
CODING_CONTEXT *const cc = &cpi->coding_context;
VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
// Stores a snapshot of key state variables which can subsequently be
// restored with a call to vp9_restore_coding_context. These functions are
@@ -89,7 +88,7 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
vp9_copy(cc->partition_prob, cm->fc.partition_prob);
- vp9_copy(cc->segment_pred_probs, xd->seg.pred_probs);
+ vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
vp9_copy(cc->intra_inter_prob, cm->fc.intra_inter_prob);
vp9_copy(cc->comp_inter_prob, cm->fc.comp_inter_prob);
@@ -99,8 +98,8 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
- vp9_copy(cc->last_ref_lf_deltas, xd->lf.last_ref_deltas);
- vp9_copy(cc->last_mode_lf_deltas, xd->lf.last_mode_deltas);
+ vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
+ vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
vp9_copy(cc->coef_probs, cm->fc.coef_probs);
vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
@@ -111,7 +110,6 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
void vp9_restore_coding_context(VP9_COMP *cpi) {
CODING_CONTEXT *const cc = &cpi->coding_context;
VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
// Restore key state variables to the snapshot state stored in the
// previous call to vp9_save_coding_context.
@@ -127,7 +125,7 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
vp9_copy(cm->fc.partition_prob, cc->partition_prob);
- vp9_copy(xd->seg.pred_probs, cc->segment_pred_probs);
+ vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
vp9_copy(cm->fc.intra_inter_prob, cc->intra_inter_prob);
vp9_copy(cm->fc.comp_inter_prob, cc->comp_inter_prob);
@@ -138,8 +136,8 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
cpi->coding_context.last_frame_seg_map_copy,
(cm->mi_rows * cm->mi_cols));
- vp9_copy(xd->lf.last_ref_deltas, cc->last_ref_lf_deltas);
- vp9_copy(xd->lf.last_mode_deltas, cc->last_mode_lf_deltas);
+ vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
+ vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
vp9_copy(cm->fc.coef_probs, cc->coef_probs);
vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
@@ -149,9 +147,8 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
void vp9_setup_key_frame(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
- vp9_setup_past_independence(cm, xd);
+ vp9_setup_past_independence(cm);
// interval before next GF
cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
@@ -162,9 +159,8 @@ void vp9_setup_key_frame(VP9_COMP *cpi) {
void vp9_setup_inter_frame(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
if (cm->error_resilient_mode || cm->intra_only)
- vp9_setup_past_independence(cm, xd);
+ vp9_setup_past_independence(cm);
assert(cm->frame_context_idx < NUM_FRAME_CONTEXTS);
cm->fc = cm->frame_contexts[cm->frame_context_idx];
diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c
index 2d93250..df00334 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/libvpx/vp9/encoder/vp9_rdopt.c
@@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
#include <stdio.h>
#include <math.h>
#include <limits.h>
@@ -49,65 +48,66 @@
DECLARE_ALIGNED(16, extern const uint8_t,
vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-#define I4X4_PRED 0x8000
-#define SPLITMV 0x10000
+#define LAST_FRAME_MODE_MASK 0xFFDADCD60
+#define GOLDEN_FRAME_MODE_MASK 0xFFB5A3BB0
+#define ALT_REF_MODE_MASK 0xFF8C648D0
const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
- {NEARESTMV, LAST_FRAME, NONE},
- {DC_PRED, INTRA_FRAME, NONE},
-
- {NEARESTMV, ALTREF_FRAME, NONE},
- {NEARESTMV, GOLDEN_FRAME, NONE},
- {NEWMV, LAST_FRAME, NONE},
- {NEARESTMV, LAST_FRAME, ALTREF_FRAME},
- {NEARMV, LAST_FRAME, NONE},
- {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {NEWMV, GOLDEN_FRAME, NONE},
- {NEWMV, ALTREF_FRAME, NONE},
- {NEARMV, ALTREF_FRAME, NONE},
-
- {TM_PRED, INTRA_FRAME, NONE},
-
- {NEARMV, LAST_FRAME, ALTREF_FRAME},
- {NEWMV, LAST_FRAME, ALTREF_FRAME},
- {NEARMV, GOLDEN_FRAME, NONE},
- {NEARMV, GOLDEN_FRAME, ALTREF_FRAME},
- {NEWMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {SPLITMV, LAST_FRAME, NONE},
- {SPLITMV, GOLDEN_FRAME, NONE},
- {SPLITMV, ALTREF_FRAME, NONE},
- {SPLITMV, LAST_FRAME, ALTREF_FRAME},
- {SPLITMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {ZEROMV, LAST_FRAME, NONE},
- {ZEROMV, GOLDEN_FRAME, NONE},
- {ZEROMV, ALTREF_FRAME, NONE},
- {ZEROMV, LAST_FRAME, ALTREF_FRAME},
- {ZEROMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {I4X4_PRED, INTRA_FRAME, NONE},
- {H_PRED, INTRA_FRAME, NONE},
- {V_PRED, INTRA_FRAME, NONE},
- {D135_PRED, INTRA_FRAME, NONE},
- {D27_PRED, INTRA_FRAME, NONE},
- {D153_PRED, INTRA_FRAME, NONE},
- {D63_PRED, INTRA_FRAME, NONE},
- {D117_PRED, INTRA_FRAME, NONE},
- {D45_PRED, INTRA_FRAME, NONE},
+ {RD_NEARESTMV, LAST_FRAME, NONE},
+ {RD_NEARESTMV, ALTREF_FRAME, NONE},
+ {RD_NEARESTMV, GOLDEN_FRAME, NONE},
+
+ {RD_DC_PRED, INTRA_FRAME, NONE},
+
+ {RD_NEWMV, LAST_FRAME, NONE},
+ {RD_NEWMV, ALTREF_FRAME, NONE},
+ {RD_NEWMV, GOLDEN_FRAME, NONE},
+
+ {RD_NEARMV, LAST_FRAME, NONE},
+ {RD_NEARMV, ALTREF_FRAME, NONE},
+ {RD_NEARESTMV, LAST_FRAME, ALTREF_FRAME},
+ {RD_NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
+
+ {RD_TM_PRED, INTRA_FRAME, NONE},
+
+ {RD_NEARMV, LAST_FRAME, ALTREF_FRAME},
+ {RD_NEWMV, LAST_FRAME, ALTREF_FRAME},
+ {RD_NEARMV, GOLDEN_FRAME, NONE},
+ {RD_NEARMV, GOLDEN_FRAME, ALTREF_FRAME},
+ {RD_NEWMV, GOLDEN_FRAME, ALTREF_FRAME},
+
+ {RD_SPLITMV, LAST_FRAME, NONE},
+ {RD_SPLITMV, GOLDEN_FRAME, NONE},
+ {RD_SPLITMV, ALTREF_FRAME, NONE},
+ {RD_SPLITMV, LAST_FRAME, ALTREF_FRAME},
+ {RD_SPLITMV, GOLDEN_FRAME, ALTREF_FRAME},
+
+ {RD_ZEROMV, LAST_FRAME, NONE},
+ {RD_ZEROMV, GOLDEN_FRAME, NONE},
+ {RD_ZEROMV, ALTREF_FRAME, NONE},
+ {RD_ZEROMV, LAST_FRAME, ALTREF_FRAME},
+ {RD_ZEROMV, GOLDEN_FRAME, ALTREF_FRAME},
+
+ {RD_I4X4_PRED, INTRA_FRAME, NONE},
+ {RD_H_PRED, INTRA_FRAME, NONE},
+ {RD_V_PRED, INTRA_FRAME, NONE},
+ {RD_D135_PRED, INTRA_FRAME, NONE},
+ {RD_D207_PRED, INTRA_FRAME, NONE},
+ {RD_D153_PRED, INTRA_FRAME, NONE},
+ {RD_D63_PRED, INTRA_FRAME, NONE},
+ {RD_D117_PRED, INTRA_FRAME, NONE},
+ {RD_D45_PRED, INTRA_FRAME, NONE},
};
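[Editor's note] The three *_MODE_MASK constants are 36-bit bitmaps over the 36-entry vp9_mode_order table above, consumed through the new int64_t mode_skip_mask field; a set bit appears to mark an entry to skip once the named reference frame is judged dominant (an inference from this patch, not spelled out in this hunk). A quick decode of the LAST_FRAME mask:

#include <stdio.h>

#define MAX_MODES 36
#define LAST_FRAME_MODE_MASK 0xFFDADCD60ULL

int main(void) {
  int i;
  for (i = 0; i < MAX_MODES; ++i)
    if (!(LAST_FRAME_MODE_MASK & (1ULL << i)))
      printf("vp9_mode_order[%2d] stays searchable\n", i);
  return 0;
}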
// The baseline rd thresholds for breaking out of the rd loop for
// certain modes are assumed to be based on 8x8 blocks.
// This table is used to correct for block size.
// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
-static int rd_thresh_block_size_factor[BLOCK_SIZE_TYPES] =
+static int rd_thresh_block_size_factor[BLOCK_SIZES] =
{2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
-#define BASE_RD_THRESH_FREQ_FACT 16
-#define MAX_RD_THRESH_FREQ_FACT 32
-#define MAX_RD_THRESH_FREQ_INC 1
+#define MAX_RD_THRESH_FACT 64
+#define RD_THRESH_INC 1
static void fill_token_costs(vp9_coeff_cost *c,
vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
@@ -160,6 +160,15 @@ static int compute_rd_mult(int qindex) {
return (11 * q * q) >> 2;
}
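[Editor's note] compute_rd_mult is effectively 2.75 * q * q in integer arithmetic: for a dc quantizer value of q = 32, (11 * 32 * 32) >> 2 == 2816.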
+static MB_PREDICTION_MODE rd_mode_to_mode(RD_PREDICTION_MODE rd_mode) {
+ if (rd_mode == RD_SPLITMV || rd_mode == RD_I4X4_PRED) {
+ assert(!"Invalid rd_mode");
+ return MB_MODE_COUNT;
+ }
+ assert((int)rd_mode < (int)MB_MODE_COUNT);
+ return (MB_PREDICTION_MODE)rd_mode;
+}
+
void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
@@ -199,7 +208,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
cpi->RDDIV = 1;
cpi->RDMULT /= 100;
- for (bsize = 0; bsize < BLOCK_SIZE_TYPES; ++bsize) {
+ for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
for (i = 0; i < MAX_MODES; ++i) {
      // Thresholds here seem unnecessarily harsh but fine given the actual
// range of values used for cpi->sf.thresh_mult[]
@@ -213,18 +222,12 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
} else {
cpi->rd_threshes[bsize][i] = INT_MAX;
}
- cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
-
- if (cpi->sf.adaptive_rd_thresh)
- cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT;
- else
- cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
}
}
} else {
cpi->RDDIV = 100;
- for (bsize = 0; bsize < BLOCK_SIZE_TYPES; ++bsize) {
+ for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
for (i = 0; i < MAX_MODES; i++) {
      // Thresholds here seem unnecessarily harsh but fine given the actual
// range of values used for cpi->sf.thresh_mult[]
@@ -237,12 +240,6 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
} else {
cpi->rd_threshes[bsize][i] = INT_MAX;
}
- cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
-
- if (cpi->sf.adaptive_rd_thresh)
- cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT;
- else
- cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
}
}
}
@@ -277,16 +274,6 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
}
}
-static INLINE BLOCK_SIZE_TYPE get_block_size(int bwl, int bhl) {
- return bsize_from_dim_lookup[bwl][bhl];
-}
-
-static BLOCK_SIZE_TYPE get_plane_block_size(BLOCK_SIZE_TYPE bsize,
- struct macroblockd_plane *pd) {
- return get_block_size(plane_block_width_log2by4(bsize, pd),
- plane_block_height_log2by4(bsize, pd));
-}
-
static INLINE void linear_interpolate2(double x, int ntab, int inv_step,
const double *tab1, const double *tab2,
double *v1, double *v2) {
@@ -388,7 +375,7 @@ static void model_rd_from_var_lapndz(int var, int n, int qstep,
vp9_clear_system_state();
}
-static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
int *out_rate_sum, int64_t *out_dist_sum) {
// Note our transform coeffs are 8 times an orthogonal transform.
@@ -399,18 +386,14 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblock_plane *const p = &x->plane[i];
struct macroblockd_plane *const pd = &xd->plane[i];
-
- // TODO(dkovalev) the same code in get_plane_block_size
- const int bwl = plane_block_width_log2by4(bsize, pd);
- const int bhl = plane_block_height_log2by4(bsize, pd);
- const BLOCK_SIZE_TYPE bs = get_block_size(bwl, bhl);
+ const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
unsigned int sse;
int rate;
int64_t dist;
(void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
pd->dst.buf, pd->dst.stride, &sse);
// sse works better than var, since there is no dc prediction used
- model_rd_from_var_lapndz(sse, 16 << (bwl + bhl),
+ model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
pd->dequant[1] >> 3, &rate, &dist);
rate_sum += rate;
@@ -421,81 +404,52 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
*out_dist_sum = dist_sum << 4;
}
-static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
- MACROBLOCK *x, MACROBLOCKD *xd,
- int *out_rate_sum, int64_t *out_dist_sum) {
- // Note our transform coeffs are 8 times an orthogonal transform.
- // Hence quantizer step is also 8 times. To get effective quantizer
- // we need to divide by 8 before sending to modeling function.
- struct macroblock_plane *const p = &x->plane[0];
- struct macroblockd_plane *const pd = &xd->plane[0];
-
- // TODO(dkovalev) the same code in get_plane_block_size
- const int bwl = plane_block_width_log2by4(bsize, pd);
- const int bhl = plane_block_height_log2by4(bsize, pd);
- const BLOCK_SIZE_TYPE bs = get_block_size(bwl, bhl);
- unsigned int sse;
- int rate;
- int64_t dist;
- (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
- pd->dst.buf, pd->dst.stride, &sse);
- // sse works better than var, since there is no dc prediction used
- model_rd_from_var_lapndz(sse, 16 << (bwl + bhl),
- pd->dequant[1] >> 3, &rate, &dist);
-
- *out_rate_sum = rate;
- *out_dist_sum = dist << 4;
-}
-
-static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
TX_SIZE tx_size,
MACROBLOCK *x, MACROBLOCKD *xd,
int *out_rate_sum, int64_t *out_dist_sum,
int *out_skip) {
- int t = 4, j, k;
- BLOCK_SIZE_TYPE bs = BLOCK_4X4;
+ int j, k;
+ BLOCK_SIZE bs;
struct macroblock_plane *const p = &x->plane[0];
struct macroblockd_plane *const pd = &xd->plane[0];
- const int width = plane_block_width(bsize, pd);
- const int height = plane_block_height(bsize, pd);
+ const int width = 4 << num_4x4_blocks_wide_lookup[bsize];
+ const int height = 4 << num_4x4_blocks_high_lookup[bsize];
int rate_sum = 0;
int64_t dist_sum = 0;
+ const int t = 4 << tx_size;
if (tx_size == TX_4X4) {
bs = BLOCK_4X4;
- t = 4;
} else if (tx_size == TX_8X8) {
bs = BLOCK_8X8;
- t = 8;
} else if (tx_size == TX_16X16) {
bs = BLOCK_16X16;
- t = 16;
} else if (tx_size == TX_32X32) {
bs = BLOCK_32X32;
- t = 32;
} else {
assert(0);
}
+
*out_skip = 1;
for (j = 0; j < height; j += t) {
for (k = 0; k < width; k += t) {
int rate;
int64_t dist;
unsigned int sse;
- (void) cpi->fn_ptr[bs].vf(p->src.buf + j * p->src.stride + k,
- p->src.stride,
- pd->dst.buf + j * pd->dst.stride + k,
- pd->dst.stride, &sse);
+ cpi->fn_ptr[bs].vf(&p->src.buf[j * p->src.stride + k], p->src.stride,
+ &pd->dst.buf[j * pd->dst.stride + k], pd->dst.stride,
+ &sse);
// sse works better than var, since there is no dc prediction used
- model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3,
- &rate, &dist);
+ model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, &rate, &dist);
rate_sum += rate;
dist_sum += dist;
*out_skip &= (rate < 1024);
}
}
+
*out_rate_sum = rate_sum;
- *out_dist_sum = (dist_sum << 4);
+ *out_dist_sum = dist_sum << 4;
}
int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
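[Editor's note] model_rd_for_sb_y_tx above walks the luma plane in t x t tiles (t = 4 << tx_size) and feeds each tile's SSE to the Laplacian rate/distortion model. A sketch of that tiling, with stubs in place of cpi->fn_ptr[bs].vf and model_rd_from_var_lapndz:

#include <stdio.h>

/* Stubs standing in for the variance function and the Laplacian model. */
static unsigned sse_of_tile(int row, int col) { return 100u * (row + col + 1); }
static void model_rd(unsigned sse, int *rate, long *dist) {
  *rate = (int)(sse / 8);   /* toy model, monotone in sse */
  *dist = (long)(sse / 2);
}

int main(void) {
  const int tx_size = 2;            /* TX_16X16 */
  const int t = 4 << tx_size;       /* tile side in pixels */
  const int width = 64, height = 64;
  int rate_sum = 0, skip = 1, j, k;
  long dist_sum = 0;
  for (j = 0; j < height; j += t) {
    for (k = 0; k < width; k += t) {
      int rate;
      long dist;
      model_rd(sse_of_tile(j, k), &rate, &dist);
      rate_sum += rate;
      dist_sum += dist;
      skip &= rate < 1024;          /* tile cheap enough to skip coding? */
    }
  }
  printf("rate=%d dist=%ld skip=%d\n", rate_sum, dist_sum << 4, skip);
  return 0;
}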
@@ -526,42 +480,39 @@ static const int16_t band_counts[TX_SIZES][8] = {
};
static INLINE int cost_coeffs(MACROBLOCK *mb,
- int plane, int block, PLANE_TYPE type,
+ int plane, int block,
ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
TX_SIZE tx_size,
const int16_t *scan, const int16_t *nb) {
MACROBLOCKD *const xd = &mb->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
- int pt, c, cost;
+ MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const PLANE_TYPE type = pd->plane_type;
const int16_t *band_count = &band_counts[tx_size][1];
- const int eob = xd->plane[plane].eobs[block];
- const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
+ const int eob = pd->eobs[block];
+ const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
- unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS]
- [MAX_ENTROPY_TOKENS] = mb->token_costs[tx_size][type][ref];
- ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
+ unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
+ mb->token_costs[tx_size][type][ref];
+ const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
uint8_t token_cache[1024];
+ int pt = combine_entropy_contexts(above_ec, left_ec);
+ int c, cost;
// Check for consistency of tx_size with mode info
- assert((!type && !plane) || (type && plane));
- if (type == PLANE_TYPE_Y_WITH_DC) {
- assert(xd->mode_info_context->mbmi.txfm_size == tx_size);
- } else {
- assert(tx_size == get_uv_tx_size(mbmi));
- }
-
- pt = combine_entropy_contexts(above_ec, left_ec);
+ assert(type == PLANE_TYPE_Y_WITH_DC ? mbmi->tx_size == tx_size
+ : get_uv_tx_size(mbmi) == tx_size);
if (eob == 0) {
// single eob token
cost = token_costs[0][0][pt][DCT_EOB_TOKEN];
c = 0;
} else {
- int v, prev_t, band_left = *band_count++;
+ int band_left = *band_count++;
// dc token
- v = qcoeff_ptr[0];
- prev_t = vp9_dct_value_tokens_ptr[v].token;
+ int v = qcoeff_ptr[0];
+ int prev_t = vp9_dct_value_tokens_ptr[v].token;
cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
token_cache[0] = vp9_pt_energy_class[prev_t];
++token_costs;
@@ -591,13 +542,12 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
}
// is eob first coefficient;
- *A = *L = c > 0;
+ *A = *L = (c > 0);
return cost;
}
struct rdcost_block_args {
- VP9_COMMON *cm;
MACROBLOCK *x;
ENTROPY_CONTEXT t_above[16];
ENTROPY_CONTEXT t_left[16];
@@ -612,23 +562,23 @@ struct rdcost_block_args {
const int16_t *scan, *nb;
};
-static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg) {
+static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) {
+ const int ss_txfrm_size = tx_size << 1;
struct rdcost_block_args* args = arg;
MACROBLOCK* const x = args->x;
MACROBLOCKD* const xd = &x->e_mbd;
- struct macroblock_plane *const p = &x->plane[0];
- struct macroblockd_plane *const pd = &xd->plane[0];
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
int64_t this_sse;
int shift = args->tx_size == TX_32X32 ? 0 : 2;
- int16_t *const coeff = BLOCK_OFFSET(p->coeff, block, 16);
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
+ int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
&this_sse) >> shift;
args->sse += this_sse >> shift;
if (x->skip_encode &&
- xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) {
+ xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
// TODO(jingning): tune the model to better capture the distortion.
int64_t p = (pd->dequant[1] * pd->dequant[1] *
(1 << ss_txfrm_size)) >> shift;
@@ -637,119 +587,25 @@ static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
}
}
-static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg) {
+static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
struct rdcost_block_args* args = arg;
- int x_idx, y_idx;
- MACROBLOCKD * const xd = &args->x->e_mbd;
- txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx,
- &y_idx);
+ int x_idx, y_idx;
+ txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx);
args->rate += cost_coeffs(args->x, plane, block,
- xd->plane[plane].plane_type, args->t_above + x_idx,
+ args->t_above + x_idx,
args->t_left + y_idx, args->tx_size,
args->scan, args->nb);
}
-// FIXME(jingning): need to make the rd test of chroma components consistent
-// with that of luma component. this function should be deprecated afterwards.
-static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane,
- BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
- MACROBLOCKD * const xd = &x->e_mbd;
- const int bwl = plane_block_width_log2by4(bsize, &xd->plane[plane]);
- const int bhl = plane_block_height_log2by4(bsize, &xd->plane[plane]);
- const int bw = 1 << bwl, bh = 1 << bhl;
- int i;
- struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,
- 0, 0, 0, INT64_MAX, 0 };
-
- switch (tx_size) {
- case TX_4X4:
- vpx_memcpy(&args.t_above, xd->plane[plane].above_context,
- sizeof(ENTROPY_CONTEXT) * bw);
- vpx_memcpy(&args.t_left, xd->plane[plane].left_context,
- sizeof(ENTROPY_CONTEXT) * bh);
- args.scan = vp9_default_scan_4x4;
- args.nb = vp9_default_scan_4x4_neighbors;
- break;
- case TX_8X8:
- for (i = 0; i < bw; i += 2)
- args.t_above[i] = !!*(uint16_t *)&xd->plane[plane].above_context[i];
- for (i = 0; i < bh; i += 2)
- args.t_left[i] = !!*(uint16_t *)&xd->plane[plane].left_context[i];
- args.scan = vp9_default_scan_8x8;
- args.nb = vp9_default_scan_8x8_neighbors;
- break;
- case TX_16X16:
- for (i = 0; i < bw; i += 4)
- args.t_above[i] = !!*(uint32_t *)&xd->plane[plane].above_context[i];
- for (i = 0; i < bh; i += 4)
- args.t_left[i] = !!*(uint32_t *)&xd->plane[plane].left_context[i];
- args.scan = vp9_default_scan_16x16;
- args.nb = vp9_default_scan_16x16_neighbors;
- break;
- case TX_32X32:
- for (i = 0; i < bw; i += 8)
- args.t_above[i] = !!*(uint64_t *)&xd->plane[plane].above_context[i];
- for (i = 0; i < bh; i += 8)
- args.t_left[i] = !!*(uint64_t *)&xd->plane[plane].left_context[i];
- args.scan = vp9_default_scan_32x32;
- args.nb = vp9_default_scan_32x32_neighbors;
- break;
- default:
- assert(0);
- }
-
- foreach_transformed_block_in_plane(xd, bsize, plane, rate_block, &args);
- return args.rate;
-}
-
-static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
- int cost = 0, plane;
-
- for (plane = 1; plane < MAX_MB_PLANE; plane++) {
- cost += rdcost_plane(cm, x, plane, bsize, tx_size);
- }
- return cost;
-}
-
-static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
- int shift, int64_t *sse) {
- struct macroblockd_plane *p = &x->e_mbd.plane[0];
- const int bwl = plane_block_width_log2by4(bsize, p);
- const int bhl = plane_block_height_log2by4(bsize, p);
- int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
- 16 << (bwl + bhl), sse) >> shift;
- *sse >>= shift;
- return e;
-}
-
-static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
- int shift, int64_t *sse) {
- int64_t sum = 0, this_sse;
- int plane;
-
- *sse = 0;
- for (plane = 1; plane < MAX_MB_PLANE; plane++) {
- struct macroblockd_plane *p = &x->e_mbd.plane[plane];
- const int bwl = plane_block_width_log2by4(bsize, p);
- const int bhl = plane_block_height_log2by4(bsize, p);
- sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
- 16 << (bwl + bhl), &this_sse);
- *sse += this_sse;
- }
- *sse >>= shift;
- return sum >> shift;
-}
-
-static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg) {
+static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
struct rdcost_block_args *args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
- struct encode_b_args encode_args = {args->cm, x, NULL};
+ struct encode_b_args encode_args = {x, NULL};
int64_t rd1, rd2, rd;
if (args->skip)
@@ -765,58 +621,61 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize,
return;
}
- if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
- encode_block_intra(plane, block, bsize, ss_txfrm_size, &encode_args);
+ if (!is_inter_block(&xd->this_mi->mbmi))
+ vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &encode_args);
else
- xform_quant(plane, block, bsize, ss_txfrm_size, &encode_args);
+ vp9_xform_quant(plane, block, plane_bsize, tx_size, &encode_args);
- dist_block(plane, block, bsize, ss_txfrm_size, args);
- rate_block(plane, block, bsize, ss_txfrm_size, args);
+ dist_block(plane, block, tx_size, args);
+ rate_block(plane, block, plane_bsize, tx_size, args);
}
-static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
- int *rate, int64_t *distortion,
- int *skippable, int64_t *sse,
- int64_t ref_best_rd,
- BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+static void txfm_rd_in_plane(MACROBLOCK *x,
+ int *rate, int64_t *distortion,
+ int *skippable, int64_t *sse,
+ int64_t ref_best_rd, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
- struct macroblockd_plane *const pd = &xd->plane[0];
- const int bwl = plane_block_width_log2by4(bsize, pd);
- const int bhl = plane_block_height_log2by4(bsize, pd);
- const int bw = 1 << bwl, bh = 1 << bhl;
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs];
int i;
- struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,
+ struct rdcost_block_args args = { x, { 0 }, { 0 }, tx_size,
+ num_4x4_blocks_wide, num_4x4_blocks_high,
0, 0, 0, ref_best_rd, 0 };
- xd->mode_info_context->mbmi.txfm_size = tx_size;
+ if (plane == 0)
+ xd->this_mi->mbmi.tx_size = tx_size;
+
switch (tx_size) {
case TX_4X4:
vpx_memcpy(&args.t_above, pd->above_context,
- sizeof(ENTROPY_CONTEXT) * bw);
+ sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide);
vpx_memcpy(&args.t_left, pd->left_context,
- sizeof(ENTROPY_CONTEXT) * bh);
- get_scan_nb_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, 0),
+ sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high);
+ get_scan_nb_4x4(get_tx_type_4x4(pd->plane_type, xd, 0),
&args.scan, &args.nb);
break;
case TX_8X8:
- for (i = 0; i < bw; i += 2)
+ for (i = 0; i < num_4x4_blocks_wide; i += 2)
args.t_above[i] = !!*(uint16_t *)&pd->above_context[i];
- for (i = 0; i < bh; i += 2)
+ for (i = 0; i < num_4x4_blocks_high; i += 2)
args.t_left[i] = !!*(uint16_t *)&pd->left_context[i];
- get_scan_nb_8x8(get_tx_type_8x8(PLANE_TYPE_Y_WITH_DC, xd),
+ get_scan_nb_8x8(get_tx_type_8x8(pd->plane_type, xd),
&args.scan, &args.nb);
break;
case TX_16X16:
- for (i = 0; i < bw; i += 4)
+ for (i = 0; i < num_4x4_blocks_wide; i += 4)
args.t_above[i] = !!*(uint32_t *)&pd->above_context[i];
- for (i = 0; i < bh; i += 4)
+ for (i = 0; i < num_4x4_blocks_high; i += 4)
args.t_left[i] = !!*(uint32_t *)&pd->left_context[i];
- get_scan_nb_16x16(get_tx_type_16x16(PLANE_TYPE_Y_WITH_DC, xd),
+ get_scan_nb_16x16(get_tx_type_16x16(pd->plane_type, xd),
&args.scan, &args.nb);
break;
case TX_32X32:
- for (i = 0; i < bw; i += 8)
+ for (i = 0; i < num_4x4_blocks_wide; i += 8)
args.t_above[i] = !!*(uint64_t *)&pd->above_context[i];
- for (i = 0; i < bh; i += 8)
+ for (i = 0; i < num_4x4_blocks_high; i += 8)
args.t_left[i] = !!*(uint64_t *)&pd->left_context[i];
args.scan = vp9_default_scan_32x32;
args.nb = vp9_default_scan_32x32_neighbors;
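[Editor's note] The t_above/t_left setup above collapses per-4x4 entropy contexts to one flag per transform block: for an NxN transform spanning k = N/4 4x4 columns, the k context bytes are read as one k-byte integer and reduced with !!. A sketch of the TX_8X8 case, using memcpy in place of the type-punned cast:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  const uint8_t above_context[8] = {0, 1, 0, 0, 0, 0, 1, 1};
  uint8_t t_above[8] = {0};
  int i;
  for (i = 0; i < 8; i += 2) {      /* TX_8X8: two 4x4 contexts per block */
    uint16_t pair;
    memcpy(&pair, &above_context[i], sizeof(pair));
    t_above[i] = !!pair;            /* any nonzero context in the pair? */
    printf("t_above[%d] = %d\n", i, t_above[i]);
  }
  return 0;
}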
@@ -825,40 +684,39 @@ static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
assert(0);
}
- foreach_transformed_block_in_plane(xd, bsize, 0, block_yrd_txfm, &args);
+ foreach_transformed_block_in_plane(xd, bsize, plane, block_yrd_txfm, &args);
*distortion = args.dist;
*rate = args.rate;
*sse = args.sse;
- *skippable = vp9_sby_is_skippable(xd, bsize) && (!args.skip);
+ *skippable = vp9_is_skippable_in_plane(xd, bsize, plane) && (!args.skip);
}
static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
int *rate, int64_t *distortion,
int *skip, int64_t *sse,
int64_t ref_best_rd,
- BLOCK_SIZE_TYPE bs) {
- const TX_SIZE max_txfm_size = TX_32X32
- - (bs < BLOCK_32X32) - (bs < BLOCK_16X16);
+ BLOCK_SIZE bs) {
+ const TX_SIZE max_txfm_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
if (max_txfm_size == TX_32X32 &&
(cm->tx_mode == ALLOW_32X32 ||
cm->tx_mode == TX_MODE_SELECT)) {
- mbmi->txfm_size = TX_32X32;
+ mbmi->tx_size = TX_32X32;
} else if (max_txfm_size >= TX_16X16 &&
(cm->tx_mode == ALLOW_16X16 ||
cm->tx_mode == ALLOW_32X32 ||
cm->tx_mode == TX_MODE_SELECT)) {
- mbmi->txfm_size = TX_16X16;
+ mbmi->tx_size = TX_16X16;
} else if (cm->tx_mode != ONLY_4X4) {
- mbmi->txfm_size = TX_8X8;
+ mbmi->tx_size = TX_8X8;
} else {
- mbmi->txfm_size = TX_4X4;
+ mbmi->tx_size = TX_4X4;
}
- super_block_yrd_for_txfm(cm, x, rate, distortion, skip,
- &sse[mbmi->txfm_size], ref_best_rd, bs,
- mbmi->txfm_size);
+ txfm_rd_in_plane(x, rate, distortion, skip,
+ &sse[mbmi->tx_size], ref_best_rd, 0, bs,
+ mbmi->tx_size);
cpi->txfm_stepdown_count[0]++;
}
@@ -867,18 +725,17 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
int64_t *d, int64_t *distortion,
int *s, int *skip,
int64_t tx_cache[TX_MODES],
- BLOCK_SIZE_TYPE bs) {
- const TX_SIZE max_tx_size = TX_32X32
- - (bs < BLOCK_32X32) - (bs < BLOCK_16X16);
+ BLOCK_SIZE bs) {
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
int64_t rd[TX_SIZES][2];
int n, m;
int s0, s1;
- const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs);
+ const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->this_mi);
for (n = TX_4X4; n <= max_tx_size; n++) {
r[n][1] = r[n][0];
@@ -914,26 +771,26 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
(cm->tx_mode == TX_MODE_SELECT &&
rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
rd[TX_32X32][1] < rd[TX_4X4][1]))) {
- mbmi->txfm_size = TX_32X32;
+ mbmi->tx_size = TX_32X32;
} else if (max_tx_size >= TX_16X16 &&
(cm->tx_mode == ALLOW_16X16 ||
cm->tx_mode == ALLOW_32X32 ||
(cm->tx_mode == TX_MODE_SELECT &&
rd[TX_16X16][1] < rd[TX_8X8][1] &&
rd[TX_16X16][1] < rd[TX_4X4][1]))) {
- mbmi->txfm_size = TX_16X16;
+ mbmi->tx_size = TX_16X16;
} else if (cm->tx_mode == ALLOW_8X8 ||
cm->tx_mode == ALLOW_16X16 ||
cm->tx_mode == ALLOW_32X32 ||
(cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
- mbmi->txfm_size = TX_8X8;
+ mbmi->tx_size = TX_8X8;
} else {
- mbmi->txfm_size = TX_4X4;
+ mbmi->tx_size = TX_4X4;
}
- *distortion = d[mbmi->txfm_size];
- *rate = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT];
- *skip = s[mbmi->txfm_size];
+ *distortion = d[mbmi->tx_size];
+ *rate = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
+ *skip = s[mbmi->tx_size];
tx_cache[ONLY_4X4] = rd[TX_4X4][0];
tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
@@ -971,13 +828,11 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
int64_t *d, int64_t *distortion,
int *s, int *skip, int64_t *sse,
int64_t ref_best_rd,
- BLOCK_SIZE_TYPE bs,
- int *model_used) {
- const TX_SIZE max_txfm_size = TX_32X32
- - (bs < BLOCK_32X32) - (bs < BLOCK_16X16);
+ BLOCK_SIZE bs) {
+ const TX_SIZE max_txfm_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
int64_t rd[TX_SIZES][2];
int n, m;
@@ -985,7 +840,7 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
// double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00};
- const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs);
+ const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->this_mi);
// for (n = TX_4X4; n <= max_txfm_size; n++)
// r[n][0] = (r[n][0] * scale_r[n]);
@@ -1023,35 +878,28 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
rd[TX_32X32][1] <= rd[TX_16X16][1] &&
rd[TX_32X32][1] <= rd[TX_8X8][1] &&
rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
- mbmi->txfm_size = TX_32X32;
+ mbmi->tx_size = TX_32X32;
} else if (max_txfm_size >= TX_16X16 &&
(cm->tx_mode == ALLOW_16X16 ||
cm->tx_mode == ALLOW_32X32 ||
(cm->tx_mode == TX_MODE_SELECT &&
rd[TX_16X16][1] <= rd[TX_8X8][1] &&
rd[TX_16X16][1] <= rd[TX_4X4][1]))) {
- mbmi->txfm_size = TX_16X16;
+ mbmi->tx_size = TX_16X16;
} else if (cm->tx_mode == ALLOW_8X8 ||
cm->tx_mode == ALLOW_16X16 ||
cm->tx_mode == ALLOW_32X32 ||
(cm->tx_mode == TX_MODE_SELECT &&
rd[TX_8X8][1] <= rd[TX_4X4][1])) {
- mbmi->txfm_size = TX_8X8;
+ mbmi->tx_size = TX_8X8;
} else {
- mbmi->txfm_size = TX_4X4;
+ mbmi->tx_size = TX_4X4;
}
- if (model_used[mbmi->txfm_size]) {
- // Actually encode using the chosen mode if a model was used, but do not
- // update the r, d costs
- super_block_yrd_for_txfm(cm, x, rate, distortion, skip,
- &sse[mbmi->txfm_size], ref_best_rd,
- bs, mbmi->txfm_size);
- } else {
- *distortion = d[mbmi->txfm_size];
- *rate = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT];
- *skip = s[mbmi->txfm_size];
- }
+  // Actually encode using the chosen mode, but do not
+  // update the r, d costs
+ txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size],
+ ref_best_rd, 0, bs, mbmi->tx_size);
if (max_txfm_size == TX_32X32 &&
rd[TX_32X32][1] <= rd[TX_16X16][1] &&
@@ -1071,14 +919,13 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
static void super_block_yrd(VP9_COMP *cpi,
MACROBLOCK *x, int *rate, int64_t *distortion,
- int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs,
+ int *skip, int64_t *psse, BLOCK_SIZE bs,
int64_t txfm_cache[TX_MODES],
int64_t ref_best_rd) {
- VP9_COMMON *const cm = &cpi->common;
int r[TX_SIZES][2], s[TX_SIZES];
int64_t d[TX_SIZES], sse[TX_SIZES];
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
assert(bs == mbmi->sb_type);
if (mbmi->ref_frame[0] > INTRA_FRAME)
@@ -1091,65 +938,43 @@ static void super_block_yrd(VP9_COMP *cpi,
choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
ref_best_rd, bs);
if (psse)
- *psse = sse[mbmi->txfm_size];
+ *psse = sse[mbmi->tx_size];
return;
}
if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
mbmi->ref_frame[0] > INTRA_FRAME) {
- int model_used[TX_SIZES] = {1, 1, 1, 1};
- if (bs >= BLOCK_32X32) {
- if (model_used[TX_32X32])
- model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
- &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
- else
- super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32],
- &s[TX_32X32], &sse[TX_32X32], INT64_MAX,
- bs, TX_32X32);
- }
- if (bs >= BLOCK_16X16) {
- if (model_used[TX_16X16])
- model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd,
- &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
- else
- super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16],
- &s[TX_16X16], &sse[TX_16X16], INT64_MAX,
- bs, TX_16X16);
- }
- if (model_used[TX_8X8])
- model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd,
- &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
- else
- super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
- &sse[TX_8X8], INT64_MAX, bs, TX_8X8);
+ if (bs >= BLOCK_32X32)
+ model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
+ &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
+ if (bs >= BLOCK_16X16)
+ model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd,
+ &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
- if (model_used[TX_4X4])
- model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd,
- &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
- else
- super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
- &sse[TX_4X4], INT64_MAX, bs, TX_4X4);
+ model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd,
+ &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
+
+ model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd,
+ &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
- skip, sse, ref_best_rd, bs, model_used);
+ skip, sse, ref_best_rd, bs);
} else {
if (bs >= BLOCK_32X32)
- super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32],
- &s[TX_32X32], &sse[TX_32X32], ref_best_rd,
- bs, TX_32X32);
+ txfm_rd_in_plane(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
+ &sse[TX_32X32], ref_best_rd, 0, bs, TX_32X32);
if (bs >= BLOCK_16X16)
- super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16],
- &s[TX_16X16], &sse[TX_16X16], ref_best_rd,
- bs, TX_16X16);
- super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
- &sse[TX_8X8], ref_best_rd, bs, TX_8X8);
- super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
- &sse[TX_4X4], ref_best_rd, bs, TX_4X4);
+ txfm_rd_in_plane(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
+ &sse[TX_16X16], ref_best_rd, 0, bs, TX_16X16);
+ txfm_rd_in_plane(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+ &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8);
+ txfm_rd_in_plane(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+ &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4);
choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
skip, txfm_cache, bs);
}
if (psse)
- *psse = sse[mbmi->txfm_size];
+ *psse = sse[mbmi->tx_size];
}
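
The choose_txfm_size_* helpers above compare the per-size r[]/d[] pairs with the RDCOST macro. A minimal, self-contained sketch of that comparison; the macro body follows the usual libvpx weighting of rate by rdmult and distortion by rddiv, but treat the exact rounding as an assumption:

    #include <stdint.h>

    /* Sketch of the rate-distortion cost used throughout this file:
     * rate weighted by the Lagrange multiplier rdmult, distortion
     * scaled by rddiv. The rounding term is an assumption. */
    #define RDCOST_SKETCH(RM, DM, R, D) \
      ((((int64_t)(R) * (RM) + 128) >> 8) + ((int64_t)(D) << (DM)))

    /* Pick the transform size index with the lowest rd cost. */
    static int pick_tx_size(const int rate[], const int64_t dist[],
                            int n, int rdmult, int rddiv) {
      int i, best = 0;
      int64_t best_rd = RDCOST_SKETCH(rdmult, rddiv, rate[0], dist[0]);
      for (i = 1; i < n; ++i) {
        const int64_t rd = RDCOST_SKETCH(rdmult, rddiv, rate[i], dist[i]);
        if (rd < best_rd) {
          best_rd = rd;
          best = i;
        }
      }
      return best;
    }
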
static int conditional_skipintra(MB_PREDICTION_MODE mode,
@@ -1162,7 +987,7 @@ static int conditional_skipintra(MB_PREDICTION_MODE mode,
best_intra_mode != V_PRED &&
best_intra_mode != D45_PRED)
return 1;
- if (mode == D27_PRED &&
+ if (mode == D207_PRED &&
best_intra_mode != H_PRED &&
best_intra_mode != D45_PRED)
return 1;
@@ -1179,8 +1004,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
int *bestrate, int *bestratey,
int64_t *bestdistortion,
- BLOCK_SIZE_TYPE bsize,
- int64_t rd_thresh) {
+ BLOCK_SIZE bsize, int64_t rd_thresh) {
MB_PREDICTION_MODE mode;
MACROBLOCKD *xd = &x->e_mbd;
int64_t best_rd = rd_thresh;
@@ -1190,9 +1014,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
struct macroblockd_plane *pd = &xd->plane[0];
const int src_stride = p->src.stride;
const int dst_stride = pd->dst.stride;
- uint8_t *src_init = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, ib,
+ uint8_t *src_init = raster_block_offset_uint8(BLOCK_8X8, ib,
p->src.buf, src_stride);
- uint8_t *dst_init = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, ib,
+ uint8_t *dst_init = raster_block_offset_uint8(BLOCK_8X8, ib,
pd->dst.buf, dst_stride);
int16_t *src_diff, *coeff;
@@ -1208,11 +1032,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
vpx_memcpy(ta, a, sizeof(ta));
vpx_memcpy(tl, l, sizeof(tl));
- xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+ xd->this_mi->mbmi.tx_size = TX_4X4;
for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
int64_t this_rd;
int ratey = 0;
+
+ if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
+ continue;
+
// Only do the oblique modes if the best so far is
// one of the neighboring directional modes
if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
@@ -1234,10 +1062,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
block = ib + idy * 2 + idx;
- xd->mode_info_context->bmi[block].as_mode = mode;
- src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block,
- p->src_diff);
- coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16);
+ xd->this_mi->bmi[block].as_mode = mode;
+ src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
+ coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
vp9_predict_intra_block(xd, block, 1,
TX_4X4, mode,
x->skip_encode ? src : dst,
@@ -1257,20 +1084,19 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
}
scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block));
- ratey += cost_coeffs(x, 0, block, PLANE_TYPE_Y_WITH_DC,
+ ratey += cost_coeffs(x, 0, block,
tempa + idx, templ + idy, TX_4X4, scan,
vp9_get_coef_neighbors_handle(scan));
- distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff,
- block, 16),
+ distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
16, &ssz) >> 2;
if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
goto next;
if (tx_type != DCT_DCT)
- vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
+ vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
dst, pd->dst.stride, tx_type);
else
- xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
+ xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
dst, pd->dst.stride);
}
}
@@ -1312,7 +1138,10 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi,
int64_t best_rd) {
int i, j;
MACROBLOCKD *const xd = &mb->e_mbd;
- BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+ MODE_INFO *const mic = xd->this_mi;
+ const MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
+ const MODE_INFO *left_mi = xd->mi_8x8[-1];
+ const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
int idx, idy;
@@ -1322,7 +1151,6 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi,
int64_t total_rd = 0;
ENTROPY_CONTEXT t_above[4], t_left[4];
int *bmode_costs;
- MODE_INFO *const mic = xd->mode_info_context;
vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
@@ -1332,24 +1160,22 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi,
// Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
- const int mis = xd->mode_info_stride;
- MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
- int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry);
- int64_t UNINITIALIZED_IS_SAFE(d), this_rd;
+ MB_PREDICTION_MODE best_mode = DC_PRED;
+ int r = INT_MAX, ry = INT_MAX;
+ int64_t d = INT64_MAX, this_rd = INT64_MAX;
i = idy * 2 + idx;
-
if (cpi->common.frame_type == KEY_FRAME) {
- const MB_PREDICTION_MODE A = above_block_mode(mic, i, mis);
+ const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, i);
const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
- left_block_mode(mic, i) : DC_PRED;
+ left_block_mode(mic, left_mi, i) :
+ DC_PRED;
bmode_costs = mb->y_mode_costs[A][L];
}
this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
- t_above + idx, t_left + idy,
- &r, &ry, &d, bsize,
- best_rd - total_rd);
+ t_above + idx, t_left + idy, &r, &ry, &d,
+ bsize, best_rd - total_rd);
if (this_rd >= best_rd - total_rd)
return INT64_MAX;
@@ -1372,7 +1198,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi,
*rate = cost;
*rate_y = tot_rate_y;
*distortion = total_distortion;
- xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode;
+ mic->mbmi.mode = mic->bmi[3].as_mode;
return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
}
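
The idy/idx loops in rd_pick_intra_sub_8x8_y_mode step by the sub-block dimensions, so a 4x4 partition visits four sub-blocks while 4x8 and 8x4 each visit two. The traversal in isolation (the visit callback is a hypothetical stand-in for the per-block mode search):

    /* Sketch: visit each sub-block of an 8x8 exactly once. num_4x4_w/h
     * are the sub-block width/height in 4x4 units (1 or 2). */
    static void foreach_sub_block(int num_4x4_w, int num_4x4_h,
                                  void (*visit)(int raster_index)) {
      int idx, idy;
      for (idy = 0; idy < 2; idy += num_4x4_h)
        for (idx = 0; idx < 2; idx += num_4x4_w)
          visit(idy * 2 + idx);  /* raster order within the 8x8 */
    }
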
@@ -1380,15 +1206,16 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi,
static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
int64_t *distortion, int *skippable,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
int64_t tx_cache[TX_MODES],
int64_t best_rd) {
MB_PREDICTION_MODE mode;
- MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+ MB_PREDICTION_MODE mode_selected = DC_PRED;
MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mic = xd->this_mi;
int this_rate, this_rate_tokenonly, s;
int64_t this_distortion, this_rd;
- TX_SIZE UNINITIALIZED_IS_SAFE(best_tx);
+ TX_SIZE best_tx = TX_4X4;
int i;
int *bmode_costs = x->mbmode_cost;
@@ -1399,17 +1226,20 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
/* Y Search for intra prediction mode */
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
int64_t local_tx_cache[TX_MODES];
- MODE_INFO *const mic = xd->mode_info_context;
- const int mis = xd->mode_info_stride;
+ MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
+ MODE_INFO *left_mi = xd->mi_8x8[-1];
+
+ if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
+ continue;
if (cpi->common.frame_type == KEY_FRAME) {
- const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis);
+ const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, 0);
const MB_PREDICTION_MODE L = xd->left_available ?
- left_block_mode(mic, 0) : DC_PRED;
+ left_block_mode(mic, left_mi, 0) : DC_PRED;
bmode_costs = x->y_mode_costs[A][L];
}
- x->e_mbd.mode_info_context->mbmi.mode = mode;
+ mic->mbmi.mode = mode;
super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
bsize, local_tx_cache, best_rd);
@@ -1423,7 +1253,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (this_rd < best_rd) {
mode_selected = mode;
best_rd = this_rd;
- best_tx = x->e_mbd.mode_info_context->mbmi.txfm_size;
+ best_tx = mic->mbmi.tx_size;
*rate = this_rate;
*rate_tokenonly = this_rate_tokenonly;
*distortion = this_distortion;
@@ -1431,7 +1261,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
- for (i = 0; i < TX_MODES; i++) {
+ for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) {
const int64_t adj_rd = this_rd + local_tx_cache[i] -
local_tx_cache[cpi->common.tx_mode];
if (adj_rd < tx_cache[i]) {
@@ -1441,61 +1271,78 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
- x->e_mbd.mode_info_context->mbmi.txfm_size = best_tx;
+ mic->mbmi.mode = mode_selected;
+ mic->mbmi.tx_size = best_tx;
return best_rd;
}
-static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
- int *rate, int64_t *distortion,
- int *skippable, int64_t *sse,
- BLOCK_SIZE_TYPE bsize,
- TX_SIZE uv_tx_size) {
- MACROBLOCKD *const xd = &x->e_mbd;
- int64_t dummy;
- if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
- vp9_encode_intra_block_uv(cm, x, bsize);
- else
- vp9_xform_quant_sbuv(cm, x, bsize);
-
- *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2,
- sse ? sse : &dummy);
- *rate = rdcost_uv(cm, x, bsize, uv_tx_size);
- *skippable = vp9_sbuv_is_skippable(xd, bsize);
-}
-
static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
int *rate, int64_t *distortion, int *skippable,
- int64_t *sse, BLOCK_SIZE_TYPE bsize) {
+ int64_t *sse, BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
+ int plane;
+ int pnrate = 0, pnskip = 1;
+ int64_t pndist = 0, pnsse = 0;
- if (mbmi->ref_frame[0] > INTRA_FRAME)
+ if (ref_best_rd < 0)
+ goto term;
+
+ if (is_inter_block(mbmi))
vp9_subtract_sbuv(x, bsize);
- super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, sse, bsize,
- uv_txfm_size);
+ *rate = 0;
+ *distortion = 0;
+ *sse = 0;
+ *skippable = 1;
+
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
+ ref_best_rd, plane, bsize, uv_txfm_size);
+ if (pnrate == INT_MAX)
+ goto term;
+ *rate += pnrate;
+ *distortion += pndist;
+ *sse += pnsse;
+ *skippable &= pnskip;
+ }
+ return;
+
+ term:
+ *rate = INT_MAX;
+ *distortion = INT64_MAX;
+ *sse = INT64_MAX;
+ *skippable = 0;
+ return;
}
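
super_block_uvrd now accumulates per-plane results and bails out through the term label as soon as any plane blows the rd budget. The same accumulate-or-abort pattern, stripped of the codec context (a sketch, not the library API):

    #include <limits.h>
    #include <stdint.h>

    /* Sketch: sum per-plane rate/distortion; INT_MAX in any plane's
     * rate marks failure, signalled back with sentinel values. */
    static int sum_planes(const int pnrate[], const int64_t pndist[],
                          int num_planes, int *rate, int64_t *dist) {
      int plane;
      *rate = 0;
      *dist = 0;
      for (plane = 0; plane < num_planes; ++plane) {
        if (pnrate[plane] == INT_MAX) {
          *rate = INT_MAX;       /* caller treats this mode as unusable */
          *dist = INT64_MAX;
          return 0;
        }
        *rate += pnrate[plane];
        *dist += pndist[plane];
      }
      return 1;
    }
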
static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
int64_t *distortion, int *skippable,
- BLOCK_SIZE_TYPE bsize) {
+ BLOCK_SIZE bsize) {
MB_PREDICTION_MODE mode;
- MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+ MB_PREDICTION_MODE mode_selected = DC_PRED;
int64_t best_rd = INT64_MAX, this_rd;
int this_rate_tokenonly, this_rate, s;
- int64_t this_distortion;
+ int64_t this_distortion, this_sse;
+
+ // int mode_mask = (bsize <= BLOCK_8X8)
+ // ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask;
+
+ for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ // if (!(mode_mask & (1 << mode)))
+ if (!(cpi->sf.intra_uv_mode_mask & (1 << mode)))
+ continue;
- MB_PREDICTION_MODE last_mode = bsize <= BLOCK_SIZE_SB8X8 ?
- TM_PRED : cpi->sf.last_chroma_intra_mode;
+ x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode;
- for (mode = DC_PRED; mode <= last_mode; mode++) {
- x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
- &this_distortion, &s, NULL, bsize);
+ &this_distortion, &s, &this_sse, bsize, best_rd);
+ if (this_rate_tokenonly == INT_MAX)
+ continue;
this_rate = this_rate_tokenonly +
x->intra_uv_mode_cost[cpi->common.frame_type][mode];
this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@@ -1510,7 +1357,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
+ x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected;
return best_rd;
}
@@ -1518,12 +1365,13 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
int64_t *distortion, int *skippable,
- BLOCK_SIZE_TYPE bsize) {
+ BLOCK_SIZE bsize) {
int64_t this_rd;
+ int64_t this_sse;
- x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED;
super_block_uvrd(&cpi->common, x, rate_tokenonly,
- distortion, skippable, NULL, bsize);
+ distortion, skippable, &this_sse, bsize, INT64_MAX);
*rate = *rate_tokenonly +
x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED];
this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
@@ -1531,7 +1379,7 @@ static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
return this_rd;
}
-static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize,
int *rate_uv, int *rate_uv_tokenonly,
int64_t *dist_uv, int *skip_uv,
MB_PREDICTION_MODE *mode_uv) {
@@ -1541,27 +1389,25 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
// appropriate speed flag is set.
if (cpi->sf.use_uv_intra_rd_estimate) {
rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
- (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
- bsize);
+ bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
// Else do a proper rd search for each possible transform size that may
// be considered in the main rd loop.
} else {
rd_pick_intra_sbuv_mode(cpi, x,
rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
- (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8
- : bsize);
+ bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
}
- *mode_uv = x->e_mbd.mode_info_context->mbmi.uv_mode;
+ *mode_uv = x->e_mbd.mi_8x8[0]->mbmi.uv_mode;
}
static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
int mode_context) {
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- const int segment_id = xd->mode_info_context->mbmi.segment_id;
+ const int segment_id = xd->this_mi->mbmi.segment_id;
// Don't account for mode here if segment skip is enabled.
- if (!vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) {
+ if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
assert(is_inter_mode(mode));
return x->inter_mode_cost[mode_context][mode - NEARESTMV];
} else {
@@ -1570,18 +1416,18 @@ static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
}
void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
- x->e_mbd.mode_info_context->mbmi.mode = mb;
- x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
+ x->e_mbd.mi_8x8[0]->mbmi.mode = mb;
+ x->e_mbd.mi_8x8[0]->mbmi.mv[0].as_int = mv->as_int;
}
static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
int_mv *frame_mv,
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
int *rate_mv);
static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
int mi_row, int mi_col,
int_mv *tmp_mv, int *rate_mv);
@@ -1594,8 +1440,8 @@ static int labels2mode(MACROBLOCK *x, int i,
int_mv *second_best_ref_mv,
int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *const mic = xd->mode_info_context;
- MB_MODE_INFO * mbmi = &mic->mbmi;
+ MODE_INFO *const mic = xd->this_mi;
+ MB_MODE_INFO *mbmi = &mic->mbmi;
int cost = 0, thismvcost = 0;
int idx, idy;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
@@ -1641,7 +1487,7 @@ static int labels2mode(MACROBLOCK *x, int i,
}
cost = cost_mv_ref(cpi, this_mode,
- mbmi->mb_mode_context[mbmi->ref_frame[0]]);
+ mbmi->mode_context[mbmi->ref_frame[0]]);
mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
if (mbmi->ref_frame[1] > 0)
@@ -1668,42 +1514,32 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
int k;
MACROBLOCKD *xd = &x->e_mbd;
struct macroblockd_plane *const pd = &xd->plane[0];
- MODE_INFO *const mi = xd->mode_info_context;
- const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+ MODE_INFO *const mi = xd->this_mi;
+ const BLOCK_SIZE bsize = mi->mbmi.sb_type;
const int width = plane_block_width(bsize, pd);
const int height = plane_block_height(bsize, pd);
int idx, idy;
const int src_stride = x->plane[0].src.stride;
- uint8_t* const src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+ uint8_t* const src = raster_block_offset_uint8(BLOCK_8X8, i,
x->plane[0].src.buf,
src_stride);
- int16_t* src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
+ int16_t* src_diff = raster_block_offset_int16(BLOCK_8X8, i,
x->plane[0].src_diff);
- int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
- uint8_t* const pre = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
- pd->pre[0].buf,
- pd->pre[0].stride);
- uint8_t* const dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
- pd->dst.buf,
- pd->dst.stride);
+ int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, i);
+ uint8_t* const dst = raster_block_offset_uint8(BLOCK_8X8, i,
+ pd->dst.buf, pd->dst.stride);
int64_t thisdistortion = 0, thissse = 0;
int thisrate = 0;
+ int ref, second_ref = has_second_ref(&mi->mbmi);
- vp9_build_inter_predictor(pre, pd->pre[0].stride,
- dst, pd->dst.stride,
- &mi->bmi[i].as_mv[0].as_mv,
- &xd->scale_factor[0],
- width, height, 0, &xd->subpix, MV_PRECISION_Q3);
-
- if (mi->mbmi.ref_frame[1] > 0) {
- uint8_t* const second_pre =
- raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
- pd->pre[1].buf, pd->pre[1].stride);
- vp9_build_inter_predictor(second_pre, pd->pre[1].stride,
+ for (ref = 0; ref < 1 + second_ref; ++ref) {
+ const uint8_t *pre = raster_block_offset_uint8(BLOCK_8X8, i,
+ pd->pre[ref].buf, pd->pre[ref].stride);
+ vp9_build_inter_predictor(pre, pd->pre[ref].stride,
dst, pd->dst.stride,
- &mi->bmi[i].as_mv[1].as_mv,
- &xd->scale_factor[1],
- width, height, 1, &xd->subpix, MV_PRECISION_Q3);
+ &mi->bmi[i].as_mv[ref].as_mv,
+ &xd->scale_factor[ref],
+ width, height, ref, &xd->subpix, MV_PRECISION_Q3);
}
vp9_subtract_block(height, width, src_diff, 8, src, src_stride,
@@ -1715,15 +1551,15 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
int64_t ssz, rd, rd1, rd2;
k += (idy * 2 + idx);
- src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
+ src_diff = raster_block_offset_int16(BLOCK_8X8, k,
x->plane[0].src_diff);
- coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, k);
+ coeff = BLOCK_OFFSET(x->plane[0].coeff, k);
x->fwd_txm4x4(src_diff, coeff, 16);
x->quantize_b_4x4(x, k, DCT_DCT, 16);
- thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k, 16),
+ thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
16, &ssz);
thissse += ssz;
- thisrate += cost_coeffs(x, 0, k, PLANE_TYPE_Y_WITH_DC,
+ thisrate += cost_coeffs(x, 0, k,
ta + (k & 1),
tl + (k >> 1), TX_4X4,
vp9_default_scan_4x4,
@@ -1764,7 +1600,7 @@ typedef struct {
int64_t sse;
int segment_yrate;
MB_PREDICTION_MODE modes[4];
- SEG_RDSTAT rdstat[4][VP9_INTER_MODES];
+ SEG_RDSTAT rdstat[4][INTER_MODES];
int mvthresh;
} BEST_SEG_INFO;
@@ -1778,26 +1614,23 @@ static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
}
static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
- MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
- x->plane[0].src.buf =
- raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i,
- x->plane[0].src.buf,
- x->plane[0].src.stride);
- assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0);
- x->e_mbd.plane[0].pre[0].buf =
- raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i,
- x->e_mbd.plane[0].pre[0].buf,
- x->e_mbd.plane[0].pre[0].stride);
+ MB_MODE_INFO *const mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
+
+ p->src.buf = raster_block_offset_uint8(BLOCK_8X8, i, p->src.buf,
+ p->src.stride);
+ assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
+ pd->pre[0].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[0].buf,
+ pd->pre[0].stride);
if (mbmi->ref_frame[1])
- x->e_mbd.plane[0].pre[1].buf =
- raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i,
- x->e_mbd.plane[0].pre[1].buf,
- x->e_mbd.plane[0].pre[1].stride);
+ pd->pre[1].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[1].buf,
+ pd->pre[1].stride);
}
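
mi_buf_shift repoints the source and prediction buffers at sub-block i through raster_block_offset_uint8. For BLOCK_8X8 with 4x4 sub-blocks, that offset reduces to plain pointer arithmetic; a sketch under that assumption:

    #include <stdint.h>

    /* Sketch: address 4x4 sub-block i of an 8x8 block. Sub-blocks are
     * raster ordered, so bit 0 of i selects the column, bit 1 the row. */
    static uint8_t *sub_block_ptr(uint8_t *buf, int stride, int i) {
      const int x = 4 * (i & 1);
      const int y = 4 * (i >> 1);
      return buf + y * stride + x;
    }
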
static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
struct buf_2d orig_pre[2]) {
- MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+ MB_MODE_INFO *mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
x->plane[0].src = orig_src;
x->e_mbd.plane[0].pre[0] = orig_pre[0];
if (mbmi->ref_frame[1])
@@ -1811,13 +1644,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
int i, j, br = 0, idx, idy;
int64_t bd = 0, block_sse = 0;
MB_PREDICTION_MODE this_mode;
- MODE_INFO *mi = x->e_mbd.mode_info_context;
+ MODE_INFO *mi = x->e_mbd.mi_8x8[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
const int label_count = 4;
int64_t this_segment_rd = 0;
int label_mv_thresh;
int segmentyrate = 0;
- BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
vp9_variance_fn_ptr_t *v_fn_ptr;
@@ -1874,7 +1707,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 &&
(mbmi->ref_frame[1] <= 0 ||
frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) {
- int rfc = mbmi->mb_mode_context[mbmi->ref_frame[0]];
+ int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
int c1 = cost_mv_ref(cpi, NEARMV, rfc);
int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
@@ -1919,6 +1752,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
int thissme, bestsme = INT_MAX;
int sadpb = x->sadperbit4;
int_mv mvp_full;
+ int max_mv;
        /* Is the best so far sufficiently good that we can't justify doing
         * a new motion search. */
@@ -1928,40 +1762,58 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
if (cpi->compressor_speed) {
// use previous block's result as next block's MV predictor.
if (i > 0) {
- bsi->mvp.as_int =
- x->e_mbd.mode_info_context->bmi[i - 1].as_mv[0].as_int;
+ bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
if (i == 2)
- bsi->mvp.as_int =
- x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int;
+ bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
}
}
+ if (i == 0)
+ max_mv = x->max_mv_context[mbmi->ref_frame[0]];
+ else
+ max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
+
if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
          // Take a weighted average of the step_params based on the last frame's
// max mv magnitude and the best ref mvs of the current block for
// the given reference.
- if (i == 0)
- step_param = (vp9_init_search_range(
- cpi, x->max_mv_context[mbmi->ref_frame[0]]) +
- cpi->mv_step_param) >> 1;
- else
- step_param = (vp9_init_search_range(
- cpi, MAX(abs(bsi->mvp.as_mv.row),
- abs(bsi->mvp.as_mv.col)) >> 3) +
- cpi->mv_step_param) >> 1;
+ step_param = (vp9_init_search_range(cpi, max_mv) +
+ cpi->mv_step_param) >> 1;
} else {
step_param = cpi->mv_step_param;
}
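
With auto_mv_step_size enabled, the step size above is the mean of a range implied by the mv magnitude and the preset step. A sketch of that blend; init_range is a hypothetical stand-in for vp9_init_search_range:

    #include <stdlib.h>

    /* Sketch: blend the mv-magnitude-derived range with the preset
     * step. mvp_row/col are in 1/8-pel units, so >> 3 converts them
     * to full pel before taking the magnitude. */
    static int blended_step_param(int mvp_row, int mvp_col,
                                  int preset_step,
                                  int (*init_range)(int max_mv)) {
      const int r = abs(mvp_row) >> 3, c = abs(mvp_col) >> 3;
      const int max_mv = r > c ? r : c;
      return (init_range(max_mv) + preset_step) >> 1;
    }
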
- further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-
mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
+ if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) {
+ mvp_full.as_mv.row = x->pred_mv[mbmi->ref_frame[0]].as_mv.row >> 3;
+ mvp_full.as_mv.col = x->pred_mv[mbmi->ref_frame[0]].as_mv.col >> 3;
+ step_param = MAX(step_param, 8);
+ }
+
+ further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
// adjust src pointer for this block
mi_buf_shift(x, i);
- bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
- sadpb, further_steps, 0, v_fn_ptr,
- bsi->ref_mv, &mode_mv[NEWMV]);
+ if (cpi->sf.search_method == HEX) {
+ bestsme = vp9_hex_search(x, &mvp_full,
+ step_param,
+ sadpb, 1, v_fn_ptr, 1,
+ bsi->ref_mv, &mode_mv[NEWMV]);
+ } else if (cpi->sf.search_method == SQUARE) {
+ bestsme = vp9_square_search(x, &mvp_full,
+ step_param,
+ sadpb, 1, v_fn_ptr, 1,
+ bsi->ref_mv, &mode_mv[NEWMV]);
+ } else if (cpi->sf.search_method == BIGDIA) {
+ bestsme = vp9_bigdia_search(x, &mvp_full,
+ step_param,
+ sadpb, 1, v_fn_ptr, 1,
+ bsi->ref_mv, &mode_mv[NEWMV]);
+ } else {
+ bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
+ sadpb, further_steps, 0, v_fn_ptr,
+ bsi->ref_mv, &mode_mv[NEWMV]);
+ }
// Should we do a full search (best quality only)
if (cpi->compressor_speed == 0) {
@@ -1976,13 +1828,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
if (thissme < bestsme) {
bestsme = thissme;
- mode_mv[NEWMV].as_int =
- x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int;
+ mode_mv[NEWMV].as_int = mi->bmi[i].as_mv[0].as_int;
} else {
/* The full search result is actually worse so re-instate the
* previous best vector */
- x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int =
- mode_mv[NEWMV].as_int;
+ mi->bmi[i].as_mv[0].as_int = mode_mv[NEWMV].as_int;
}
}
@@ -1991,19 +1841,23 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
unsigned int sse;
cpi->find_fractional_mv_step(x, &mode_mv[NEWMV],
bsi->ref_mv, x->errorperbit, v_fn_ptr,
+ 0, cpi->sf.subpel_iters_per_step,
x->nmvjointcost, x->mvcost,
&distortion, &sse);
- // safe motion search result for use in compound prediction
+ // save motion search result for use in compound prediction
seg_mvs[i][mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int;
}
+ if (cpi->sf.adaptive_motion_search)
+ x->pred_mv[mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int;
+
// restore src pointers
mi_buf_restore(x, orig_src, orig_pre);
}
if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV &&
- mbmi->interp_filter == vp9_switchable_interp[0]) {
+ mbmi->interp_filter == EIGHTTAP) {
if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
continue;
@@ -2114,7 +1968,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
if (best_rd == INT64_MAX) {
int iy, midx;
for (iy = i + 1; iy < 4; ++iy)
- for (midx = 0; midx < VP9_INTER_MODES; ++midx)
+ for (midx = 0; midx < INTER_MODES; ++midx)
bsi->rdstat[iy][midx].brdcost = INT64_MAX;
bsi->segment_rd = INT64_MAX;
return;
@@ -2138,7 +1992,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
if (this_segment_rd > bsi->segment_rd) {
int iy, midx;
for (iy = i + 1; iy < 4; ++iy)
- for (midx = 0; midx < VP9_INTER_MODES; ++midx)
+ for (midx = 0; midx < INTER_MODES; ++midx)
bsi->rdstat[iy][midx].brdcost = INT64_MAX;
bsi->segment_rd = INT64_MAX;
return;
@@ -2182,7 +2036,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
int i;
BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
MACROBLOCKD *xd = &x->e_mbd;
- MODE_INFO *mi = xd->mode_info_context;
+ MODE_INFO *mi = xd->this_mi;
MB_MODE_INFO *mbmi = &mi->mbmi;
int mode_idx;
@@ -2217,7 +2071,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
*returntotrate = bsi->r;
*returndistortion = bsi->d;
*returnyrate = bsi->segment_yrate;
- *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8);
+ *skippable = vp9_is_skippable_in_plane(&x->e_mbd, BLOCK_8X8, 0);
*psse = bsi->sse;
mbmi->mode = bsi->modes[3];
@@ -2226,9 +2080,9 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *ref_y_buffer, int ref_y_stride,
- int ref_frame, BLOCK_SIZE_TYPE block_size ) {
+ int ref_frame, BLOCK_SIZE block_size ) {
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
int_mv this_mv;
int i;
int zero_seen = 0;
@@ -2240,10 +2094,15 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *src_y_ptr = x->plane[0].src.buf;
uint8_t *ref_y_ptr;
int row_offset, col_offset;
+ int num_mv_refs = MAX_MV_REF_CANDIDATES +
+ (cpi->sf.adaptive_motion_search &&
+ cpi->common.show_frame &&
+ block_size < cpi->sf.max_partition_size);
// Get the sad for each candidate reference mv
- for (i = 0; i < MAX_MV_REF_CANDIDATES; i++) {
- this_mv.as_int = mbmi->ref_mvs[ref_frame][i].as_int;
+ for (i = 0; i < num_mv_refs; i++) {
+ this_mv.as_int = (i < MAX_MV_REF_CANDIDATES) ?
+ mbmi->ref_mvs[ref_frame][i].as_int : x->pred_mv[ref_frame].as_int;
max_mv = MAX(max_mv,
MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
@@ -2279,7 +2138,7 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
vp9_prob *comp_mode_p) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- int seg_ref_active = vp9_segfeature_active(&xd->seg, segment_id,
+ int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
SEG_LVL_REF_FRAME);
if (seg_ref_active) {
vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
@@ -2341,14 +2200,14 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int_mv *second_ref_mv,
int64_t comp_pred_diff[NB_PREDICTION_TYPES],
int64_t tx_size_diff[TX_MODES],
- int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]) {
+ int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]) {
MACROBLOCKD *const xd = &x->e_mbd;
// Take a snapshot of the coding context so it can be
// restored if we decide to encode this way
ctx->skip = x->skip;
ctx->best_mode_index = mode_index;
- ctx->mic = *xd->mode_info_context;
+ ctx->mic = *xd->this_mi;
if (partition)
ctx->partition_info = *partition;
@@ -2364,7 +2223,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
// doesn't actually work this way
memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
memcpy(ctx->best_filter_diff, best_filter_diff,
- sizeof(*best_filter_diff) * (VP9_SWITCHABLE_FILTERS + 1));
+ sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1));
}
static void setup_pred_block(const MACROBLOCKD *xd,
@@ -2395,7 +2254,7 @@ static void setup_pred_block(const MACROBLOCKD *xd,
static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
int idx, MV_REFERENCE_FRAME frame_type,
- BLOCK_SIZE_TYPE block_size,
+ BLOCK_SIZE block_size,
int mi_row, int mi_col,
int_mv frame_nearest_mv[MAX_REF_FRAMES],
int_mv frame_near_mv[MAX_REF_FRAMES],
@@ -2404,17 +2263,17 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
VP9_COMMON *cm = &cpi->common;
YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
// set up scaling factors
scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
scale[frame_type].x_offset_q4 =
ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].x_scale_fp,
- VP9_REF_SCALE_SHIFT) & 0xf;
+ REF_SCALE_SHIFT) & 0xf;
scale[frame_type].y_offset_q4 =
ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].y_scale_fp,
- VP9_REF_SCALE_SHIFT) & 0xf;
+ REF_SCALE_SHIFT) & 0xf;
// TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
// use the UV scaling factors.
@@ -2422,11 +2281,10 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
&scale[frame_type], &scale[frame_type]);
// Gets an initial list of candidate vectors from neighbours and orders them
- vp9_find_mv_refs(&cpi->common, xd, xd->mode_info_context,
- xd->prev_mode_info_context,
+ vp9_find_mv_refs(&cpi->common, xd, xd->this_mi,
+ xd->last_mi,
frame_type,
- mbmi->ref_mvs[frame_type],
- cpi->common.ref_frame_sign_bias, mi_row, mi_col);
+ mbmi->ref_mvs[frame_type], mi_row, mi_col);
// Candidate refinement carried out at encoder and decoder
vp9_find_best_ref_mvs(xd,
@@ -2437,8 +2295,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
// Further refinement that is encode side only to test the top few candidates
// in full and choose the best as the centre point for subsequent searches.
// The current implementation doesn't support scaling.
- if (scale[frame_type].x_scale_fp == VP9_REF_NO_SCALE &&
- scale[frame_type].y_scale_fp == VP9_REF_NO_SCALE)
+ if (!vp9_is_scaled(&scale[frame_type]))
mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride,
frame_type, block_size);
}
@@ -2446,27 +2303,27 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
int fb = get_ref_frame_idx(cpi, ref_frame);
- if (cpi->scaled_ref_idx[fb] != cpi->common.ref_frame_map[fb])
- scaled_ref_frame = &cpi->common.yv12_fb[cpi->scaled_ref_idx[fb]];
+ int fb_scale = get_scale_ref_frame_idx(cpi, ref_frame);
+ if (cpi->scaled_ref_idx[fb_scale] != cpi->common.ref_frame_map[fb])
+ scaled_ref_frame = &cpi->common.yv12_fb[cpi->scaled_ref_idx[fb_scale]];
return scaled_ref_frame;
}
-static INLINE int get_switchable_rate(MACROBLOCK *x) {
- MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-
- const int c = vp9_get_pred_context_switchable_interp(xd);
- const int m = vp9_switchable_interp_map[mbmi->interp_filter];
- return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
+static INLINE int get_switchable_rate(const MACROBLOCK *x) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ const int ctx = vp9_get_pred_context_switchable_interp(xd);
+ return SWITCHABLE_INTERP_RATE_FACTOR *
+ x->switchable_interp_costs[ctx][mbmi->interp_filter];
}
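
get_switchable_rate is now a pure context-conditioned table lookup. A sketch; the rate factor of 2 is assumed to match SWITCHABLE_INTERP_RATE_FACTOR rather than restated from this diff:

    /* Sketch: signalling cost of an interp filter, conditioned on the
     * neighbour-derived context. 3 == number of switchable filters. */
    #define RATE_FACTOR_SKETCH 2  /* assumed SWITCHABLE_INTERP_RATE_FACTOR */

    static int switchable_rate(const int costs[][3], int ctx, int filter) {
      return RATE_FACTOR_SKETCH * costs[ctx][filter];
    }
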
static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
int mi_row, int mi_col,
int_mv *tmp_mv, int *rate_mv) {
MACROBLOCKD *xd = &x->e_mbd;
VP9_COMMON *cm = &cpi->common;
- MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
int bestsme = INT_MAX;
int further_steps, step_param;
@@ -2474,7 +2331,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int_mv mvp_full;
int ref = mbmi->ref_frame[0];
int_mv ref_mv = mbmi->ref_mvs[ref][0];
- const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+ const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
int tmp_col_min = x->mv_col_min;
int tmp_col_max = x->mv_col_max;
@@ -2494,7 +2351,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
}
- vp9_clamp_mv_min_max(x, &ref_mv);
+ vp9_clamp_mv_min_max(x, &ref_mv.as_mv);
// Adjust search parameters based on small partitions' result.
if (x->fast_ms) {
@@ -2506,7 +2363,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
step_param = 8;
// Get prediction MV.
- mvp_full.as_int = x->pred_mv.as_int;
+ mvp_full.as_int = x->pred_mv[ref].as_int;
// Adjust MV sign if needed.
if (cm->ref_frame_sign_bias[ref]) {
@@ -2525,21 +2382,49 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
} else {
step_param = cpi->mv_step_param;
}
- // mvp_full.as_int = ref_mv[0].as_int;
- mvp_full.as_int =
- mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int;
}
+ if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
+ cpi->common.show_frame) {
+ int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
+ b_width_log2(bsize)));
+ step_param = MAX(step_param, boffset);
+ }
+
+ mvp_full.as_int = x->mv_best_ref_index[ref] < MAX_MV_REF_CANDIDATES ?
+ mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int :
+ x->pred_mv[ref].as_int;
+
mvp_full.as_mv.col >>= 3;
mvp_full.as_mv.row >>= 3;
// Further step/diamond searches as necessary
further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
- bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
- sadpb, further_steps, 1,
- &cpi->fn_ptr[block_size],
- &ref_mv, tmp_mv);
+ if (cpi->sf.search_method == HEX) {
+ bestsme = vp9_hex_search(x, &mvp_full,
+ step_param,
+ sadpb, 1,
+ &cpi->fn_ptr[block_size], 1,
+ &ref_mv, tmp_mv);
+ } else if (cpi->sf.search_method == SQUARE) {
+ bestsme = vp9_square_search(x, &mvp_full,
+ step_param,
+ sadpb, 1,
+ &cpi->fn_ptr[block_size], 1,
+ &ref_mv, tmp_mv);
+ } else if (cpi->sf.search_method == BIGDIA) {
+ bestsme = vp9_bigdia_search(x, &mvp_full,
+ step_param,
+ sadpb, 1,
+ &cpi->fn_ptr[block_size], 1,
+ &ref_mv, tmp_mv);
+ } else {
+ bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
+ sadpb, further_steps, 1,
+ &cpi->fn_ptr[block_size],
+ &ref_mv, tmp_mv);
+ }
x->mv_col_min = tmp_col_min;
x->mv_col_max = tmp_col_max;
@@ -2547,17 +2432,22 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
x->mv_row_max = tmp_row_max;
if (bestsme < INT_MAX) {
- int dis; /* TODO: use dis in distortion calculation later. */
+ int dis; /* TODO: use dis in distortion calculation later. */
unsigned int sse;
cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv,
x->errorperbit,
&cpi->fn_ptr[block_size],
+ 0, cpi->sf.subpel_iters_per_step,
x->nmvjointcost, x->mvcost,
&dis, &sse);
}
*rate_mv = vp9_mv_bit_cost(tmp_mv, &ref_mv,
x->nmvjointcost, x->mvcost,
96);
+
+ if (cpi->sf.adaptive_motion_search && cpi->common.show_frame)
+ x->pred_mv[ref].as_int = tmp_mv->as_int;
+
if (scaled_ref_frame) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++)
@@ -2566,18 +2456,18 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
}
static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
int_mv *frame_mv,
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
int *rate_mv) {
int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
int refs[2] = { mbmi->ref_frame[0],
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int_mv ref_mv[2];
- const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+ const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
int ite;
// Prediction buffer from second frame.
uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
@@ -2653,7 +2543,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
// Compound motion search on first ref frame.
if (id)
xd->plane[0].pre[0] = ref_yv12[id];
- vp9_clamp_mv_min_max(x, &ref_mv[id]);
+ vp9_clamp_mv_min_max(x, &ref_mv[id].as_mv);
// Use mv result from single mode as mvp.
tmp_mv.as_int = frame_mv[refs[id]].as_int;
@@ -2678,13 +2568,15 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int dis; /* TODO: use dis in distortion calculation later. */
unsigned int sse;
- bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv,
- &ref_mv[id],
- x->errorperbit,
- &cpi->fn_ptr[block_size],
- x->nmvjointcost, x->mvcost,
- &dis, &sse, second_pred,
- pw, ph);
+ bestsme = cpi->find_fractional_mv_step_comp(
+ x, &tmp_mv,
+ &ref_mv[id],
+ x->errorperbit,
+ &cpi->fn_ptr[block_size],
+ 0, cpi->sf.subpel_iters_per_step,
+ x->nmvjointcost, x->mvcost,
+ &dis, &sse, second_pred,
+ pw, ph);
}
if (id)
@@ -2721,7 +2613,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
}
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
int64_t txfm_cache[],
int *rate2, int64_t *distortion,
int *skippable,
@@ -2732,10 +2624,11 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int_mv (*mode_mv)[MAX_REF_FRAMES],
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
- int64_t *psse, int64_t ref_best_rd) {
+ int64_t *psse,
+ const int64_t ref_best_rd) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
const int is_comp_pred = (mbmi->ref_frame[1] > 0);
const int num_refs = is_comp_pred ? 2 : 1;
const int this_mode = mbmi->mode;
@@ -2747,7 +2640,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int64_t this_rd = 0;
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
int pred_exists = 0;
- int interpolating_intpel_seen = 0;
int intpel_mv;
int64_t rd, best_rd = INT64_MAX;
int best_needs_copy = 0;
@@ -2782,7 +2674,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
*rate2 += rate_mv;
frame_mv[refs[0]].as_int =
- xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+ xd->this_mi->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
single_newmv[refs[0]].as_int = tmp_mv.as_int;
}
}
@@ -2790,9 +2682,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
// if we're near/nearest and mv == 0,0, compare to zeromv
if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
frame_mv[refs[0]].as_int == 0 &&
- !vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
+ !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
(num_refs == 1 || frame_mv[refs[1]].as_int == 0)) {
- int rfc = mbmi->mb_mode_context[mbmi->ref_frame[0]];
+ int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
int c1 = cost_mv_ref(cpi, NEARMV, rfc);
int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
@@ -2849,7 +2741,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
* words if you present them in that order, the second one is always known
* if the first is known */
*rate2 += cost_mv_ref(cpi, this_mode,
- mbmi->mb_mode_context[mbmi->ref_frame[0]]);
+ mbmi->mode_context[mbmi->ref_frame[0]]);
if (!(*mode_excluded)) {
if (is_comp_pred) {
@@ -2860,7 +2752,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
pred_exists = 0;
- interpolating_intpel_seen = 0;
// Are all MVs integer pel for Y and UV
intpel_mv = (mbmi->mv[0].as_mv.row & 15) == 0 &&
(mbmi->mv[0].as_mv.col & 15) == 0;
@@ -2869,98 +2760,97 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
(mbmi->mv[1].as_mv.col & 15) == 0;
// Search for best switchable filter by checking the variance of
// pred error irrespective of whether the filter will be used
- *best_filter = EIGHTTAP;
- if (cpi->sf.use_8tap_always) {
+ if (cm->mcomp_filter_type != BILINEAR) {
*best_filter = EIGHTTAP;
- vp9_zero(cpi->rd_filter_cache);
- } else {
- int i, newbest;
- int tmp_rate_sum = 0;
- int64_t tmp_dist_sum = 0;
-
- cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX;
- for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
- int j;
- int64_t rs_rd;
- const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
- const int is_intpel_interp = intpel_mv;
- mbmi->interp_filter = filter;
- vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
- rs = get_switchable_rate(x);
- rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
-
- if (interpolating_intpel_seen && is_intpel_interp) {
- cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
- tmp_rate_sum, tmp_dist_sum);
- cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] =
- MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS],
- cpi->rd_filter_cache[i] + rs_rd);
- rd = cpi->rd_filter_cache[i];
- if (cm->mcomp_filter_type == SWITCHABLE)
- rd += rs_rd;
- } else {
- int rate_sum = 0;
- int64_t dist_sum = 0;
- if ((cm->mcomp_filter_type == SWITCHABLE &&
- (!i || best_needs_copy)) ||
- (cm->mcomp_filter_type != SWITCHABLE &&
- (cm->mcomp_filter_type == mbmi->interp_filter ||
- (!interpolating_intpel_seen && is_intpel_interp)))) {
- for (j = 0; j < MAX_MB_PLANE; j++) {
- xd->plane[j].dst.buf = orig_dst[j];
- xd->plane[j].dst.stride = orig_dst_stride[j];
- }
+ if (x->source_variance <
+ cpi->sf.disable_filter_search_var_thresh) {
+ *best_filter = EIGHTTAP;
+ vp9_zero(cpi->rd_filter_cache);
+ } else {
+ int i, newbest;
+ int tmp_rate_sum = 0;
+ int64_t tmp_dist_sum = 0;
+
+ cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX;
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ int j;
+ int64_t rs_rd;
+ mbmi->interp_filter = i;
+ vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+ rs = get_switchable_rate(x);
+ rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+
+ if (i > 0 && intpel_mv) {
+ cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
+ tmp_rate_sum, tmp_dist_sum);
+ cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
+ MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
+ cpi->rd_filter_cache[i] + rs_rd);
+ rd = cpi->rd_filter_cache[i];
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ rd += rs_rd;
} else {
- for (j = 0; j < MAX_MB_PLANE; j++) {
- xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
- xd->plane[j].dst.stride = 64;
+ int rate_sum = 0;
+ int64_t dist_sum = 0;
+ if ((cm->mcomp_filter_type == SWITCHABLE &&
+ (!i || best_needs_copy)) ||
+ (cm->mcomp_filter_type != SWITCHABLE &&
+ (cm->mcomp_filter_type == mbmi->interp_filter ||
+ (i == 0 && intpel_mv)))) {
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ xd->plane[j].dst.buf = orig_dst[j];
+ xd->plane[j].dst.stride = orig_dst_stride[j];
+ }
+ } else {
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
+ xd->plane[j].dst.stride = 64;
+ }
+ }
+ vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
+ cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
+ rate_sum, dist_sum);
+ cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
+ MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
+ cpi->rd_filter_cache[i] + rs_rd);
+ rd = cpi->rd_filter_cache[i];
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ rd += rs_rd;
+ if (i == 0 && intpel_mv) {
+ tmp_rate_sum = rate_sum;
+ tmp_dist_sum = dist_sum;
}
}
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
- model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
- cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
- rate_sum, dist_sum);
- cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] =
- MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS],
- cpi->rd_filter_cache[i] + rs_rd);
- rd = cpi->rd_filter_cache[i];
- if (cm->mcomp_filter_type == SWITCHABLE)
- rd += rs_rd;
- if (!interpolating_intpel_seen && is_intpel_interp) {
- tmp_rate_sum = rate_sum;
- tmp_dist_sum = dist_sum;
- }
- }
- if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
- if (rd / 2 > ref_best_rd) {
- for (i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].dst.buf = orig_dst[i];
- xd->plane[i].dst.stride = orig_dst_stride[i];
+ if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+ if (rd / 2 > ref_best_rd) {
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
+ }
+ return INT64_MAX;
}
- return INT64_MAX;
}
- }
- newbest = i == 0 || rd < best_rd;
-
- if (newbest) {
- best_rd = rd;
- *best_filter = mbmi->interp_filter;
- if (cm->mcomp_filter_type == SWITCHABLE && i &&
- !(interpolating_intpel_seen && is_intpel_interp))
- best_needs_copy = !best_needs_copy;
- }
+ newbest = i == 0 || rd < best_rd;
+
+ if (newbest) {
+ best_rd = rd;
+ *best_filter = mbmi->interp_filter;
+ if (cm->mcomp_filter_type == SWITCHABLE && i && !intpel_mv)
+ best_needs_copy = !best_needs_copy;
+ }
- if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
- (cm->mcomp_filter_type != SWITCHABLE &&
- cm->mcomp_filter_type == mbmi->interp_filter)) {
- pred_exists = 1;
+ if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
+ (cm->mcomp_filter_type != SWITCHABLE &&
+ cm->mcomp_filter_type == mbmi->interp_filter)) {
+ pred_exists = 1;
+ }
}
- interpolating_intpel_seen |= is_intpel_interp;
- }
- for (i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].dst.buf = orig_dst[i];
- xd->plane[i].dst.stride = orig_dst_stride[i];
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
+ }
}
}
// Set the appropriate filter
@@ -3003,30 +2893,34 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (cpi->common.mcomp_filter_type == SWITCHABLE)
*rate2 += get_switchable_rate(x);
- if (!is_comp_pred) {
+ if (!is_comp_pred && cpi->enable_encode_breakout) {
if (cpi->active_map_enabled && x->active_ptr[0] == 0)
x->skip = 1;
else if (x->encode_breakout) {
- const BLOCK_SIZE_TYPE y_size = get_plane_block_size(bsize, &xd->plane[0]);
- const BLOCK_SIZE_TYPE uv_size = get_plane_block_size(bsize,
- &xd->plane[1]);
+ const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
+ const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
unsigned int var, sse;
// Skipping threshold for ac.
unsigned int thresh_ac;
// The encode_breakout input
unsigned int encode_breakout = x->encode_breakout << 4;
+ int max_thresh = 36000;
+
+      // Use an extremely low threshold for static frames to limit skipping.
+ if (cpi->enable_encode_breakout == 2)
+ max_thresh = 128;
// Calculate threshold according to dequant value.
thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
- // Set a maximum for threshold to avoid big PSNR loss in low bitrate case.
- if (thresh_ac > 36000)
- thresh_ac = 36000;
-
// Use encode_breakout input if it is bigger than internal threshold.
if (thresh_ac < encode_breakout)
thresh_ac = encode_breakout;
+ // Set a maximum for threshold to avoid big PSNR loss in low bitrate case.
+ if (thresh_ac > max_thresh)
+ thresh_ac = max_thresh;
+
var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].dst.buf,
xd->plane[0].dst.stride, &sse);
@@ -3065,8 +2959,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
(sse_v - var_v < thresh_dc || sse_v == var_v)) {
x->skip = 1;
- *rate2 = 500;
- *rate_uv = 0;
+ // The cost of skip bit needs to be added.
+ *rate2 += vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
// Scaling factor for SSE from spatial domain to frequency domain
// is 16. Adjust distortion accordingly.
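
Condensed, the breakout threshold above squares the ac dequant step, divides by 9, floors the result by the user's encode_breakout (shifted into the same scale) and then caps it. A sketch using the constants visible in this patch:

    /* Sketch: derive the ac skip threshold for the encode breakout.
     * q_step is xd->plane[0].dequant[1]; user_breakout is
     * x->encode_breakout << 4; cap is 36000, or 128 when
     * enable_encode_breakout == 2 (static frames). */
    static unsigned int breakout_thresh_ac(int q_step,
                                           unsigned int user_breakout,
                                           unsigned int cap) {
      unsigned int thresh = (unsigned int)(q_step * q_step) / 9;
      if (thresh < user_breakout)
        thresh = user_breakout;  /* honor a larger user setting */
      if (thresh > cap)
        thresh = cap;            /* avoid big PSNR loss at low rates */
      return thresh;
    }
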
@@ -3084,7 +2978,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (!x->skip) {
int skippable_y, skippable_uv;
- int64_t sseuv = INT_MAX;
+ int64_t sseuv = INT64_MAX;
+ int64_t rdcosty = INT64_MAX;
// Y cost and distortion
super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
@@ -3103,8 +2998,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
*rate2 += *rate_y;
*distortion += *distortion_y;
- super_block_uvrd(cm, x, rate_uv, distortion_uv,
- &skippable_uv, &sseuv, bsize);
+ rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
+ rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
+
+ super_block_uvrd(cm, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
+ bsize, ref_best_rd - rdcosty);
+ if (*rate_uv == INT_MAX) {
+ *rate2 = INT_MAX;
+ *distortion = INT64_MAX;
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
+ }
+ return INT64_MAX;
+ }
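
The chroma search is handed only the rd budget left after the luma pass, which is why a failed chroma pass returns INT_MAX and aborts the whole mode. A sketch of that budget computation, reusing the hedged RDCOST form from the earlier sketch:

    #include <stdint.h>

    #define RDCOST_SKETCH(RM, DM, R, D) \
      ((((int64_t)(R) * (RM) + 128) >> 8) + ((int64_t)(D) << (DM)))

    /* Sketch: rd budget remaining for chroma after the luma pass;
     * taking the min with the pure-sse cost mirrors rdcosty above. */
    static int64_t remaining_uv_budget(int64_t ref_best_rd, int rdmult,
                                       int rddiv, int rate2,
                                       int64_t dist, int64_t sse) {
      int64_t rdcosty = RDCOST_SKETCH(rdmult, rddiv, rate2, dist);
      const int64_t sse_rd = RDCOST_SKETCH(rdmult, rddiv, 0, sse);
      if (sse_rd < rdcosty)
        rdcosty = sse_rd;
      return ref_best_rd - rdcosty;
    }
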
*psse += sseuv;
*rate2 += *rate_uv;
@@ -3122,17 +3029,17 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int *returnrate, int64_t *returndist,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
- int y_skip = 0, uv_skip;
+ int y_skip = 0, uv_skip = 0;
int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
x->skip_encode = 0;
ctx->skip = 0;
- xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME;
- if (bsize >= BLOCK_SIZE_SB8X8) {
+ xd->this_mi->mbmi.ref_frame[0] = INTRA_FRAME;
+ if (bsize >= BLOCK_8X8) {
if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
&dist_y, &y_skip, bsize, tx_cache,
best_rd) >= best_rd) {
@@ -3149,46 +3056,46 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
return;
}
rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
- &dist_uv, &uv_skip, BLOCK_SIZE_SB8X8);
+ &dist_uv, &uv_skip, BLOCK_8X8);
}
if (y_skip && uv_skip) {
*returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
- *returndist = dist_y + (dist_uv >> 2);
+ *returndist = dist_y + dist_uv;
vp9_zero(ctx->tx_rd_diff);
} else {
int i;
*returnrate = rate_y + rate_uv +
vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0);
- *returndist = dist_y + (dist_uv >> 2);
+ *returndist = dist_y + dist_uv;
if (cpi->sf.tx_size_search_method == USE_FULL_RD)
for (i = 0; i < TX_MODES; i++)
ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
}
- ctx->mic = *xd->mode_info_context;
+ ctx->mic = *xd->this_mi;
}
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
int *returnrate,
int64_t *returndistortion,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
- const struct segmentation *seg = &xd->seg;
- const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
- MB_PREDICTION_MODE this_mode;
+ MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
+ const struct segmentation *seg = &cm->seg;
+ const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+ RD_PREDICTION_MODE this_mode;
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
- unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
+ unsigned char segment_id = mbmi->segment_id;
int comp_pred, i;
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
struct buf_2d yv12_mb[4][MAX_MB_PLANE];
- int_mv single_newmv[MAX_REF_FRAMES];
+ int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
VP9_ALT_FLAG };
int idx_list[4] = {0,
@@ -3201,9 +3108,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_tx_diff[TX_MODES];
int64_t best_pred_diff[NB_PREDICTION_TYPES];
int64_t best_pred_rd[NB_PREDICTION_TYPES];
- int64_t best_filter_rd[VP9_SWITCHABLE_FILTERS + 1];
- int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1];
- MB_MODE_INFO best_mbmode;
+ int64_t best_filter_rd[SWITCHABLE_FILTERS + 1];
+ int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+ MB_MODE_INFO best_mbmode = { 0 };
int j;
int mode_index, best_mode_index = 0;
unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
@@ -3228,14 +3135,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int_mv seg_mvs[4][MAX_REF_FRAMES];
union b_mode_info best_bmodes[4];
PARTITION_INFO best_partition;
- int bwsl = b_width_log2(bsize);
- int bws = (1 << bwsl) / 4; // mode_info step for subsize
- int bhsl = b_height_log2(bsize);
- int bhs = (1 << bhsl) / 4; // mode_info step for subsize
+ const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
+ const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
int best_skip2 = 0;
- x->skip_encode = (cpi->sf.skip_encode_frame &&
- xd->q_index < QIDX_SKIP_THRESH);
+ x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
for (i = 0; i < 4; i++) {
int j;
@@ -3248,14 +3152,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
&comp_mode_p);
- vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
- vpx_memset(&single_newmv, 0, sizeof(single_newmv));
for (i = 0; i < NB_PREDICTION_TYPES; ++i)
best_pred_rd[i] = INT64_MAX;
for (i = 0; i < TX_MODES; i++)
best_tx_rd[i] = INT64_MAX;
- for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
+ for (i = 0; i <= SWITCHABLE_FILTERS; i++)
best_filter_rd[i] = INT64_MAX;
for (i = 0; i < TX_SIZES; i++)
rate_uv_intra[i] = INT_MAX;
@@ -3312,7 +3214,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int compmode_cost = 0;
int rate2 = 0, rate_y = 0, rate_uv = 0;
int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
- int skippable;
+ int skippable = 0;
int64_t tx_cache[TX_MODES];
int i;
int this_skip2 = 0;
@@ -3327,10 +3229,31 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
ref_frame = vp9_mode_order[mode_index].ref_frame;
second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
- // Skip modes that have been masked off but always consider first mode.
- if (mode_index && (bsize > cpi->sf.unused_mode_skip_lvl) &&
- (cpi->unused_mode_skip_mask & (1 << mode_index)) )
- continue;
+ // Look at the reference frame of the best mode so far and set the
+ // skip mask to look at a subset of the remaining modes.
+ if (mode_index > cpi->sf.mode_skip_start) {
+ if (mode_index == (cpi->sf.mode_skip_start + 1)) {
+ switch (vp9_mode_order[best_mode_index].ref_frame) {
+ case INTRA_FRAME:
+ cpi->mode_skip_mask = 0;
+ break;
+ case LAST_FRAME:
+ cpi->mode_skip_mask = LAST_FRAME_MODE_MASK;
+ break;
+ case GOLDEN_FRAME:
+ cpi->mode_skip_mask = GOLDEN_FRAME_MODE_MASK;
+ break;
+ case ALTREF_FRAME:
+ cpi->mode_skip_mask = ALT_REF_MODE_MASK;
+ break;
+ case NONE:
+ case MAX_REF_FRAMES:
+ assert(!"Invalid Reference frame");
+ }
+ }
+ if (cpi->mode_skip_mask & (1 << mode_index))
+ continue;
+ }
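
The hunk above replaces the old unconditional mode-skip mask with one derived from the reference frame of the best mode found so far. A minimal C sketch of the pruning rule, using a placeholder MODE_SKIP_START and placeholder bit patterns rather than the encoder's real LAST/GOLDEN/ALTREF mask constants:

#include <stdint.h>

enum { MODE_SKIP_START = 11 };  /* placeholder for cpi->sf.mode_skip_start */

/* Placeholder masks; the patch uses LAST_FRAME_MODE_MASK etc., whose
 * actual bit patterns are defined elsewhere in the encoder. */
static uint32_t mask_for_best_ref(int best_ref_frame) {
  switch (best_ref_frame) {
    case 0:  return 0;            /* intra best so far: keep all modes */
    case 1:  return 0x0000f0f0u;  /* stand-in for LAST_FRAME_MODE_MASK */
    case 2:  return 0x00f0f00fu;  /* stand-in for GOLDEN_FRAME_MODE_MASK */
    default: return 0x0f0f0000u;  /* stand-in for ALT_REF_MODE_MASK */
  }
}

static int skip_this_mode(int mode_index, uint32_t mode_skip_mask) {
  /* The first MODE_SKIP_START + 1 candidates are always evaluated. */
  return mode_index > MODE_SKIP_START && ((mode_skip_mask >> mode_index) & 1);
}
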
// Skip if the current reference frame has been masked off
if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
@@ -3339,7 +3262,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// Test best rd so far against threshold for trying this mode.
if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] *
- cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 4)) ||
+ cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 5)) ||
cpi->rd_threshes[bsize][mode_index] == INT_MAX)
continue;
@@ -3355,7 +3278,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (x->fast_ms > 2 && ref_frame != x->subblock_ref)
continue;
- if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_SIZE_SB8X8) {
+ if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_8X8) {
if (!(ref_frame_mask & (1 << ref_frame))) {
continue;
}
@@ -3393,19 +3316,24 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// TODO(jingning, jkoleszar): scaling reference frame not supported for
// SPLITMV.
if (ref_frame > 0 &&
- (scale_factor[ref_frame].x_scale_fp != VP9_REF_NO_SCALE ||
- scale_factor[ref_frame].y_scale_fp != VP9_REF_NO_SCALE) &&
- this_mode == SPLITMV)
+ vp9_is_scaled(&scale_factor[ref_frame]) &&
+ this_mode == RD_SPLITMV)
continue;
if (second_ref_frame > 0 &&
- (scale_factor[second_ref_frame].x_scale_fp != VP9_REF_NO_SCALE ||
- scale_factor[second_ref_frame].y_scale_fp != VP9_REF_NO_SCALE) &&
- this_mode == SPLITMV)
+ vp9_is_scaled(&scale_factor[second_ref_frame]) &&
+ this_mode == RD_SPLITMV)
+ continue;
+
+ if (bsize >= BLOCK_8X8 &&
+ (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV))
+ continue;
+
+ if (bsize < BLOCK_8X8 &&
+ !(this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV))
continue;
set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
- mbmi->mode = this_mode;
mbmi->uv_mode = DC_PRED;
// Evaluate all sub-pel filters irrespective of whether we can use
@@ -3413,13 +3341,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->interp_filter = cm->mcomp_filter_type;
vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
- if (bsize >= BLOCK_SIZE_SB8X8 &&
- (this_mode == I4X4_PRED || this_mode == SPLITMV))
- continue;
- if (bsize < BLOCK_SIZE_SB8X8 &&
- !(this_mode == I4X4_PRED || this_mode == SPLITMV))
- continue;
-
if (comp_pred) {
if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
continue;
@@ -3452,7 +3373,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
    // If the segment skip feature is enabled,
    // then do nothing if the current mode is not allowed.
} else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
- (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
+ (this_mode != RD_ZEROMV && ref_frame != INTRA_FRAME)) {
continue;
// Disable this drop out case if the ref frame
// segment level feature is enabled for this segment. This is to
@@ -3464,11 +3385,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// an unfiltered alternative. We allow near/nearest as well
// because they may result in zero-zero MVs but be cheaper.
if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
- if ((this_mode != ZEROMV &&
- !(this_mode == NEARMV &&
- frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) &&
- !(this_mode == NEARESTMV &&
- frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
+ if ((this_mode != RD_ZEROMV &&
+ !(this_mode == RD_NEARMV &&
+ frame_mv[RD_NEARMV][ALTREF_FRAME].as_int == 0) &&
+ !(this_mode == RD_NEARESTMV &&
+ frame_mv[RD_NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
ref_frame != ALTREF_FRAME) {
continue;
}
@@ -3480,11 +3401,17 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
    // a representative block in the boundary (first) and then implement a
    // function that does SADs when inside the border.
if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
- this_mode == NEWMV) {
+ this_mode == RD_NEWMV) {
continue;
}
- if (this_mode == I4X4_PRED) {
+#ifdef MODE_TEST_HIT_STATS
+ // TEST/DEBUG CODE
+    // Keep a record of the number of test hits at each size
+ cpi->mode_test_hits[bsize]++;
+#endif
+
+ if (this_mode == RD_I4X4_PRED) {
int rate;
/*
@@ -3493,8 +3420,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
continue;
*/
- // I4X4_PRED is only considered for block sizes less than 8x8.
- mbmi->txfm_size = TX_4X4;
+ // RD_I4X4_PRED is only considered for block sizes less than 8x8.
+ mbmi->tx_size = TX_4X4;
if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
&distortion_y, best_rd) >= best_rd)
continue;
@@ -3521,31 +3448,33 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// Disable intra modes other than DC_PRED for blocks with low variance
// Threshold for intra skipping based on source variance
// TODO(debargha): Specialize the threshold for super block sizes
- static const int skip_intra_var_thresh[BLOCK_SIZE_TYPES] = {
+ static const int skip_intra_var_thresh[BLOCK_SIZES] = {
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
};
if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
- this_mode != DC_PRED &&
+ this_mode != RD_DC_PRED &&
x->source_variance < skip_intra_var_thresh[mbmi->sb_type])
continue;
// Only search the oblique modes if the best so far is
// one of the neighboring directional modes
if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
- (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
+ (this_mode >= RD_D45_PRED && this_mode <= RD_TM_PRED)) {
if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)
continue;
}
+ mbmi->mode = rd_mode_to_mode(this_mode);
if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
if (conditional_skipintra(mbmi->mode, best_intra_mode))
continue;
}
+
super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
bsize, tx_cache, best_rd);
if (rate_y == INT_MAX)
continue;
- uv_tx = MIN(mbmi->txfm_size, max_uv_txsize_lookup[bsize]);
+ uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]);
if (rate_uv_intra[uv_tx] == INT_MAX) {
choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx],
&rate_uv_tokenonly[uv_tx],
@@ -3559,10 +3488,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->uv_mode = mode_uv[uv_tx];
rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
- if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+ if (this_mode != RD_DC_PRED && this_mode != RD_TM_PRED)
rate2 += intra_cost_penalty;
distortion2 = distortion_y + distortion_uv;
- } else if (this_mode == SPLITMV) {
+ } else if (this_mode == RD_SPLITMV) {
const int is_comp_pred = second_ref_frame > 0;
int rate;
int64_t distortion;
@@ -3577,7 +3506,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
union b_mode_info tmp_best_bmodes[16];
MB_MODE_INFO tmp_best_mbmode;
PARTITION_INFO tmp_best_partition;
- BEST_SEG_INFO bsi[VP9_SWITCHABLE_FILTERS];
+ BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
int pred_exists = 0;
int uv_skippable;
if (is_comp_pred) {
@@ -3595,70 +3524,79 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
cpi->rd_threshes[bsize][THR_NEWA];
this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh;
- xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-
- cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX;
- for (switchable_filter_index = 0;
- switchable_filter_index < VP9_SWITCHABLE_FILTERS;
- ++switchable_filter_index) {
- int newbest, rs;
- int64_t rs_rd;
- mbmi->interp_filter =
- vp9_switchable_interp[switchable_filter_index];
- vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
- tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
- &mbmi->ref_mvs[ref_frame][0],
- second_ref,
- best_yrd,
- &rate, &rate_y, &distortion,
- &skippable, &total_sse,
- (int)this_rd_thresh, seg_mvs,
- bsi, switchable_filter_index,
- mi_row, mi_col);
-
- if (tmp_rd == INT64_MAX)
- continue;
- cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
- rs = get_switchable_rate(x);
- rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
- cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] =
- MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], tmp_rd + rs_rd);
- if (cm->mcomp_filter_type == SWITCHABLE)
- tmp_rd += rs_rd;
-
- newbest = (tmp_rd < tmp_best_rd);
- if (newbest) {
- tmp_best_filter = mbmi->interp_filter;
- tmp_best_rd = tmp_rd;
- }
- if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
- (mbmi->interp_filter == cm->mcomp_filter_type &&
- cm->mcomp_filter_type != SWITCHABLE)) {
- tmp_best_rdu = tmp_rd;
- tmp_best_rate = rate;
- tmp_best_ratey = rate_y;
- tmp_best_distortion = distortion;
- tmp_best_sse = total_sse;
- tmp_best_skippable = skippable;
- tmp_best_mbmode = *mbmi;
- tmp_best_partition = *x->partition_info;
- for (i = 0; i < 4; i++)
- tmp_best_bmodes[i] = xd->mode_info_context->bmi[i];
- pred_exists = 1;
- if (switchable_filter_index == 0 &&
- cpi->sf.use_rd_breakout &&
- best_rd < INT64_MAX) {
- if (tmp_best_rdu / 2 > best_rd) {
- // skip searching the other filters if the first is
- // already substantially larger than the best so far
+ xd->this_mi->mbmi.tx_size = TX_4X4;
+
+ cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX;
+ if (cm->mcomp_filter_type != BILINEAR) {
+ tmp_best_filter = EIGHTTAP;
+ if (x->source_variance <
+ cpi->sf.disable_filter_search_var_thresh) {
+ tmp_best_filter = EIGHTTAP;
+ vp9_zero(cpi->rd_filter_cache);
+ } else {
+ for (switchable_filter_index = 0;
+ switchable_filter_index < SWITCHABLE_FILTERS;
+ ++switchable_filter_index) {
+ int newbest, rs;
+ int64_t rs_rd;
+ mbmi->interp_filter = switchable_filter_index;
+ vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+ tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+ &mbmi->ref_mvs[ref_frame][0],
+ second_ref,
+ best_yrd,
+ &rate, &rate_y, &distortion,
+ &skippable, &total_sse,
+ (int)this_rd_thresh, seg_mvs,
+ bsi, switchable_filter_index,
+ mi_row, mi_col);
+
+ if (tmp_rd == INT64_MAX)
+ continue;
+ cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
+ rs = get_switchable_rate(x);
+ rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+ cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
+ MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
+ tmp_rd + rs_rd);
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ tmp_rd += rs_rd;
+
+ newbest = (tmp_rd < tmp_best_rd);
+ if (newbest) {
tmp_best_filter = mbmi->interp_filter;
- tmp_best_rdu = INT64_MAX;
- break;
+ tmp_best_rd = tmp_rd;
}
- }
+ if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
+ (mbmi->interp_filter == cm->mcomp_filter_type &&
+ cm->mcomp_filter_type != SWITCHABLE)) {
+ tmp_best_rdu = tmp_rd;
+ tmp_best_rate = rate;
+ tmp_best_ratey = rate_y;
+ tmp_best_distortion = distortion;
+ tmp_best_sse = total_sse;
+ tmp_best_skippable = skippable;
+ tmp_best_mbmode = *mbmi;
+ tmp_best_partition = *x->partition_info;
+ for (i = 0; i < 4; i++)
+ tmp_best_bmodes[i] = xd->this_mi->bmi[i];
+ pred_exists = 1;
+ if (switchable_filter_index == 0 &&
+ cpi->sf.use_rd_breakout &&
+ best_rd < INT64_MAX) {
+ if (tmp_best_rdu / 2 > best_rd) {
+ // skip searching the other filters if the first is
+ // already substantially larger than the best so far
+ tmp_best_filter = mbmi->interp_filter;
+ tmp_best_rdu = INT64_MAX;
+ break;
+ }
+ }
+ }
+ } // switchable_filter_index loop
}
- } // switchable_filter_index loop
+ }
if (tmp_best_rdu == INT64_MAX)
continue;
@@ -3694,7 +3632,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
*mbmi = tmp_best_mbmode;
*x->partition_info = tmp_best_partition;
for (i = 0; i < 4; i++)
- xd->mode_info_context->bmi[i] = tmp_best_bmodes[i];
+ xd->this_mi->bmi[i] = tmp_best_bmodes[i];
}
rate2 += rate;
@@ -3711,16 +3649,19 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred);
- if (RDCOST(x->rdmult, x->rddiv, rate2, distortion2) <
- best_rd) {
+ tmp_best_rdu = best_rd -
+ MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse));
+
+ if (tmp_best_rdu > 0) {
// If even the 'Y' rd value of split is higher than best so far
    // then don't bother looking at UV
vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
- BLOCK_SIZE_SB8X8);
- vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
- super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
- &uv_skippable, &uv_sse,
- BLOCK_SIZE_SB8X8, TX_4X4);
+ BLOCK_8X8);
+ super_block_uvrd(cm, x, &rate_uv, &distortion_uv, &uv_skippable,
+ &uv_sse, BLOCK_8X8, tmp_best_rdu);
+ if (rate_uv == INT_MAX)
+ continue;
rate2 += rate_uv;
distortion2 += distortion_uv;
skippable = skippable && uv_skippable;
@@ -3731,6 +3672,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
tx_cache[i] = tx_cache[ONLY_4X4];
}
} else {
+ mbmi->mode = rd_mode_to_mode(this_mode);
compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
this_rd = handle_inter_mode(cpi, x, bsize,
tx_cache,
@@ -3766,7 +3708,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
SEG_LVL_SKIP);
- if (skippable && bsize >= BLOCK_SIZE_SB8X8) {
+ if (skippable && bsize >= BLOCK_8X8) {
// Back out the coefficient coding costs
rate2 -= (rate_y + rate_uv);
// for best yrd calculation
@@ -3815,30 +3757,30 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
// Keep record of best intra rd
- if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME &&
- is_intra_mode(xd->mode_info_context->mbmi.mode) &&
+ if (xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
+ is_intra_mode(xd->this_mi->mbmi.mode) &&
this_rd < best_intra_rd) {
best_intra_rd = this_rd;
- best_intra_mode = xd->mode_info_context->mbmi.mode;
+ best_intra_mode = xd->this_mi->mbmi.mode;
}
// Keep record of best inter rd with single reference
- if (xd->mode_info_context->mbmi.ref_frame[0] > INTRA_FRAME &&
- xd->mode_info_context->mbmi.ref_frame[1] == NONE &&
+ if (xd->this_mi->mbmi.ref_frame[0] > INTRA_FRAME &&
+ xd->this_mi->mbmi.ref_frame[1] == NONE &&
!mode_excluded &&
this_rd < best_inter_rd) {
best_inter_rd = this_rd;
best_inter_ref_frame = ref_frame;
- // best_inter_mode = xd->mode_info_context->mbmi.mode;
+ // best_inter_mode = xd->this_mi->mbmi.mode;
}
if (!disable_skip && ref_frame == INTRA_FRAME) {
for (i = 0; i < NB_PREDICTION_TYPES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
- for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
+ for (i = 0; i <= SWITCHABLE_FILTERS; i++)
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
}
- if (this_mode != I4X4_PRED && this_mode != SPLITMV) {
+ if (this_mode != RD_I4X4_PRED && this_mode != RD_SPLITMV) {
// Store the respective mode distortions for later use.
if (mode_distortions[this_mode] == -1
|| distortion2 < mode_distortions[this_mode]) {
@@ -3870,9 +3812,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_skip2 = this_skip2;
best_partition = *x->partition_info;
- if (this_mode == I4X4_PRED || this_mode == SPLITMV)
+ if (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV)
for (i = 0; i < 4; i++)
- best_bmodes[i] = xd->mode_info_context->bmi[i];
+ best_bmodes[i] = xd->this_mi->bmi[i];
// TODO(debargha): enhance this test with a better distortion prediction
// based on qp, activity mask and history
@@ -3890,29 +3832,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
}
}
-#if 0
- // Testing this mode gave rise to an improvement in best error score.
- // Lower threshold a bit for next time
- cpi->rd_thresh_mult[mode_index] =
- (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
- cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
- cpi->rd_threshes[mode_index] =
- (cpi->rd_baseline_thresh[mode_index] >> 7)
- * cpi->rd_thresh_mult[mode_index];
-#endif
- } else {
- // If the mode did not help improve the best error case then
- // raise the threshold for testing that mode next time around.
-#if 0
- cpi->rd_thresh_mult[mode_index] += 4;
-
- if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
- cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
- cpi->rd_threshes[mode_index] =
- (cpi->rd_baseline_thresh[mode_index] >> 7)
- * cpi->rd_thresh_mult[mode_index];
-#endif
}
/* keep record of best compound/single-only prediction */
@@ -3945,9 +3864,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
cm->mcomp_filter_type != BILINEAR) {
int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
- VP9_SWITCHABLE_FILTERS :
- vp9_switchable_interp_map[cm->mcomp_filter_type]];
- for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+ SWITCHABLE_FILTERS : cm->mcomp_filter_type];
+ for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
int64_t adj_rd;
// In cases of poor prediction, filter_cache[] can contain really big
// values, which actually are bigger than this_rd itself. This can
@@ -3964,16 +3882,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
/* keep record of best txfm size */
if (bsize < BLOCK_32X32) {
if (bsize < BLOCK_16X16) {
- if (this_mode == SPLITMV || this_mode == I4X4_PRED)
+ if (this_mode == RD_SPLITMV || this_mode == RD_I4X4_PRED)
tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
}
tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
}
if (!mode_excluded && this_rd != INT64_MAX) {
- for (i = 0; i < TX_MODES; i++) {
+ for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
int64_t adj_rd = INT64_MAX;
- if (this_mode != I4X4_PRED) {
+ if (this_mode != RD_I4X4_PRED) {
adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
} else {
adj_rd = this_rd;
@@ -4003,18 +3921,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
&rate_uv_tokenonly[uv_tx_size],
&dist_uv[uv_tx_size],
&skip_uv[uv_tx_size],
- (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8
- : bsize);
+ bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
}
}
- // If indicated then mark the index of the chosen mode to be inspected at
- // other block sizes.
- if (bsize <= cpi->sf.unused_mode_skip_lvl) {
- cpi->unused_mode_skip_mask = cpi->unused_mode_skip_mask &
- (~((int64_t)1 << best_mode_index));
- }
-
// If we are using reference masking and the set mask flag is set then
// create the reference frame mask.
if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
@@ -4039,7 +3949,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- if (best_rd == INT64_MAX && bsize < BLOCK_SIZE_SB8X8) {
+ if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
*returnrate = INT_MAX;
*returndistortion = INT_MAX;
return best_rd;
@@ -4057,57 +3967,43 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (cpi->sf.adaptive_rd_thresh) {
for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
if (mode_index == best_mode_index) {
- cpi->rd_thresh_freq_fact[bsize][mode_index] = BASE_RD_THRESH_FREQ_FACT;
+ cpi->rd_thresh_freq_fact[bsize][mode_index] -=
+ (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3);
} else {
- cpi->rd_thresh_freq_fact[bsize][mode_index] += MAX_RD_THRESH_FREQ_INC;
+ cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
- (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) {
+ (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) {
cpi->rd_thresh_freq_fact[bsize][mode_index] =
- cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT;
+ cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT;
}
}
}
}
- // TODO(rbultje) integrate with RD trd_thresh_freq_facthresholding
-#if 0
- // Reduce the activation RD thresholds for the best choice mode
- if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
- (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
- int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
- cpi->rd_thresh_mult[best_mode_index] =
- (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ?
- cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
- cpi->rd_threshes[best_mode_index] =
- (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
- }
-#endif
-
// macroblock modes
*mbmi = best_mbmode;
x->skip |= best_skip2;
if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
- best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
+ best_mbmode.sb_type < BLOCK_8X8) {
for (i = 0; i < 4; i++)
- xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
+ xd->this_mi->bmi[i].as_mode = best_bmodes[i].as_mode;
}
if (best_mbmode.ref_frame[0] != INTRA_FRAME &&
- best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
+ best_mbmode.sb_type < BLOCK_8X8) {
for (i = 0; i < 4; i++)
- xd->mode_info_context->bmi[i].as_mv[0].as_int =
+ xd->this_mi->bmi[i].as_mv[0].as_int =
best_bmodes[i].as_mv[0].as_int;
if (mbmi->ref_frame[1] > 0)
for (i = 0; i < 4; i++)
- xd->mode_info_context->bmi[i].as_mv[1].as_int =
+ xd->this_mi->bmi[i].as_mv[1].as_int =
best_bmodes[i].as_mv[1].as_int;
*x->partition_info = best_partition;
- mbmi->mv[0].as_int = xd->mode_info_context->bmi[3].as_mv[0].as_int;
- mbmi->mv[1].as_int = xd->mode_info_context->bmi[3].as_mv[1].as_int;
+ mbmi->mv[0].as_int = xd->this_mi->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = xd->this_mi->bmi[3].as_mv[1].as_int;
}
for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
@@ -4118,14 +4014,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
if (!x->skip) {
- for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
if (best_filter_rd[i] == INT64_MAX)
best_filter_diff[i] = 0;
else
best_filter_diff[i] = best_rd - best_filter_rd[i];
}
if (cm->mcomp_filter_type == SWITCHABLE)
- assert(best_filter_diff[VP9_SWITCHABLE_FILTERS] == 0);
+ assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
} else {
vpx_memset(best_filter_diff, 0, sizeof(best_filter_diff));
}
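
The adaptive threshold update near the end of this hunk changes from resetting the winning mode's frequency factor to decaying it geometrically, while losing modes grow linearly up to a cap. A sketch of that update under assumed values for RD_THRESH_INC and MAX_RD_THRESH_FACT (the names mirror the patch; the numbers here are illustrative):

#define RD_THRESH_INC 1        /* assumed value */
#define MAX_RD_THRESH_FACT 32  /* assumed value */

static void update_rd_thresh_freq_fact(int *fact, int n_modes,
                                       int best_mode_index,
                                       int adaptive_rd_thresh) {
  int i;
  for (i = 0; i < n_modes; ++i) {
    if (i == best_mode_index) {
      fact[i] -= fact[i] >> 3;   /* decay: re-test the winner sooner */
    } else {
      fact[i] += RD_THRESH_INC;  /* back off modes that keep losing */
      if (fact[i] > adaptive_rd_thresh * MAX_RD_THRESH_FACT)
        fact[i] = adaptive_rd_thresh * MAX_RD_THRESH_FACT;
    }
  }
}

Because the factor scales the per-mode RD threshold (note the threshold test earlier in this file also moves from >> 4 to >> 5), frequently winning modes become cheaper to retry while rarely winning ones are progressively pruned.
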
diff --git a/libvpx/vp9/encoder/vp9_rdopt.h b/libvpx/vp9/encoder/vp9_rdopt.h
index 7c84b48..eba7df9 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.h
+++ b/libvpx/vp9/encoder/vp9_rdopt.h
@@ -13,8 +13,6 @@
#define VP9_ENCODER_VP9_RDOPT_H_
#define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
-#define RDCOST_8x8(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
-
#define QIDX_SKIP_THRESH 115
void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
@@ -22,12 +20,12 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
- int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,
+ int *r, int64_t *d, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx, int64_t best_rd);
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
- int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,
+ int *r, int64_t *d, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx, int64_t best_rd);
void vp9_init_me_luts();
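
For reference, the RDCOST macro kept above combines an entropy-coding rate estimate with a distortion term: the rate R is scaled by the rate multiplier RM with rounding (the >> 8), then added to the distortion D weighted by DM. A small self-contained example with arbitrary operand values:

#include <stdint.h>
#include <stdio.h>

#define RDCOST(RM, DM, R, D) \
  (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((int64_t)(DM)) * (D))

int main(void) {
  const int rdmult = 300;      /* rate multiplier (illustrative) */
  const int rddiv = 100;       /* distortion multiplier (illustrative) */
  const int rate = 1200;       /* estimated bits */
  const int distortion = 5000; /* sum of squared error */
  /* (128 + 1200 * 300) >> 8 = 1406; 1406 + 100 * 5000 = 501406 */
  printf("rd cost = %lld\n",
         (long long)RDCOST(rdmult, rddiv, rate, distortion));
  return 0;
}
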
diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c
index 9564edc..10655e8 100644
--- a/libvpx/vp9/encoder/vp9_segmentation.c
+++ b/libvpx/vp9/encoder/vp9_segmentation.c
@@ -17,39 +17,42 @@
void vp9_enable_segmentation(VP9_PTR ptr) {
VP9_COMP *cpi = (VP9_COMP *)ptr;
+ struct segmentation *const seg = &cpi->common.seg;
- cpi->mb.e_mbd.seg.enabled = 1;
- cpi->mb.e_mbd.seg.update_map = 1;
- cpi->mb.e_mbd.seg.update_data = 1;
+ seg->enabled = 1;
+ seg->update_map = 1;
+ seg->update_data = 1;
}
void vp9_disable_segmentation(VP9_PTR ptr) {
VP9_COMP *cpi = (VP9_COMP *)ptr;
- cpi->mb.e_mbd.seg.enabled = 0;
+ struct segmentation *const seg = &cpi->common.seg;
+ seg->enabled = 0;
}
void vp9_set_segmentation_map(VP9_PTR ptr,
unsigned char *segmentation_map) {
- VP9_COMP *cpi = (VP9_COMP *)(ptr);
+ VP9_COMP *cpi = (VP9_COMP *)ptr;
+ struct segmentation *const seg = &cpi->common.seg;
// Copy in the new segmentation map
vpx_memcpy(cpi->segmentation_map, segmentation_map,
(cpi->common.mi_rows * cpi->common.mi_cols));
// Signal that the map should be updated.
- cpi->mb.e_mbd.seg.update_map = 1;
- cpi->mb.e_mbd.seg.update_data = 1;
+ seg->update_map = 1;
+ seg->update_data = 1;
}
void vp9_set_segment_data(VP9_PTR ptr,
signed char *feature_data,
unsigned char abs_delta) {
- VP9_COMP *cpi = (VP9_COMP *)(ptr);
+ VP9_COMP *cpi = (VP9_COMP *)ptr;
+ struct segmentation *const seg = &cpi->common.seg;
- cpi->mb.e_mbd.seg.abs_delta = abs_delta;
+ seg->abs_delta = abs_delta;
- vpx_memcpy(cpi->mb.e_mbd.seg.feature_data, feature_data,
- sizeof(cpi->mb.e_mbd.seg.feature_data));
+ vpx_memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
  // TBD: Set the feature mask
// vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0,
@@ -114,7 +117,7 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) {
return cost;
}
-static void count_segs(VP9_COMP *cpi, MODE_INFO *mi,
+static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
@@ -126,8 +129,8 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO *mi,
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- segment_id = mi->mbmi.segment_id;
- xd->mode_info_context = mi;
+ segment_id = mi_8x8[0]->mbmi.segment_id;
+
set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
// Count the number of hits on each segment with no prediction
@@ -135,7 +138,7 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO *mi,
// Temporal prediction not allowed on key frames
if (cm->frame_type != KEY_FRAME) {
- const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+ const BLOCK_SIZE bsize = mi_8x8[0]->mbmi.sb_type;
// Test to see if the segment id matches the predicted value.
const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
bsize, mi_row, mi_col);
@@ -144,7 +147,7 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO *mi,
// Store the prediction status for this mb and update counts
// as appropriate
- vp9_set_pred_flag_seg_id(cm, bsize, mi_row, mi_col, pred_flag);
+ vp9_set_pred_flag_seg_id(xd, pred_flag);
temporal_predictor_count[pred_context][pred_flag]++;
if (!pred_flag)
@@ -153,95 +156,85 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO *mi,
}
}
-static void count_segs_sb(VP9_COMP *cpi, MODE_INFO *mi,
+static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const cm = &cpi->common;
+ BLOCK_SIZE bsize) {
+ const VP9_COMMON *const cm = &cpi->common;
const int mis = cm->mode_info_stride;
- int bwl, bhl;
- const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
+ int bw, bh;
+ const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- bwl = mi_width_log2(mi->mbmi.sb_type);
- bhl = mi_height_log2(mi->mbmi.sb_type);
-
- if (bwl == bsl && bhl == bsl) {
- count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, 1 << bsl, 1 << bsl, mi_row, mi_col);
- } else if (bwl == bsl && bhl < bsl) {
- count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, 1 << bsl, bs, mi_row, mi_col);
- count_segs(cpi, mi + bs * mis, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, 1 << bsl, bs, mi_row + bs, mi_col);
- } else if (bwl < bsl && bhl == bsl) {
- count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, bs, 1 << bsl, mi_row, mi_col);
- count_segs(cpi, mi + bs, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, bs, 1 << bsl, mi_row, mi_col + bs);
+ bw = num_8x8_blocks_wide_lookup[mi_8x8[0]->mbmi.sb_type];
+ bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type];
+
+ if (bw == bs && bh == bs) {
+ count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, bs, mi_row, mi_col);
+ } else if (bw == bs && bh < bs) {
+ count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+ count_segs(cpi, mi_8x8 + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+ mi_row + hbs, mi_col);
+ } else if (bw < bs && bh == bs) {
+ count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+ count_segs(cpi, mi_8x8 + hbs, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs);
} else {
- BLOCK_SIZE_TYPE subsize;
+ const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
int n;
- assert(bwl < bsl && bhl < bsl);
- if (bsize == BLOCK_64X64) {
- subsize = BLOCK_32X32;
- } else if (bsize == BLOCK_32X32) {
- subsize = BLOCK_16X16;
- } else {
- assert(bsize == BLOCK_16X16);
- subsize = BLOCK_8X8;
- }
+ assert(bw < bs && bh < bs);
for (n = 0; n < 4; n++) {
- const int y_idx = n >> 1, x_idx = n & 0x01;
+ const int mi_dc = hbs * (n & 1);
+ const int mi_dr = hbs * (n >> 1);
- count_segs_sb(cpi, mi + y_idx * bs * mis + x_idx * bs,
+ count_segs_sb(cpi, &mi_8x8[mi_dr * mis + mi_dc],
no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts,
- mi_row + y_idx * bs, mi_col + x_idx * bs, subsize);
+ mi_row + mi_dr, mi_col + mi_dc, subsize);
}
}
}
void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
- struct segmentation *seg = &cpi->mb.e_mbd.seg;
+ struct segmentation *seg = &cm->seg;
int no_pred_cost;
int t_pred_cost = INT_MAX;
int i, tile_col, mi_row, mi_col;
- int temporal_predictor_count[PREDICTION_PROBS][2];
- int no_pred_segcounts[MAX_SEGMENTS];
- int t_unpred_seg_counts[MAX_SEGMENTS];
+ int temporal_predictor_count[PREDICTION_PROBS][2] = { { 0 } };
+ int no_pred_segcounts[MAX_SEGMENTS] = { 0 };
+ int t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
vp9_prob no_pred_tree[SEG_TREE_PROBS];
vp9_prob t_pred_tree[SEG_TREE_PROBS];
vp9_prob t_nopred_prob[PREDICTION_PROBS];
const int mis = cm->mode_info_stride;
- MODE_INFO *mi_ptr, *mi;
+ MODE_INFO **mi_ptr, **mi;
// Set default state for the segment tree probabilities and the
// temporal coding probabilities
vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
vpx_memset(seg->pred_probs, 255, sizeof(seg->pred_probs));
- vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts));
- vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts));
- vpx_memset(temporal_predictor_count, 0, sizeof(temporal_predictor_count));
-
// First of all generate stats regarding how well the last segment map
// predicts this one
for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) {
vp9_get_tile_col_offsets(cm, tile_col);
- mi_ptr = cm->mi + cm->cur_tile_mi_col_start;
+ mi_ptr = cm->mi_grid_visible + cm->cur_tile_mi_col_start;
for (mi_row = 0; mi_row < cm->mi_rows;
mi_row += 8, mi_ptr += 8 * mis) {
mi = mi_ptr;
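
The rewritten count_segs_sb above walks the coding-block quadtree using 8x8-unit widths from the lookup tables instead of log2 arithmetic: a block either matches the current size, is split once horizontally or vertically, or recurses into four half-size quadrants. A sketch of the quadrant offsets, with visit() standing in for count_segs():

static void visit(int mi_row, int mi_col, int w, int h) {
  /* stand-in for count_segs(); body omitted */
  (void)mi_row; (void)mi_col; (void)w; (void)h;
}

static void descend(int mi_row, int mi_col, int bs /* in 8x8 units */) {
  const int hbs = bs / 2;
  int n;
  for (n = 0; n < 4; n++) {
    const int mi_dc = hbs * (n & 1);   /* column offset of quadrant n */
    const int mi_dr = hbs * (n >> 1);  /* row offset of quadrant n */
    visit(mi_row + mi_dr, mi_col + mi_dc, hbs, hbs);
  }
}
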
diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c
index a692c01..63826ee 100644
--- a/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -153,11 +153,11 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
/*cpi->sf.search_method == HEX*/
  // TODO: Check that the 16x16 vf & sdf are selected here
// Ignore mv costing by sending NULL pointer instead of cost arrays
- ref_mv = &x->e_mbd.mode_info_context->bmi[0].as_mv[0];
- bestsme = vp9_hex_search(x, &best_ref_mv1_full, ref_mv,
- step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16],
- NULL, NULL, NULL, NULL,
- &best_ref_mv1);
+ ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0];
+ bestsme = vp9_hex_search(x, &best_ref_mv1_full,
+ step_param, sadpb, 1,
+ &cpi->fn_ptr[BLOCK_16X16],
+ 0, &best_ref_mv1, ref_mv);
#if ALT_REF_SUBPEL_ENABLED
// Try sub-pixel MC?
@@ -170,6 +170,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
&best_ref_mv1,
x->errorperbit,
&cpi->fn_ptr[BLOCK_16X16],
+ 0, cpi->sf.subpel_iters_per_step,
NULL, NULL,
&distortion, &sse);
}
@@ -244,8 +245,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
if (cpi->frames[frame] == NULL)
continue;
- mbd->mode_info_context->bmi[0].as_mv[0].as_mv.row = 0;
- mbd->mode_info_context->bmi[0].as_mv[0].as_mv.col = 0;
+ mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row = 0;
+ mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col = 0;
if (frame == alt_ref_index) {
filter_weight = 2;
@@ -278,8 +279,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
cpi->frames[frame]->u_buffer + mb_uv_offset,
cpi->frames[frame]->v_buffer + mb_uv_offset,
cpi->frames[frame]->y_stride,
- mbd->mode_info_context->bmi[0].as_mv[0].as_mv.row,
- mbd->mode_info_context->bmi[0].as_mv[0].as_mv.col,
+ mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row,
+ mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col,
predictor);
// Apply the filter (YUV)
diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c
index caa89b2..0c9bf9d 100644
--- a/libvpx/vp9/encoder/vp9_tokenize.c
+++ b/libvpx/vp9/encoder/vp9_tokenize.c
@@ -97,103 +97,51 @@ struct tokenize_b_args {
TX_SIZE tx_size;
};
-static void set_entropy_context_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg) {
+static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
struct tokenize_b_args* const args = arg;
- TX_SIZE tx_size = ss_txfrm_size >> 1;
- MACROBLOCKD *xd = args->xd;
- const int bwl = b_width_log2(bsize);
- const int off = block >> (2 * tx_size);
- const int mod = bwl - tx_size - xd->plane[plane].subsampling_x;
- const int aoff = (off & ((1 << mod) - 1)) << tx_size;
- const int loff = (off >> mod) << tx_size;
- ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff;
- ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff;
- const int eob = xd->plane[plane].eobs[block];
- const int tx_size_in_blocks = 1 << tx_size;
-
- if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
- set_contexts_on_border(xd, bsize, plane, tx_size_in_blocks, eob, aoff, loff,
- A, L);
- } else {
- vpx_memset(A, eob > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
- vpx_memset(L, eob > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
- }
+ MACROBLOCKD *const xd = args->xd;
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ int aoff, loff;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
+ set_contexts(xd, pd, plane_bsize, tx_size, pd->eobs[block] > 0, aoff, loff);
}
-static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg) {
+static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
struct tokenize_b_args* const args = arg;
VP9_COMP *cpi = args->cpi;
MACROBLOCKD *xd = args->xd;
TOKENEXTRA **tp = args->tp;
- const TX_SIZE tx_size = ss_txfrm_size >> 1;
- const int tx_size_in_blocks = 1 << tx_size;
- MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
int pt; /* near block/prev token context index */
int c = 0, rc = 0;
TOKENEXTRA *t = *tp; /* store tokens starting here */
- const int eob = xd->plane[plane].eobs[block];
- const PLANE_TYPE type = xd->plane[plane].plane_type;
- const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
- const int bwl = b_width_log2(bsize);
- const int off = block >> (2 * tx_size);
- const int mod = bwl - tx_size - xd->plane[plane].subsampling_x;
- const int aoff = (off & ((1 << mod) - 1)) << tx_size;
- const int loff = (off >> mod) << tx_size;
- ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff;
- ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff;
- int seg_eob;
+ const int eob = pd->eobs[block];
+ const PLANE_TYPE type = pd->plane_type;
+ const int16_t *qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
+
const int segment_id = mbmi->segment_id;
const int16_t *scan, *nb;
- vp9_coeff_count *counts;
- vp9_coeff_probs_model *coef_probs;
+ vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
+ vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
const int ref = is_inter_block(mbmi);
- ENTROPY_CONTEXT above_ec, left_ec;
uint8_t token_cache[1024];
const uint8_t *band_translate;
- assert((!type && !plane) || (type && plane));
+ ENTROPY_CONTEXT *A, *L;
+ const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+ int aoff, loff;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
- counts = cpi->coef_counts[tx_size];
- coef_probs = cpi->common.fc.coef_probs[tx_size];
- switch (tx_size) {
- default:
- case TX_4X4:
- above_ec = A[0] != 0;
- left_ec = L[0] != 0;
- seg_eob = 16;
- scan = get_scan_4x4(get_tx_type_4x4(type, xd, block));
- band_translate = vp9_coefband_trans_4x4;
- break;
- case TX_8X8:
- above_ec = !!*(uint16_t *)A;
- left_ec = !!*(uint16_t *)L;
- seg_eob = 64;
- scan = get_scan_8x8(get_tx_type_8x8(type, xd));
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- case TX_16X16:
- above_ec = !!*(uint32_t *)A;
- left_ec = !!*(uint32_t *)L;
- seg_eob = 256;
- scan = get_scan_16x16(get_tx_type_16x16(type, xd));
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- case TX_32X32:
- above_ec = !!*(uint64_t *)A;
- left_ec = !!*(uint64_t *)L;
- seg_eob = 1024;
- scan = vp9_default_scan_32x32;
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- }
-
- pt = combine_entropy_contexts(above_ec, left_ec);
- nb = vp9_get_coef_neighbors_handle(scan);
+ A = pd->above_context + aoff;
+ L = pd->left_context + loff;
- if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP))
- seg_eob = 0;
+ assert((!type && !plane) || (type && plane));
+ pt = get_entropy_context(xd, tx_size, type, block, A, L,
+ &scan, &band_translate);
+ nb = vp9_get_coef_neighbors_handle(scan);
c = 0;
do {
const int band = get_coef_band(band_translate, c);
@@ -227,62 +175,53 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
} while (c < eob && ++c < seg_eob);
*tp = t;
- if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
- set_contexts_on_border(xd, bsize, plane, tx_size_in_blocks, c, aoff, loff,
- A, L);
- } else {
- vpx_memset(A, c > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
- vpx_memset(L, c > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
- }
+
+ set_contexts(xd, pd, plane_bsize, tx_size, c > 0, aoff, loff);
}
struct is_skippable_args {
MACROBLOCKD *xd;
int *skippable;
};
+
static void is_skippable(int plane, int block,
- BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, void *argv) {
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *argv) {
struct is_skippable_args *args = argv;
args->skippable[0] &= (!args->xd->plane[plane].eobs[block]);
}
-int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
int result = 1;
struct is_skippable_args args = {xd, &result};
foreach_transformed_block(xd, bsize, is_skippable, &args);
return result;
}
-int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
- int result = 1;
- struct is_skippable_args args = {xd, &result};
- foreach_transformed_block_in_plane(xd, bsize, 0, is_skippable, &args);
- return result;
-}
-
-int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane) {
int result = 1;
struct is_skippable_args args = {xd, &result};
- foreach_transformed_block_uv(xd, bsize, is_skippable, &args);
+ foreach_transformed_block_in_plane(xd, bsize, plane, is_skippable, &args);
return result;
}
void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
- BLOCK_SIZE_TYPE bsize) {
+ BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
TOKENEXTRA *t_backup = *t;
const int mb_skip_context = vp9_get_pred_context_mbskip(xd);
- const int skip_inc = !vp9_segfeature_active(&xd->seg, mbmi->segment_id,
+ const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_SKIP);
- struct tokenize_b_args arg = {cpi, xd, t, mbmi->txfm_size};
+ struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size};
- mbmi->mb_skip_coeff = vp9_sb_is_skippable(xd, bsize);
- if (mbmi->mb_skip_coeff) {
+ mbmi->skip_coeff = vp9_sb_is_skippable(xd, bsize);
+ if (mbmi->skip_coeff) {
if (!dry_run)
cm->counts.mbskip[mb_skip_context][1] += skip_inc;
- vp9_reset_sb_tokens_context(xd, bsize);
+ reset_skip_context(xd, bsize);
if (dry_run)
*t = t_backup;
return;
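
The tokenizer refactor above drops the inlined offset arithmetic in favor of txfrm_block_to_raster_xy(). Based on the removed lines, that helper recovers the (column, row) offset, in 4x4 units, of a raster-ordered transform block within its plane; a sketch mirroring the old arithmetic (assuming plane_width_log2 already accounts for chroma subsampling):

static void block_to_raster_xy(int plane_width_log2, int tx_size,
                               int block, int *aoff, int *loff) {
  const int tx_cols_log2 = plane_width_log2 - tx_size;
  const int raster = block >> (2 * tx_size);  /* index among tx blocks */
  *aoff = (raster & ((1 << tx_cols_log2) - 1)) << tx_size;
  *loff = (raster >> tx_cols_log2) << tx_size;
}
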
diff --git a/libvpx/vp9/encoder/vp9_tokenize.h b/libvpx/vp9/encoder/vp9_tokenize.h
index 968bec7..b78e100 100644
--- a/libvpx/vp9/encoder/vp9_tokenize.h
+++ b/libvpx/vp9/encoder/vp9_tokenize.h
@@ -31,13 +31,13 @@ typedef struct {
typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
[MAX_ENTROPY_TOKENS + 1];
-int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
-int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
-int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
+int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize);
+int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane);
struct VP9_COMP;
void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
- BLOCK_SIZE_TYPE bsize);
+ BLOCK_SIZE bsize);
#ifdef ENTROPY_STATS
void init_context_counters();
diff --git a/libvpx/vp9/encoder/vp9_variance_c.c b/libvpx/vp9/encoder/vp9_variance_c.c
index 23e7767..155ba8a 100644
--- a/libvpx/vp9/encoder/vp9_variance_c.c
+++ b/libvpx/vp9/encoder/vp9_variance_c.c
@@ -46,12 +46,12 @@ unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
- uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering
+ uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering
uint8_t temp2[68 * 64];
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 33, 64, hfilter);
@@ -68,13 +68,13 @@ unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
- uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering
+ uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering
uint8_t temp2[68 * 64];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 33, 64, hfilter);
@@ -103,12 +103,12 @@ unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
- uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering
+ uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering
uint8_t temp2[68 * 64];
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 65, 32, hfilter);
@@ -125,13 +125,13 @@ unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
- uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering
+ uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering
uint8_t temp2[68 * 64];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64); // compound pred buffer
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 65, 32, hfilter);
@@ -160,12 +160,12 @@ unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
- uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering
+ uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering
uint8_t temp2[36 * 32];
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 17, 32, hfilter);
@@ -182,13 +182,13 @@ unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
- uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering
+ uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering
uint8_t temp2[36 * 32];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16); // compound pred buffer
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 17, 32, hfilter);
@@ -217,12 +217,12 @@ unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
- uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering
+ uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering
uint8_t temp2[36 * 32];
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 33, 16, hfilter);
@@ -239,13 +239,13 @@ unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
- uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering
+ uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering
uint8_t temp2[36 * 32];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32); // compound pred buffer
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 33, 16, hfilter);
@@ -440,10 +440,10 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
unsigned int *sse) {
uint8_t temp2[20 * 16];
const int16_t *hfilter, *vfilter;
- uint16_t fdata3[5 * 4]; // Temp data bufffer used in filtering
+ uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
// First filter 1d Horizontal
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
@@ -466,10 +466,10 @@ unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr,
uint8_t temp2[20 * 16];
const int16_t *hfilter, *vfilter;
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4); // compound pred buffer
- uint16_t fdata3[5 * 4]; // Temp data bufffer used in filtering
+ uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
// First filter 1d Horizontal
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
@@ -488,12 +488,12 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
- uint16_t fdata3[9 * 8]; // Temp data bufffer used in filtering
+ uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering
uint8_t temp2[20 * 16];
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 9, 8, hfilter);
@@ -510,13 +510,13 @@ unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
- uint16_t fdata3[9 * 8]; // Temp data bufffer used in filtering
+ uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering
uint8_t temp2[20 * 16];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8); // compound pred buffer
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 9, 8, hfilter);
@@ -532,12 +532,12 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
- uint16_t fdata3[17 * 16]; // Temp data bufffer used in filtering
+ uint16_t fdata3[17 * 16]; // Temp data buffer used in filtering
uint8_t temp2[20 * 16];
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 17, 16, hfilter);
@@ -559,8 +559,8 @@ unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr,
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16); // compound pred buffer
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 17, 16, hfilter);
@@ -577,12 +577,12 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
- uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering
+ uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering
uint8_t temp2[68 * 64];
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 65, 64, hfilter);
@@ -599,13 +599,13 @@ unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
- uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering
+ uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering
uint8_t temp2[68 * 64];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 65, 64, hfilter);
@@ -621,12 +621,12 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
- uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering
+ uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering
uint8_t temp2[36 * 32];
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 33, 32, hfilter);
@@ -643,13 +643,13 @@ unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
- uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering
+ uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering
uint8_t temp2[36 * 32];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32); // compound pred buffer
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 33, 32, hfilter);
@@ -785,12 +785,12 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
- uint16_t fdata3[16 * 9]; // Temp data bufffer used in filtering
+ uint16_t fdata3[16 * 9]; // Temp data buffer used in filtering
uint8_t temp2[20 * 16];
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 9, 16, hfilter);
@@ -807,13 +807,13 @@ unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
- uint16_t fdata3[16 * 9]; // Temp data bufffer used in filtering
+ uint16_t fdata3[16 * 9]; // Temp data buffer used in filtering
uint8_t temp2[20 * 16];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8); // compound pred buffer
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 9, 16, hfilter);
@@ -829,12 +829,12 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
- uint16_t fdata3[9 * 16]; // Temp data bufffer used in filtering
+ uint16_t fdata3[9 * 16]; // Temp data buffer used in filtering
uint8_t temp2[20 * 16];
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 17, 8, hfilter);
@@ -851,13 +851,13 @@ unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
- uint16_t fdata3[9 * 16]; // Temp data bufffer used in filtering
+ uint16_t fdata3[9 * 16]; // Temp data buffer used in filtering
uint8_t temp2[20 * 16];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16); // compound pred buffer
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 17, 8, hfilter);
@@ -873,12 +873,12 @@ unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
- uint16_t fdata3[8 * 5]; // Temp data bufffer used in filtering
+ uint16_t fdata3[8 * 5]; // Temp data buffer used in filtering
uint8_t temp2[20 * 16];
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 5, 8, hfilter);
@@ -895,13 +895,13 @@ unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
- uint16_t fdata3[8 * 5]; // Temp data bufffer used in filtering
+ uint16_t fdata3[8 * 5]; // Temp data buffer used in filtering
uint8_t temp2[20 * 16];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 4); // compound pred buffer
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 5, 8, hfilter);
@@ -917,14 +917,14 @@ unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
- uint16_t fdata3[5 * 8]; // Temp data bufffer used in filtering
+ uint16_t fdata3[5 * 8]; // Temp data buffer used in filtering
// FIXME(jingning,rbultje): this temp2 buffer probably doesn't need to be
// this big; the same issue appears in all other block size settings.
uint8_t temp2[20 * 16];
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 9, 4, hfilter);
@@ -941,13 +941,13 @@ unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr,
int dst_pixels_per_line,
unsigned int *sse,
const uint8_t *second_pred) {
- uint16_t fdata3[5 * 8]; // Temp data bufffer used in filtering
+ uint16_t fdata3[5 * 8]; // Temp data buffer used in filtering
uint8_t temp2[20 * 16];
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 8); // compound pred buffer
const int16_t *hfilter, *vfilter;
- hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
- vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+ hfilter = BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
1, 9, 4, hfilter);
diff --git a/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
new file mode 100644
index 0000000..95ae266
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -0,0 +1,2650 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+#include "vp9/common/vp9_idct.h" // for cospi constants
+#include "vpx_ports/mem.h"
+
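+// Note: FDCT32x32_2D and FDCT32x32_HIGH_PRECISION are not defined in this
+// file; they are expected to be supplied by the translation unit that
+// compiles it, which presumably builds the same source twice, once per
+// precision variant (see the #if blocks below).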
+#if FDCT32x32_HIGH_PRECISION
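+// 32-bit analogue of _mm_madd_epi16: forms the 64-bit products of the even
+// (lanes 0, 2) and odd (lanes 1, 3) 32-bit elements of a and b (using
+// unsigned multiplies) and returns the pairwise sums
+// {a0*b0 + a1*b1, a2*b2 + a3*b3}.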
+static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
+ __m128i buf0, buf1;
+ buf0 = _mm_mul_epu32(a, b);
+ a = _mm_srli_epi64(a, 32);
+ b = _mm_srli_epi64(b, 32);
+ buf1 = _mm_mul_epu32(a, b);
+ return _mm_add_epi64(buf0, buf1);
+}
+
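+// Packs the low 32 bits of the two 64-bit lanes of a and of b into one
+// vector of four 32-bit values: {a.lo0, a.lo1, b.lo0, b.lo1}.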
+static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
+ __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm_unpacklo_epi64(buf0, buf1);
+}
+
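+// SSE2 has no 16-to-32-bit sign extension (_mm_cvtepi16_epi32 is SSE4.1), so
+// the two helpers below widen by hand: mask16 is expected to hold the 16-bit
+// sign bit in every lane, and the compare against zero turns it into the
+// high word of each widened lane.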
+static INLINE __m128i k_cvtlo_epi16(__m128i a, __m128i mask16, __m128i kZero) {
+ // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers
+ __m128i sign_bit = _mm_and_si128(a, mask16);
+ __m128i b = _mm_unpacklo_epi16(a, kZero);
+ sign_bit = _mm_cmplt_epi16(sign_bit, kZero);
+ sign_bit = _mm_unpacklo_epi16(kZero, sign_bit);
+ return _mm_or_si128(sign_bit, b);
+}
+
+static INLINE __m128i k_cvthi_epi16(__m128i a, __m128i mask16, __m128i kZero) {
+ // convert the upper 4 signed 16-bit integers into 4 signed 32-bit integers
+ __m128i sign_bit = _mm_and_si128(a, mask16);
+ __m128i b = _mm_unpackhi_epi16(a, kZero);
+ sign_bit = _mm_cmplt_epi16(sign_bit, kZero);
+ sign_bit = _mm_unpackhi_epi16(kZero, sign_bit);
+ return _mm_or_si128(sign_bit, b);
+}
+#endif
+
+void FDCT32x32_2D(int16_t *input,
+ int16_t *output_org, int pitch) {
+ // Calculate pre-multiplied strides
+ const int str1 = pitch >> 1;
+ const int str2 = pitch;
+ const int str3 = pitch + str1;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
+ // Constants
+ // In one case all eight lanes hold the same constant; in all others a
+ // pair of constants is repeated four times, built by constructing the
+ // 32-bit value corresponding to that pair.
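+ // pair_set_epi16(a, b) fills all eight 16-bit lanes with the alternating
+ // pair (a, b), so _mm_madd_epi16 against unpacked (x, y) data yields
+ // a*x + b*y in each 32-bit lane; every butterfly rotation below relies on
+ // this.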
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
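+ // Pass 0 runs the 1-D transform down the input columns and writes the
+ // (transposed) result into `intermediate`; pass 1 repeats the transform on
+ // the columns of `intermediate`, completing the 2-D DCT.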
+ int pass;
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 32; column_start += 8) {
+ __m128i step1[32];
+ __m128i step2[32];
+ __m128i step3[32];
+ __m128i out[32];
+ // Stage 1
+ // Note: even though all the loads below are aligned, using the aligned
+ // intrinsic makes the code slightly slower.
+ if (0 == pass) {
+ int16_t *in = &input[column_start];
+ // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
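+ // Each block below handles four mirrored row pairs of the butterfly:
+ // step1[i] = (in[i] + in[31 - i]) << 2 and
+ // step1[31 - i] = (in[i] - in[31 - i]) << 2; the << 2 applies the input
+ // scaling of the first pass.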
+ {
+ int16_t *ina = in + 0 * str1;
+ int16_t *inb = in + 31 * str1;
+ __m128i *step1a = &step1[ 0];
+ __m128i *step1b = &step1[31];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ int16_t *ina = in + 4 * str1;
+ int16_t *inb = in + 27 * str1;
+ __m128i *step1a = &step1[ 4];
+ __m128i *step1b = &step1[27];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ int16_t *ina = in + 8 * str1;
+ int16_t *inb = in + 23 * str1;
+ __m128i *step1a = &step1[ 8];
+ __m128i *step1b = &step1[23];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ int16_t *ina = in + 12 * str1;
+ int16_t *inb = in + 19 * str1;
+ __m128i *step1a = &step1[12];
+ __m128i *step1b = &step1[19];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ } else {
+ int16_t *in = &intermediate[column_start];
+ // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
+ // Note: reusing the common-offset approach above is counter-productive
+ // here, since all the offsets can be computed at compile time.
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
+ __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
+ __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
+ __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
+ __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
+ __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
+ __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
+ __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
+ step1[ 0] = _mm_add_epi16(in00, in31);
+ step1[ 1] = _mm_add_epi16(in01, in30);
+ step1[ 2] = _mm_add_epi16(in02, in29);
+ step1[ 3] = _mm_add_epi16(in03, in28);
+ step1[28] = _mm_sub_epi16(in03, in28);
+ step1[29] = _mm_sub_epi16(in02, in29);
+ step1[30] = _mm_sub_epi16(in01, in30);
+ step1[31] = _mm_sub_epi16(in00, in31);
+ }
+ {
+ __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
+ __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
+ __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
+ __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
+ __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
+ __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
+ __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
+ __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
+ step1[ 4] = _mm_add_epi16(in04, in27);
+ step1[ 5] = _mm_add_epi16(in05, in26);
+ step1[ 6] = _mm_add_epi16(in06, in25);
+ step1[ 7] = _mm_add_epi16(in07, in24);
+ step1[24] = _mm_sub_epi16(in07, in24);
+ step1[25] = _mm_sub_epi16(in06, in25);
+ step1[26] = _mm_sub_epi16(in05, in26);
+ step1[27] = _mm_sub_epi16(in04, in27);
+ }
+ {
+ __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
+ __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
+ __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
+ __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
+ __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
+ __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
+ __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
+ __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
+ step1[ 8] = _mm_add_epi16(in08, in23);
+ step1[ 9] = _mm_add_epi16(in09, in22);
+ step1[10] = _mm_add_epi16(in10, in21);
+ step1[11] = _mm_add_epi16(in11, in20);
+ step1[20] = _mm_sub_epi16(in11, in20);
+ step1[21] = _mm_sub_epi16(in10, in21);
+ step1[22] = _mm_sub_epi16(in09, in22);
+ step1[23] = _mm_sub_epi16(in08, in23);
+ }
+ {
+ __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
+ __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
+ __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
+ __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
+ __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
+ __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
+ __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
+ __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
+ step1[12] = _mm_add_epi16(in12, in19);
+ step1[13] = _mm_add_epi16(in13, in18);
+ step1[14] = _mm_add_epi16(in14, in17);
+ step1[15] = _mm_add_epi16(in15, in16);
+ step1[16] = _mm_sub_epi16(in15, in16);
+ step1[17] = _mm_sub_epi16(in14, in17);
+ step1[18] = _mm_sub_epi16(in13, in18);
+ step1[19] = _mm_sub_epi16(in12, in19);
+ }
+ }
+ // Stage 2
+ {
+ step2[ 0] = _mm_add_epi16(step1[0], step1[15]);
+ step2[ 1] = _mm_add_epi16(step1[1], step1[14]);
+ step2[ 2] = _mm_add_epi16(step1[2], step1[13]);
+ step2[ 3] = _mm_add_epi16(step1[3], step1[12]);
+ step2[ 4] = _mm_add_epi16(step1[4], step1[11]);
+ step2[ 5] = _mm_add_epi16(step1[5], step1[10]);
+ step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]);
+ step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]);
+ step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]);
+ step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]);
+ step2[10] = _mm_sub_epi16(step1[5], step1[10]);
+ step2[11] = _mm_sub_epi16(step1[4], step1[11]);
+ step2[12] = _mm_sub_epi16(step1[3], step1[12]);
+ step2[13] = _mm_sub_epi16(step1[2], step1[13]);
+ step2[14] = _mm_sub_epi16(step1[1], step1[14]);
+ step2[15] = _mm_sub_epi16(step1[0], step1[15]);
+ }
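+ // step2[20..27] need a rotation by cospi_16_64:
+ // step2[20 + i] = (step1[27 - i] - step1[20 + i]) * cospi_16_64 and
+ // step2[27 - i] = (step1[27 - i] + step1[20 + i]) * cospi_16_64,
+ // each followed by dct_const_round_shift.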
+ {
+ const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
+ const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
+ const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
+ const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
+ const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
+ const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
+ const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
+ const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
+ const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
+ const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
+ const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
+ const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
+ const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
+ const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
+ const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
+ const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
+ const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
+ const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
+ const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
+ const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
+ const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
+ const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
+ const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
+ const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
+ // dct_const_round_shift
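+ // i.e. (x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS, rounding the
+ // fixed-point products back to coefficient precision.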
+ const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
+ const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
+ const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
+ const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
+ const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
+ const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
+ const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
+ const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
+ const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
+ const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
+ const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
+ const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
+ const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
+ const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
+ const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
+ const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
+ // Combine
+ step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
+ step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
+ step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
+ step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
+ step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
+ step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
+ step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
+ step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
+ }
+
+#if !FDCT32x32_HIGH_PRECISION
+ // Scale the magnitude down (a rounded divide by four) so that the
+ // intermediate values stay within the range of 16 bits.
+ if (1 == pass) {
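+ // _mm_cmplt_epi16 returns 0xffff (-1) in negative lanes, so subtracting
+ // the mask adds 1 to negative values; combined with the + kOne and the
+ // >> 2 below, this computes (x + 1 + (x < 0)) >> 2, a symmetric
+ // round-to-nearest divide by four.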
+ __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero);
+ __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero);
+ __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero);
+ __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero);
+ __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero);
+ __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero);
+ __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero);
+ __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero);
+ __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
+ __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
+ __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
+ __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
+ __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
+ __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
+ __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
+ __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
+ __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
+ __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
+ __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
+ __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
+ __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
+ __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
+ __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
+ __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
+ __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
+ __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
+ __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
+ __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
+ __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
+ __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
+ __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
+ __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
+
+ step2[ 0] = _mm_sub_epi16(step2[ 0], s3_00_0);
+ step2[ 1] = _mm_sub_epi16(step2[ 1], s3_01_0);
+ step2[ 2] = _mm_sub_epi16(step2[ 2], s3_02_0);
+ step2[ 3] = _mm_sub_epi16(step2[ 3], s3_03_0);
+ step2[ 4] = _mm_sub_epi16(step2[ 4], s3_04_0);
+ step2[ 5] = _mm_sub_epi16(step2[ 5], s3_05_0);
+ step2[ 6] = _mm_sub_epi16(step2[ 6], s3_06_0);
+ step2[ 7] = _mm_sub_epi16(step2[ 7], s3_07_0);
+ step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
+ step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
+ step2[10] = _mm_sub_epi16(step2[10], s3_10_0);
+ step2[11] = _mm_sub_epi16(step2[11], s3_11_0);
+ step2[12] = _mm_sub_epi16(step2[12], s3_12_0);
+ step2[13] = _mm_sub_epi16(step2[13], s3_13_0);
+ step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
+ step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
+ step1[16] = _mm_sub_epi16(step1[16], s3_16_0);
+ step1[17] = _mm_sub_epi16(step1[17], s3_17_0);
+ step1[18] = _mm_sub_epi16(step1[18], s3_18_0);
+ step1[19] = _mm_sub_epi16(step1[19], s3_19_0);
+ step2[20] = _mm_sub_epi16(step2[20], s3_20_0);
+ step2[21] = _mm_sub_epi16(step2[21], s3_21_0);
+ step2[22] = _mm_sub_epi16(step2[22], s3_22_0);
+ step2[23] = _mm_sub_epi16(step2[23], s3_23_0);
+ step2[24] = _mm_sub_epi16(step2[24], s3_24_0);
+ step2[25] = _mm_sub_epi16(step2[25], s3_25_0);
+ step2[26] = _mm_sub_epi16(step2[26], s3_26_0);
+ step2[27] = _mm_sub_epi16(step2[27], s3_27_0);
+ step1[28] = _mm_sub_epi16(step1[28], s3_28_0);
+ step1[29] = _mm_sub_epi16(step1[29], s3_29_0);
+ step1[30] = _mm_sub_epi16(step1[30], s3_30_0);
+ step1[31] = _mm_sub_epi16(step1[31], s3_31_0);
+
+ step2[ 0] = _mm_add_epi16(step2[ 0], kOne);
+ step2[ 1] = _mm_add_epi16(step2[ 1], kOne);
+ step2[ 2] = _mm_add_epi16(step2[ 2], kOne);
+ step2[ 3] = _mm_add_epi16(step2[ 3], kOne);
+ step2[ 4] = _mm_add_epi16(step2[ 4], kOne);
+ step2[ 5] = _mm_add_epi16(step2[ 5], kOne);
+ step2[ 6] = _mm_add_epi16(step2[ 6], kOne);
+ step2[ 7] = _mm_add_epi16(step2[ 7], kOne);
+ step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
+ step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
+ step2[10] = _mm_add_epi16(step2[10], kOne);
+ step2[11] = _mm_add_epi16(step2[11], kOne);
+ step2[12] = _mm_add_epi16(step2[12], kOne);
+ step2[13] = _mm_add_epi16(step2[13], kOne);
+ step2[14] = _mm_add_epi16(step2[14], kOne);
+ step2[15] = _mm_add_epi16(step2[15], kOne);
+ step1[16] = _mm_add_epi16(step1[16], kOne);
+ step1[17] = _mm_add_epi16(step1[17], kOne);
+ step1[18] = _mm_add_epi16(step1[18], kOne);
+ step1[19] = _mm_add_epi16(step1[19], kOne);
+ step2[20] = _mm_add_epi16(step2[20], kOne);
+ step2[21] = _mm_add_epi16(step2[21], kOne);
+ step2[22] = _mm_add_epi16(step2[22], kOne);
+ step2[23] = _mm_add_epi16(step2[23], kOne);
+ step2[24] = _mm_add_epi16(step2[24], kOne);
+ step2[25] = _mm_add_epi16(step2[25], kOne);
+ step2[26] = _mm_add_epi16(step2[26], kOne);
+ step2[27] = _mm_add_epi16(step2[27], kOne);
+ step1[28] = _mm_add_epi16(step1[28], kOne);
+ step1[29] = _mm_add_epi16(step1[29], kOne);
+ step1[30] = _mm_add_epi16(step1[30], kOne);
+ step1[31] = _mm_add_epi16(step1[31], kOne);
+
+ step2[ 0] = _mm_srai_epi16(step2[ 0], 2);
+ step2[ 1] = _mm_srai_epi16(step2[ 1], 2);
+ step2[ 2] = _mm_srai_epi16(step2[ 2], 2);
+ step2[ 3] = _mm_srai_epi16(step2[ 3], 2);
+ step2[ 4] = _mm_srai_epi16(step2[ 4], 2);
+ step2[ 5] = _mm_srai_epi16(step2[ 5], 2);
+ step2[ 6] = _mm_srai_epi16(step2[ 6], 2);
+ step2[ 7] = _mm_srai_epi16(step2[ 7], 2);
+ step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
+ step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
+ step2[10] = _mm_srai_epi16(step2[10], 2);
+ step2[11] = _mm_srai_epi16(step2[11], 2);
+ step2[12] = _mm_srai_epi16(step2[12], 2);
+ step2[13] = _mm_srai_epi16(step2[13], 2);
+ step2[14] = _mm_srai_epi16(step2[14], 2);
+ step2[15] = _mm_srai_epi16(step2[15], 2);
+ step1[16] = _mm_srai_epi16(step1[16], 2);
+ step1[17] = _mm_srai_epi16(step1[17], 2);
+ step1[18] = _mm_srai_epi16(step1[18], 2);
+ step1[19] = _mm_srai_epi16(step1[19], 2);
+ step2[20] = _mm_srai_epi16(step2[20], 2);
+ step2[21] = _mm_srai_epi16(step2[21], 2);
+ step2[22] = _mm_srai_epi16(step2[22], 2);
+ step2[23] = _mm_srai_epi16(step2[23], 2);
+ step2[24] = _mm_srai_epi16(step2[24], 2);
+ step2[25] = _mm_srai_epi16(step2[25], 2);
+ step2[26] = _mm_srai_epi16(step2[26], 2);
+ step2[27] = _mm_srai_epi16(step2[27], 2);
+ step1[28] = _mm_srai_epi16(step1[28], 2);
+ step1[29] = _mm_srai_epi16(step1[29], 2);
+ step1[30] = _mm_srai_epi16(step1[30], 2);
+ step1[31] = _mm_srai_epi16(step1[31], 2);
+ }
+#endif
+
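+ // In the high-precision build, only pass 0 takes the 16-bit path below;
+ // pass 1 presumably continues in 32-bit arithmetic past the matching
+ // #else further down.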
+#if FDCT32x32_HIGH_PRECISION
+ if (pass == 0) {
+#endif
+ // Stage 3
+ {
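+ // step3[i] = step2[7 - i] + step2[i] for i < 4 and
+ // step2[7 - i] - step2[i] otherwise; the (8 - n) indices are just the
+ // mirrored operand spelled out as constants.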
+ step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
+ step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
+ step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
+ step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
+ step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
+ step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
+ step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
+ step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
+ }
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
+ }
+ {
+ step3[16] = _mm_add_epi16(step2[23], step1[16]);
+ step3[17] = _mm_add_epi16(step2[22], step1[17]);
+ step3[18] = _mm_add_epi16(step2[21], step1[18]);
+ step3[19] = _mm_add_epi16(step2[20], step1[19]);
+ step3[20] = _mm_sub_epi16(step1[19], step2[20]);
+ step3[21] = _mm_sub_epi16(step1[18], step2[21]);
+ step3[22] = _mm_sub_epi16(step1[17], step2[22]);
+ step3[23] = _mm_sub_epi16(step1[16], step2[23]);
+ step3[24] = _mm_sub_epi16(step1[31], step2[24]);
+ step3[25] = _mm_sub_epi16(step1[30], step2[25]);
+ step3[26] = _mm_sub_epi16(step1[29], step2[26]);
+ step3[27] = _mm_sub_epi16(step1[28], step2[27]);
+ step3[28] = _mm_add_epi16(step2[27], step1[28]);
+ step3[29] = _mm_add_epi16(step2[26], step1[29]);
+ step3[30] = _mm_add_epi16(step2[25], step1[30]);
+ step3[31] = _mm_add_epi16(step2[24], step1[31]);
+ }
+
+ // Stage 4
+ {
+ step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]);
+ step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]);
+ step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]);
+ step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]);
+ step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]);
+ step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]);
+ step1[10] = _mm_sub_epi16(step2[ 9], step3[10]);
+ step1[11] = _mm_sub_epi16(step2[ 8], step3[11]);
+ step1[12] = _mm_sub_epi16(step2[15], step3[12]);
+ step1[13] = _mm_sub_epi16(step2[14], step3[13]);
+ step1[14] = _mm_add_epi16(step3[13], step2[14]);
+ step1[15] = _mm_add_epi16(step3[12], step2[15]);
+ }
+ {
+ const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
+ const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
+ const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
+ }
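+ // Rotate the outer tails by the (8, 24) constant pair, e.g.
+ // step1[18] = step3[18] * -cospi_8_64 + step3[29] * cospi_24_64 and
+ // step1[29] = step3[18] * cospi_24_64 + step3[29] * cospi_8_64.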
+ {
+ const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
+ const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
+ const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
+ const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
+ const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
+ const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
+ const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
+ const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
+ const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+ }
+ // Stage 5
+ {
+ step2[4] = _mm_add_epi16(step1[5], step3[4]);
+ step2[5] = _mm_sub_epi16(step3[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step3[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[6], step3[7]);
+ }
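+ // First four outputs: out[0]/out[16] rotate (step1[0], step1[1]) by
+ // cospi_16_64 (their scaled sum and difference); out[8]/out[24] rotate
+ // (step1[2], step1[3]) by (cospi_24_64, cospi_8_64).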
+ {
+ const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
+ const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
+ const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
+ const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
+ const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[ 0] = _mm_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm_packs_epi32(out_16_6, out_16_7);
+ out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+ }
+ {
+ const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]);
+ const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]);
+ const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
+ const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
+ const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+ }
+ {
+ step2[16] = _mm_add_epi16(step1[19], step3[16]);
+ step2[17] = _mm_add_epi16(step1[18], step3[17]);
+ step2[18] = _mm_sub_epi16(step3[17], step1[18]);
+ step2[19] = _mm_sub_epi16(step3[16], step1[19]);
+ step2[20] = _mm_sub_epi16(step3[23], step1[20]);
+ step2[21] = _mm_sub_epi16(step3[22], step1[21]);
+ step2[22] = _mm_add_epi16(step1[21], step3[22]);
+ step2[23] = _mm_add_epi16(step1[20], step3[23]);
+ step2[24] = _mm_add_epi16(step1[27], step3[24]);
+ step2[25] = _mm_add_epi16(step1[26], step3[25]);
+ step2[26] = _mm_sub_epi16(step3[25], step1[26]);
+ step2[27] = _mm_sub_epi16(step3[24], step1[27]);
+ step2[28] = _mm_sub_epi16(step3[31], step1[28]);
+ step2[29] = _mm_sub_epi16(step3[30], step1[29]);
+ step2[30] = _mm_add_epi16(step1[29], step3[30]);
+ step2[31] = _mm_add_epi16(step1[28], step3[31]);
+ }
+ // Stage 6
+ {
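+ // out[4]/out[28] rotate (step2[4], step2[7]) by (cospi_28_64,
+ // cospi_4_64); out[20]/out[12] rotate (step2[5], step2[6]) by
+ // (cospi_12_64, cospi_20_64).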
+ const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[ 4] = _mm_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+ }
+ {
+ step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]);
+ step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]);
+ step3[10] = _mm_sub_epi16(step1[11], step2[10]);
+ step3[11] = _mm_add_epi16(step2[10], step1[11]);
+ step3[12] = _mm_add_epi16(step2[13], step1[12]);
+ step3[13] = _mm_sub_epi16(step1[12], step2[13]);
+ step3[14] = _mm_sub_epi16(step1[15], step2[14]);
+ step3[15] = _mm_add_epi16(step2[14], step1[15]);
+ }
+ {
+ const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
+ const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
+ const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
+ const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
+ const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
+ const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
+ const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
+ const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
+ const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
+ step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+ }
+ // Stage 7
+ {
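+ // Outputs 2, 18, 10, 26, 6, 22, 14 and 30 come from rotating the
+ // (step3[8 + i], step3[15 - i]) pairs by the cospi_{30,14,22,6} /
+ // cospi_{2,18,10,26} constant pairs above.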
+ const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]);
+ const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]);
+ const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]);
+ const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]);
+ const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
+ const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
+ const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
+ const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
+ const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[ 2] = _mm_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm_packs_epi32(out_26_6, out_26_7);
+ out[ 6] = _mm_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+ }
+ {
+ step1[16] = _mm_add_epi16(step3[17], step2[16]);
+ step1[17] = _mm_sub_epi16(step2[16], step3[17]);
+ step1[18] = _mm_sub_epi16(step2[19], step3[18]);
+ step1[19] = _mm_add_epi16(step3[18], step2[19]);
+ step1[20] = _mm_add_epi16(step3[21], step2[20]);
+ step1[21] = _mm_sub_epi16(step2[20], step3[21]);
+ step1[22] = _mm_sub_epi16(step2[23], step3[22]);
+ step1[23] = _mm_add_epi16(step3[22], step2[23]);
+ step1[24] = _mm_add_epi16(step3[25], step2[24]);
+ step1[25] = _mm_sub_epi16(step2[24], step3[25]);
+ step1[26] = _mm_sub_epi16(step2[27], step3[26]);
+ step1[27] = _mm_add_epi16(step3[26], step2[27]);
+ step1[28] = _mm_add_epi16(step3[29], step2[28]);
+ step1[29] = _mm_sub_epi16(step2[28], step3[29]);
+ step1[30] = _mm_sub_epi16(step2[31], step3[30]);
+ step1[31] = _mm_add_epi16(step3[30], step2[31]);
+ }
+ // Final stage: output indices are bit-reversed.
+ {
+ const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
+ const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
+ const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
+ const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
+ const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
+ const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
+ const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
+ const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
+ const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[ 1] = _mm_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm_packs_epi32(out_17_6, out_17_7);
+ out[ 9] = _mm_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm_packs_epi32(out_25_6, out_25_7);
+ out[ 7] = _mm_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+ }
+ {
+ const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
+ const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
+ const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
+ const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
+ const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
+ const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
+ const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
+ const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
+ const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[ 5] = _mm_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm_packs_epi32(out_29_6, out_29_7);
+ out[ 3] = _mm_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+ }
+#if FDCT32x32_HIGH_PRECISION
+ } else {
+ __m128i lstep1[64], lstep2[64], lstep3[64];
+ __m128i u[32], v[32], sign[16];
+ const __m128i mask16 = _mm_set1_epi32(0x80008000);
+ const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
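+ // High-precision variant: intermediates are widened to 32 bits,
+ // presumably to avoid overflowing the 16-bit lanes used by the path
+ // above.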
+ // start using 32-bit operations
+ // stage 3
+ {
+ // expanding to 32-bit length prior to addition operations
+ lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero);
+ lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero);
+ lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero);
+ lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero);
+ lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero);
+ lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero);
+ lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero);
+ lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero);
+ lstep2[ 8] = k_cvtlo_epi16(step2[ 4], mask16, kZero);
+ lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero);
+ lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero);
+ lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero);
+ lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero);
+ lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero);
+ lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero);
+ lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero);
+
+ lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
+ lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
+ lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]);
+ lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]);
+ lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]);
+ lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]);
+ lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]);
+ lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]);
+ lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]);
+ lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]);
+ lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]);
+ lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]);
+ lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]);
+ lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]);
+ lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]);
+ lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]);
+ }
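+ // The next block rotates step2[10..13] by cospi_16_64. _mm_madd_epi16
+ // already yields 32-bit products, so after dct_const_round_shift the
+ // results land directly in the 32-bit lstep3 lanes.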
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ }
+ {
+ lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero);
+ lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero);
+ lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero);
+ lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero);
+ lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero);
+ lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero);
+ lstep2[46] = k_cvtlo_epi16(step2[23], mask16, kZero);
+ lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero);
+ lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero);
+ lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero);
+ lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero);
+ lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero);
+ lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero);
+ lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero);
+ lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero);
+ lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero);
+
+ lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero);
+ lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero);
+ lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero);
+ lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero);
+ lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero);
+ lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero);
+ lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero);
+ lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero);
+ lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero);
+ lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero);
+ lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero);
+ lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero);
+ lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero);
+ lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero);
+ lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero);
+ lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero);
+
+ lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
+ lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
+ lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
+ lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
+ lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
+ lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
+ lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
+ lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
+ lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
+ lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
+ lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
+ lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
+ lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
+ lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
+ lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
+ lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
+ lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
+ lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
+ lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
+ lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
+ lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
+ lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
+ lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
+ lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
+ lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
+ lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
+ lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
+ lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
+ lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
+ lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
+ lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
+ lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
+ }
+
+ // stage 4
+ {
+ // expanding to 32-bit length prior to addition operations
+ lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero);
+ lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero);
+ lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero);
+ lstep2[19] = k_cvthi_epi16(step2[ 9], mask16, kZero);
+ lstep2[28] = k_cvtlo_epi16(step2[14], mask16, kZero);
+ lstep2[29] = k_cvthi_epi16(step2[14], mask16, kZero);
+ lstep2[30] = k_cvtlo_epi16(step2[15], mask16, kZero);
+ lstep2[31] = k_cvthi_epi16(step2[15], mask16, kZero);
+
+ lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]);
+ lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]);
+ lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]);
+ lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]);
+ lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]);
+ lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]);
+ lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]);
+ lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]);
+ lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
+ lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
+ lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
+ lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
+ lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
+ lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
+ lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
+ lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
+ lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
+ lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
+ lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
+ lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
+ lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
+ lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
+ lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
+ lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
+ }
+ {
+ // Rotation by cospi_16_64, performed at 32-bit precision via the
+ // k_madd_epi32 helper.
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+
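+ // k_madd_epi32 and k_packs_epi64 below are local helpers defined
+ // earlier in this file; together they appear to emulate _mm_madd_epi16
+ // at 32-bit input precision (32x32 -> 64-bit multiply-add, packed back
+ // to 32 bits) ahead of the usual dct_const_round_shift.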
+ u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+ // TODO(jingning): manually inline k_madd_epi32 to further hide
+ // instruction latency.
+ v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
+ v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
+ v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
+ v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
+ v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
+ v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
+ v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
+ v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
+
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ }
+ {
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
+ u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
+ u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
+ u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
+ u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
+ u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
+ u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
+ u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
+ u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
+ u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
+ u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
+ u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
+ u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
+ u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
+ u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
+ u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+ v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24);
+ v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24);
+ v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24);
+ v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24);
+ v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24);
+ v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24);
+ v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24);
+ v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24);
+ v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08);
+ v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08);
+ v[10] = k_madd_epi32(u[10], k32_m24_m08);
+ v[11] = k_madd_epi32(u[11], k32_m24_m08);
+ v[12] = k_madd_epi32(u[12], k32_m24_m08);
+ v[13] = k_madd_epi32(u[13], k32_m24_m08);
+ v[14] = k_madd_epi32(u[14], k32_m24_m08);
+ v[15] = k_madd_epi32(u[15], k32_m24_m08);
+ v[16] = k_madd_epi32(u[12], k32_m08_p24);
+ v[17] = k_madd_epi32(u[13], k32_m08_p24);
+ v[18] = k_madd_epi32(u[14], k32_m08_p24);
+ v[19] = k_madd_epi32(u[15], k32_m08_p24);
+ v[20] = k_madd_epi32(u[ 8], k32_m08_p24);
+ v[21] = k_madd_epi32(u[ 9], k32_m08_p24);
+ v[22] = k_madd_epi32(u[10], k32_m08_p24);
+ v[23] = k_madd_epi32(u[11], k32_m08_p24);
+ v[24] = k_madd_epi32(u[ 4], k32_p24_p08);
+ v[25] = k_madd_epi32(u[ 5], k32_p24_p08);
+ v[26] = k_madd_epi32(u[ 6], k32_p24_p08);
+ v[27] = k_madd_epi32(u[ 7], k32_p24_p08);
+ v[28] = k_madd_epi32(u[ 0], k32_p24_p08);
+ v[29] = k_madd_epi32(u[ 1], k32_p24_p08);
+ v[30] = k_madd_epi32(u[ 2], k32_p24_p08);
+ v[31] = k_madd_epi32(u[ 3], k32_p24_p08);
+
+ u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64(v[10], v[11]);
+ u[ 6] = k_packs_epi64(v[12], v[13]);
+ u[ 7] = k_packs_epi64(v[14], v[15]);
+ u[ 8] = k_packs_epi64(v[16], v[17]);
+ u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+ lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+ lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+ lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+ lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+ lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+ lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+ lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+ lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+ lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 5
+ {
+ lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]);
+ lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]);
+ lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]);
+ lstep2[11] = _mm_sub_epi32(lstep3[ 9], lstep1[11]);
+ lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
+ lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
+ lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
+ lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
+ }
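+ // The next block produces the even outputs out[0], out[16], out[8] and
+ // out[24] entirely at 32-bit precision.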
+ {
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
+ u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
+ u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
+ u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
+ u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
+ u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
+ u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
+ u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+ // TODO(jingning): manually inline k_madd_epi32 to further hide
+ // instruction latency.
+ v[ 0] = k_madd_epi32(u[0], k32_p16_p16);
+ v[ 1] = k_madd_epi32(u[1], k32_p16_p16);
+ v[ 2] = k_madd_epi32(u[2], k32_p16_p16);
+ v[ 3] = k_madd_epi32(u[3], k32_p16_p16);
+ v[ 4] = k_madd_epi32(u[0], k32_p16_m16);
+ v[ 5] = k_madd_epi32(u[1], k32_p16_m16);
+ v[ 6] = k_madd_epi32(u[2], k32_p16_m16);
+ v[ 7] = k_madd_epi32(u[3], k32_p16_m16);
+ v[ 8] = k_madd_epi32(u[4], k32_p24_p08);
+ v[ 9] = k_madd_epi32(u[5], k32_p24_p08);
+ v[10] = k_madd_epi32(u[6], k32_p24_p08);
+ v[11] = k_madd_epi32(u[7], k32_p24_p08);
+ v[12] = k_madd_epi32(u[4], k32_m08_p24);
+ v[13] = k_madd_epi32(u[5], k32_m08_p24);
+ v[14] = k_madd_epi32(u[6], k32_m08_p24);
+ v[15] = k_madd_epi32(u[7], k32_m08_p24);
+
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
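+ // Final per-output rounding: subtracting the comparison mask adds 1 to
+ // each negative lane, so the sequence below computes the scalar
+ // expression (x + 1 + (x < 0)) >> 2 before packing down to 16 bits.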
+ sign[0] = _mm_cmplt_epi32(u[0], kZero);
+ sign[1] = _mm_cmplt_epi32(u[1], kZero);
+ sign[2] = _mm_cmplt_epi32(u[2], kZero);
+ sign[3] = _mm_cmplt_epi32(u[3], kZero);
+ sign[4] = _mm_cmplt_epi32(u[4], kZero);
+ sign[5] = _mm_cmplt_epi32(u[5], kZero);
+ sign[6] = _mm_cmplt_epi32(u[6], kZero);
+ sign[7] = _mm_cmplt_epi32(u[7], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], sign[0]);
+ u[1] = _mm_sub_epi32(u[1], sign[1]);
+ u[2] = _mm_sub_epi32(u[2], sign[2]);
+ u[3] = _mm_sub_epi32(u[3], sign[3]);
+ u[4] = _mm_sub_epi32(u[4], sign[4]);
+ u[5] = _mm_sub_epi32(u[5], sign[5]);
+ u[6] = _mm_sub_epi32(u[6], sign[6]);
+ u[7] = _mm_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm_add_epi32(u[0], K32One);
+ u[1] = _mm_add_epi32(u[1], K32One);
+ u[2] = _mm_add_epi32(u[2], K32One);
+ u[3] = _mm_add_epi32(u[3], K32One);
+ u[4] = _mm_add_epi32(u[4], K32One);
+ u[5] = _mm_add_epi32(u[5], K32One);
+ u[6] = _mm_add_epi32(u[6], K32One);
+ u[7] = _mm_add_epi32(u[7], K32One);
+
+ u[0] = _mm_srai_epi32(u[0], 2);
+ u[1] = _mm_srai_epi32(u[1], 2);
+ u[2] = _mm_srai_epi32(u[2], 2);
+ u[3] = _mm_srai_epi32(u[3], 2);
+ u[4] = _mm_srai_epi32(u[4], 2);
+ u[5] = _mm_srai_epi32(u[5], 2);
+ u[6] = _mm_srai_epi32(u[6], 2);
+ u[7] = _mm_srai_epi32(u[7], 2);
+
+ // Combine
+ out[ 0] = _mm_packs_epi32(u[0], u[1]);
+ out[16] = _mm_packs_epi32(u[2], u[3]);
+ out[ 8] = _mm_packs_epi32(u[4], u[5]);
+ out[24] = _mm_packs_epi32(u[6], u[7]);
+ }
+ {
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
+ u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
+ u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
+ u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
+ u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
+ u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
+ u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
+ u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+ v[0] = k_madd_epi32(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32(u[4], k32_m24_m08);
+ v[5] = k_madd_epi32(u[5], k32_m24_m08);
+ v[6] = k_madd_epi32(u[6], k32_m24_m08);
+ v[7] = k_madd_epi32(u[7], k32_m24_m08);
+ v[ 8] = k_madd_epi32(u[4], k32_m08_p24);
+ v[ 9] = k_madd_epi32(u[5], k32_m08_p24);
+ v[10] = k_madd_epi32(u[6], k32_m08_p24);
+ v[11] = k_madd_epi32(u[7], k32_m08_p24);
+ v[12] = k_madd_epi32(u[0], k32_p24_p08);
+ v[13] = k_madd_epi32(u[1], k32_p24_p08);
+ v[14] = k_madd_epi32(u[2], k32_p24_p08);
+ v[15] = k_madd_epi32(u[3], k32_p24_p08);
+
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ }
+ {
+ lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
+ lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
+ lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
+ lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
+ lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
+ lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
+ lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
+ lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
+ lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
+ lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
+ lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
+ lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
+ lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
+ lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
+ lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
+ lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
+ lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
+ lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
+ lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
+ lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
+ lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
+ lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
+ lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
+ lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
+ lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
+ lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
+ lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
+ lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
+ lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
+ lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
+ lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
+ lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
+ }
+ // stage 6
+ {
+ const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
+ const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
+ const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+ u[1] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+ u[2] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+ u[3] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+ u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[12] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+ u[13] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+ u[14] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+ u[15] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+
+ v[0] = k_madd_epi32(u[0], k32_p28_p04);
+ v[1] = k_madd_epi32(u[1], k32_p28_p04);
+ v[2] = k_madd_epi32(u[2], k32_p28_p04);
+ v[3] = k_madd_epi32(u[3], k32_p28_p04);
+ v[4] = k_madd_epi32(u[4], k32_p12_p20);
+ v[5] = k_madd_epi32(u[5], k32_p12_p20);
+ v[6] = k_madd_epi32(u[6], k32_p12_p20);
+ v[7] = k_madd_epi32(u[7], k32_p12_p20);
+ v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12);
+ v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12);
+ v[10] = k_madd_epi32(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32(u[12], k32_m04_p28);
+ v[13] = k_madd_epi32(u[13], k32_m04_p28);
+ v[14] = k_madd_epi32(u[14], k32_m04_p28);
+ v[15] = k_madd_epi32(u[15], k32_m04_p28);
+
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+ u[4] = k_packs_epi64(v[8], v[9]);
+ u[5] = k_packs_epi64(v[10], v[11]);
+ u[6] = k_packs_epi64(v[12], v[13]);
+ u[7] = k_packs_epi64(v[14], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+ sign[0] = _mm_cmplt_epi32(u[0], kZero);
+ sign[1] = _mm_cmplt_epi32(u[1], kZero);
+ sign[2] = _mm_cmplt_epi32(u[2], kZero);
+ sign[3] = _mm_cmplt_epi32(u[3], kZero);
+ sign[4] = _mm_cmplt_epi32(u[4], kZero);
+ sign[5] = _mm_cmplt_epi32(u[5], kZero);
+ sign[6] = _mm_cmplt_epi32(u[6], kZero);
+ sign[7] = _mm_cmplt_epi32(u[7], kZero);
+
+ u[0] = _mm_sub_epi32(u[0], sign[0]);
+ u[1] = _mm_sub_epi32(u[1], sign[1]);
+ u[2] = _mm_sub_epi32(u[2], sign[2]);
+ u[3] = _mm_sub_epi32(u[3], sign[3]);
+ u[4] = _mm_sub_epi32(u[4], sign[4]);
+ u[5] = _mm_sub_epi32(u[5], sign[5]);
+ u[6] = _mm_sub_epi32(u[6], sign[6]);
+ u[7] = _mm_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm_add_epi32(u[0], K32One);
+ u[1] = _mm_add_epi32(u[1], K32One);
+ u[2] = _mm_add_epi32(u[2], K32One);
+ u[3] = _mm_add_epi32(u[3], K32One);
+ u[4] = _mm_add_epi32(u[4], K32One);
+ u[5] = _mm_add_epi32(u[5], K32One);
+ u[6] = _mm_add_epi32(u[6], K32One);
+ u[7] = _mm_add_epi32(u[7], K32One);
+
+ u[0] = _mm_srai_epi32(u[0], 2);
+ u[1] = _mm_srai_epi32(u[1], 2);
+ u[2] = _mm_srai_epi32(u[2], 2);
+ u[3] = _mm_srai_epi32(u[3], 2);
+ u[4] = _mm_srai_epi32(u[4], 2);
+ u[5] = _mm_srai_epi32(u[5], 2);
+ u[6] = _mm_srai_epi32(u[6], 2);
+ u[7] = _mm_srai_epi32(u[7], 2);
+
+ out[ 4] = _mm_packs_epi32(u[0], u[1]);
+ out[20] = _mm_packs_epi32(u[2], u[3]);
+ out[12] = _mm_packs_epi32(u[4], u[5]);
+ out[28] = _mm_packs_epi32(u[6], u[7]);
+ }
+ {
+ lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
+ lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
+ lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
+ lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
+ lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
+ lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
+ lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
+ lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
+ lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
+ lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
+ lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
+ lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
+ lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
+ lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
+ lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
+ lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
+ }
+ {
+ const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
+ const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
+ const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m128i k32_m12_m20 = pair_set_epi32(-cospi_12_64,
+ -cospi_20_64);
+ const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
+ const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
+
+ u[ 0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
+ u[ 1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
+ u[ 2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
+ u[ 3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
+ u[ 4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
+ u[ 5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
+ u[ 6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
+ u[ 7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
+ u[ 8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
+ u[ 9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
+ u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
+ u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
+ u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
+ u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
+ u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
+ u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+ v[ 0] = k_madd_epi32(u[ 0], k32_m04_p28);
+ v[ 1] = k_madd_epi32(u[ 1], k32_m04_p28);
+ v[ 2] = k_madd_epi32(u[ 2], k32_m04_p28);
+ v[ 3] = k_madd_epi32(u[ 3], k32_m04_p28);
+ v[ 4] = k_madd_epi32(u[ 4], k32_m28_m04);
+ v[ 5] = k_madd_epi32(u[ 5], k32_m28_m04);
+ v[ 6] = k_madd_epi32(u[ 6], k32_m28_m04);
+ v[ 7] = k_madd_epi32(u[ 7], k32_m28_m04);
+ v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12);
+ v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12);
+ v[10] = k_madd_epi32(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32(u[12], k32_m12_m20);
+ v[13] = k_madd_epi32(u[13], k32_m12_m20);
+ v[14] = k_madd_epi32(u[14], k32_m12_m20);
+ v[15] = k_madd_epi32(u[15], k32_m12_m20);
+ v[16] = k_madd_epi32(u[12], k32_m20_p12);
+ v[17] = k_madd_epi32(u[13], k32_m20_p12);
+ v[18] = k_madd_epi32(u[14], k32_m20_p12);
+ v[19] = k_madd_epi32(u[15], k32_m20_p12);
+ v[20] = k_madd_epi32(u[ 8], k32_p12_p20);
+ v[21] = k_madd_epi32(u[ 9], k32_p12_p20);
+ v[22] = k_madd_epi32(u[10], k32_p12_p20);
+ v[23] = k_madd_epi32(u[11], k32_p12_p20);
+ v[24] = k_madd_epi32(u[ 4], k32_m04_p28);
+ v[25] = k_madd_epi32(u[ 5], k32_m04_p28);
+ v[26] = k_madd_epi32(u[ 6], k32_m04_p28);
+ v[27] = k_madd_epi32(u[ 7], k32_m04_p28);
+ v[28] = k_madd_epi32(u[ 0], k32_p28_p04);
+ v[29] = k_madd_epi32(u[ 1], k32_p28_p04);
+ v[30] = k_madd_epi32(u[ 2], k32_p28_p04);
+ v[31] = k_madd_epi32(u[ 3], k32_p28_p04);
+
+ u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64(v[10], v[11]);
+ u[ 6] = k_packs_epi64(v[12], v[13]);
+ u[ 7] = k_packs_epi64(v[14], v[15]);
+ u[ 8] = k_packs_epi64(v[16], v[17]);
+ u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep3[34] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+ lstep3[35] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+ lstep3[36] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+ lstep3[37] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+ lstep3[42] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+ lstep3[43] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+ lstep3[44] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+ lstep3[45] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+ lstep3[50] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+ lstep3[51] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 7
+ {
+ const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
+ const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
+ const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
+ const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
+ const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
+ const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
+ const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
+ const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
+
+ u[ 0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
+ u[ 1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
+ u[ 2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
+ u[ 3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
+ u[ 4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
+ u[ 5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
+ u[ 6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
+ u[ 7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
+ u[ 8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
+ u[ 9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
+ u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
+ u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
+ u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
+ u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
+ u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
+ u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+ v[ 0] = k_madd_epi32(u[ 0], k32_p30_p02);
+ v[ 1] = k_madd_epi32(u[ 1], k32_p30_p02);
+ v[ 2] = k_madd_epi32(u[ 2], k32_p30_p02);
+ v[ 3] = k_madd_epi32(u[ 3], k32_p30_p02);
+ v[ 4] = k_madd_epi32(u[ 4], k32_p14_p18);
+ v[ 5] = k_madd_epi32(u[ 5], k32_p14_p18);
+ v[ 6] = k_madd_epi32(u[ 6], k32_p14_p18);
+ v[ 7] = k_madd_epi32(u[ 7], k32_p14_p18);
+ v[ 8] = k_madd_epi32(u[ 8], k32_p22_p10);
+ v[ 9] = k_madd_epi32(u[ 9], k32_p22_p10);
+ v[10] = k_madd_epi32(u[10], k32_p22_p10);
+ v[11] = k_madd_epi32(u[11], k32_p22_p10);
+ v[12] = k_madd_epi32(u[12], k32_p06_p26);
+ v[13] = k_madd_epi32(u[13], k32_p06_p26);
+ v[14] = k_madd_epi32(u[14], k32_p06_p26);
+ v[15] = k_madd_epi32(u[15], k32_p06_p26);
+ v[16] = k_madd_epi32(u[12], k32_m26_p06);
+ v[17] = k_madd_epi32(u[13], k32_m26_p06);
+ v[18] = k_madd_epi32(u[14], k32_m26_p06);
+ v[19] = k_madd_epi32(u[15], k32_m26_p06);
+ v[20] = k_madd_epi32(u[ 8], k32_m10_p22);
+ v[21] = k_madd_epi32(u[ 9], k32_m10_p22);
+ v[22] = k_madd_epi32(u[10], k32_m10_p22);
+ v[23] = k_madd_epi32(u[11], k32_m10_p22);
+ v[24] = k_madd_epi32(u[ 4], k32_m18_p14);
+ v[25] = k_madd_epi32(u[ 5], k32_m18_p14);
+ v[26] = k_madd_epi32(u[ 6], k32_m18_p14);
+ v[27] = k_madd_epi32(u[ 7], k32_m18_p14);
+ v[28] = k_madd_epi32(u[ 0], k32_m02_p30);
+ v[29] = k_madd_epi32(u[ 1], k32_m02_p30);
+ v[30] = k_madd_epi32(u[ 2], k32_m02_p30);
+ v[31] = k_madd_epi32(u[ 3], k32_m02_p30);
+
+ u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64(v[10], v[11]);
+ u[ 6] = k_packs_epi64(v[12], v[13]);
+ u[ 7] = k_packs_epi64(v[14], v[15]);
+ u[ 8] = k_packs_epi64(v[16], v[17]);
+ u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+ u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+ u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+ u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+ u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+ u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+ u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+ u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+ u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+ u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
+ v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
+ v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
+ v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
+ v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
+ v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
+ v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
+ v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
+ v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
+ v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
+ u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
+ u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
+ u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
+ u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
+ u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
+ u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
+ u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
+ u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
+ u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[ 0] = _mm_add_epi32(u[ 0], K32One);
+ v[ 1] = _mm_add_epi32(u[ 1], K32One);
+ v[ 2] = _mm_add_epi32(u[ 2], K32One);
+ v[ 3] = _mm_add_epi32(u[ 3], K32One);
+ v[ 4] = _mm_add_epi32(u[ 4], K32One);
+ v[ 5] = _mm_add_epi32(u[ 5], K32One);
+ v[ 6] = _mm_add_epi32(u[ 6], K32One);
+ v[ 7] = _mm_add_epi32(u[ 7], K32One);
+ v[ 8] = _mm_add_epi32(u[ 8], K32One);
+ v[ 9] = _mm_add_epi32(u[ 9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[ 0] = _mm_srai_epi32(v[ 0], 2);
+ u[ 1] = _mm_srai_epi32(v[ 1], 2);
+ u[ 2] = _mm_srai_epi32(v[ 2], 2);
+ u[ 3] = _mm_srai_epi32(v[ 3], 2);
+ u[ 4] = _mm_srai_epi32(v[ 4], 2);
+ u[ 5] = _mm_srai_epi32(v[ 5], 2);
+ u[ 6] = _mm_srai_epi32(v[ 6], 2);
+ u[ 7] = _mm_srai_epi32(v[ 7], 2);
+ u[ 8] = _mm_srai_epi32(v[ 8], 2);
+ u[ 9] = _mm_srai_epi32(v[ 9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[ 2] = _mm_packs_epi32(u[0], u[1]);
+ out[18] = _mm_packs_epi32(u[2], u[3]);
+ out[10] = _mm_packs_epi32(u[4], u[5]);
+ out[26] = _mm_packs_epi32(u[6], u[7]);
+ out[ 6] = _mm_packs_epi32(u[8], u[9]);
+ out[22] = _mm_packs_epi32(u[10], u[11]);
+ out[14] = _mm_packs_epi32(u[12], u[13]);
+ out[30] = _mm_packs_epi32(u[14], u[15]);
+ }
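+ // The block above writes the same bit-reversed output set (out[2],
+ // out[18], ..., out[30]) as the 16-bit path, with the
+ // (x + 1 + (x < 0)) >> 2 rounding applied inline.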
+ {
+ lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
+ lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
+ lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
+ lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
+ lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
+ lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
+ lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
+ lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
+ lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
+ lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
+ lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
+ lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
+ lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
+ lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
+ lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
+ lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
+ lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
+ lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
+ lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
+ lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
+ lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
+ lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
+ lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
+ lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
+ lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
+ lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
+ lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
+ lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
+ lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
+ lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
+ lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
+ lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
+ }
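+ // Stage 8 computes the remaining odd outputs (out[1], out[17], ...)
+ // using the same cospi pairs as the 16-bit final stage above.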
+ // stage 8
+ {
+ const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
+ const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
+ const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
+ const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
+ const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
+ const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
+ const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
+ const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
+
+ u[ 0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
+ u[ 1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
+ u[ 2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
+ u[ 3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
+ u[ 4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
+ u[ 5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
+ u[ 6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
+ u[ 7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
+ u[ 8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
+ u[ 9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
+ u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
+ u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
+ u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
+ u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
+ u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
+ u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+ v[ 0] = k_madd_epi32(u[ 0], k32_p31_p01);
+ v[ 1] = k_madd_epi32(u[ 1], k32_p31_p01);
+ v[ 2] = k_madd_epi32(u[ 2], k32_p31_p01);
+ v[ 3] = k_madd_epi32(u[ 3], k32_p31_p01);
+ v[ 4] = k_madd_epi32(u[ 4], k32_p15_p17);
+ v[ 5] = k_madd_epi32(u[ 5], k32_p15_p17);
+ v[ 6] = k_madd_epi32(u[ 6], k32_p15_p17);
+ v[ 7] = k_madd_epi32(u[ 7], k32_p15_p17);
+ v[ 8] = k_madd_epi32(u[ 8], k32_p23_p09);
+ v[ 9] = k_madd_epi32(u[ 9], k32_p23_p09);
+ v[10] = k_madd_epi32(u[10], k32_p23_p09);
+ v[11] = k_madd_epi32(u[11], k32_p23_p09);
+ v[12] = k_madd_epi32(u[12], k32_p07_p25);
+ v[13] = k_madd_epi32(u[13], k32_p07_p25);
+ v[14] = k_madd_epi32(u[14], k32_p07_p25);
+ v[15] = k_madd_epi32(u[15], k32_p07_p25);
+ v[16] = k_madd_epi32(u[12], k32_m25_p07);
+ v[17] = k_madd_epi32(u[13], k32_m25_p07);
+ v[18] = k_madd_epi32(u[14], k32_m25_p07);
+ v[19] = k_madd_epi32(u[15], k32_m25_p07);
+ v[20] = k_madd_epi32(u[ 8], k32_m09_p23);
+ v[21] = k_madd_epi32(u[ 9], k32_m09_p23);
+ v[22] = k_madd_epi32(u[10], k32_m09_p23);
+ v[23] = k_madd_epi32(u[11], k32_m09_p23);
+ v[24] = k_madd_epi32(u[ 4], k32_m17_p15);
+ v[25] = k_madd_epi32(u[ 5], k32_m17_p15);
+ v[26] = k_madd_epi32(u[ 6], k32_m17_p15);
+ v[27] = k_madd_epi32(u[ 7], k32_m17_p15);
+ v[28] = k_madd_epi32(u[ 0], k32_m01_p31);
+ v[29] = k_madd_epi32(u[ 1], k32_m01_p31);
+ v[30] = k_madd_epi32(u[ 2], k32_m01_p31);
+ v[31] = k_madd_epi32(u[ 3], k32_m01_p31);
+
+ u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64(v[10], v[11]);
+ u[ 6] = k_packs_epi64(v[12], v[13]);
+ u[ 7] = k_packs_epi64(v[14], v[15]);
+ u[ 8] = k_packs_epi64(v[16], v[17]);
+ u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+ u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+ u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+ u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+ u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+ u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+ u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+ u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+ u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+ u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
+ v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
+ v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
+ v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
+ v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
+ v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
+ v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
+ v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
+ v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
+ v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
+ u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
+ u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
+ u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
+ u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
+ u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
+ u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
+ u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
+ u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
+ u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], K32One);
+ v[1] = _mm_add_epi32(u[1], K32One);
+ v[2] = _mm_add_epi32(u[2], K32One);
+ v[3] = _mm_add_epi32(u[3], K32One);
+ v[4] = _mm_add_epi32(u[4], K32One);
+ v[5] = _mm_add_epi32(u[5], K32One);
+ v[6] = _mm_add_epi32(u[6], K32One);
+ v[7] = _mm_add_epi32(u[7], K32One);
+ v[8] = _mm_add_epi32(u[8], K32One);
+ v[9] = _mm_add_epi32(u[9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[0] = _mm_srai_epi32(v[0], 2);
+ u[1] = _mm_srai_epi32(v[1], 2);
+ u[2] = _mm_srai_epi32(v[2], 2);
+ u[3] = _mm_srai_epi32(v[3], 2);
+ u[4] = _mm_srai_epi32(v[4], 2);
+ u[5] = _mm_srai_epi32(v[5], 2);
+ u[6] = _mm_srai_epi32(v[6], 2);
+ u[7] = _mm_srai_epi32(v[7], 2);
+ u[8] = _mm_srai_epi32(v[8], 2);
+ u[9] = _mm_srai_epi32(v[9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[ 1] = _mm_packs_epi32(u[0], u[1]);
+ out[17] = _mm_packs_epi32(u[2], u[3]);
+ out[ 9] = _mm_packs_epi32(u[4], u[5]);
+ out[25] = _mm_packs_epi32(u[6], u[7]);
+ out[ 7] = _mm_packs_epi32(u[8], u[9]);
+ out[23] = _mm_packs_epi32(u[10], u[11]);
+ out[15] = _mm_packs_epi32(u[12], u[13]);
+ out[31] = _mm_packs_epi32(u[14], u[15]);
+ }
+ {
+ const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
+ const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
+ const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
+ const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
+ const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
+ const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
+ const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
+ const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
+
+ u[ 0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
+ u[ 1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
+ u[ 2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
+ u[ 3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
+ u[ 4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
+ u[ 5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
+ u[ 6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
+ u[ 7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
+ u[ 8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
+ u[ 9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
+ u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
+ u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
+ u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
+ u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
+ u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
+ u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+ v[ 0] = k_madd_epi32(u[ 0], k32_p27_p05);
+ v[ 1] = k_madd_epi32(u[ 1], k32_p27_p05);
+ v[ 2] = k_madd_epi32(u[ 2], k32_p27_p05);
+ v[ 3] = k_madd_epi32(u[ 3], k32_p27_p05);
+ v[ 4] = k_madd_epi32(u[ 4], k32_p11_p21);
+ v[ 5] = k_madd_epi32(u[ 5], k32_p11_p21);
+ v[ 6] = k_madd_epi32(u[ 6], k32_p11_p21);
+ v[ 7] = k_madd_epi32(u[ 7], k32_p11_p21);
+ v[ 8] = k_madd_epi32(u[ 8], k32_p19_p13);
+ v[ 9] = k_madd_epi32(u[ 9], k32_p19_p13);
+ v[10] = k_madd_epi32(u[10], k32_p19_p13);
+ v[11] = k_madd_epi32(u[11], k32_p19_p13);
+ v[12] = k_madd_epi32(u[12], k32_p03_p29);
+ v[13] = k_madd_epi32(u[13], k32_p03_p29);
+ v[14] = k_madd_epi32(u[14], k32_p03_p29);
+ v[15] = k_madd_epi32(u[15], k32_p03_p29);
+ v[16] = k_madd_epi32(u[12], k32_m29_p03);
+ v[17] = k_madd_epi32(u[13], k32_m29_p03);
+ v[18] = k_madd_epi32(u[14], k32_m29_p03);
+ v[19] = k_madd_epi32(u[15], k32_m29_p03);
+ v[20] = k_madd_epi32(u[ 8], k32_m13_p19);
+ v[21] = k_madd_epi32(u[ 9], k32_m13_p19);
+ v[22] = k_madd_epi32(u[10], k32_m13_p19);
+ v[23] = k_madd_epi32(u[11], k32_m13_p19);
+ v[24] = k_madd_epi32(u[ 4], k32_m21_p11);
+ v[25] = k_madd_epi32(u[ 5], k32_m21_p11);
+ v[26] = k_madd_epi32(u[ 6], k32_m21_p11);
+ v[27] = k_madd_epi32(u[ 7], k32_m21_p11);
+ v[28] = k_madd_epi32(u[ 0], k32_m05_p27);
+ v[29] = k_madd_epi32(u[ 1], k32_m05_p27);
+ v[30] = k_madd_epi32(u[ 2], k32_m05_p27);
+ v[31] = k_madd_epi32(u[ 3], k32_m05_p27);
+
+ u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64(v[10], v[11]);
+ u[ 6] = k_packs_epi64(v[12], v[13]);
+ u[ 7] = k_packs_epi64(v[14], v[15]);
+ u[ 8] = k_packs_epi64(v[16], v[17]);
+ u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+ u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+ u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+ u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+ u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+ u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+ u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+ u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+ u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+ u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
+ v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
+ v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
+ v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
+ v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
+ v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
+ v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
+ v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
+ v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
+ v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+ v[10] = _mm_cmplt_epi32(u[10], kZero);
+ v[11] = _mm_cmplt_epi32(u[11], kZero);
+ v[12] = _mm_cmplt_epi32(u[12], kZero);
+ v[13] = _mm_cmplt_epi32(u[13], kZero);
+ v[14] = _mm_cmplt_epi32(u[14], kZero);
+ v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+ u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
+ u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
+ u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
+ u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
+ u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
+ u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
+ u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
+ u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
+ u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
+ u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+ u[10] = _mm_sub_epi32(u[10], v[10]);
+ u[11] = _mm_sub_epi32(u[11], v[11]);
+ u[12] = _mm_sub_epi32(u[12], v[12]);
+ u[13] = _mm_sub_epi32(u[13], v[13]);
+ u[14] = _mm_sub_epi32(u[14], v[14]);
+ u[15] = _mm_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], K32One);
+ v[1] = _mm_add_epi32(u[1], K32One);
+ v[2] = _mm_add_epi32(u[2], K32One);
+ v[3] = _mm_add_epi32(u[3], K32One);
+ v[4] = _mm_add_epi32(u[4], K32One);
+ v[5] = _mm_add_epi32(u[5], K32One);
+ v[6] = _mm_add_epi32(u[6], K32One);
+ v[7] = _mm_add_epi32(u[7], K32One);
+ v[8] = _mm_add_epi32(u[8], K32One);
+ v[9] = _mm_add_epi32(u[9], K32One);
+ v[10] = _mm_add_epi32(u[10], K32One);
+ v[11] = _mm_add_epi32(u[11], K32One);
+ v[12] = _mm_add_epi32(u[12], K32One);
+ v[13] = _mm_add_epi32(u[13], K32One);
+ v[14] = _mm_add_epi32(u[14], K32One);
+ v[15] = _mm_add_epi32(u[15], K32One);
+
+ u[0] = _mm_srai_epi32(v[0], 2);
+ u[1] = _mm_srai_epi32(v[1], 2);
+ u[2] = _mm_srai_epi32(v[2], 2);
+ u[3] = _mm_srai_epi32(v[3], 2);
+ u[4] = _mm_srai_epi32(v[4], 2);
+ u[5] = _mm_srai_epi32(v[5], 2);
+ u[6] = _mm_srai_epi32(v[6], 2);
+ u[7] = _mm_srai_epi32(v[7], 2);
+ u[8] = _mm_srai_epi32(v[8], 2);
+ u[9] = _mm_srai_epi32(v[9], 2);
+ u[10] = _mm_srai_epi32(v[10], 2);
+ u[11] = _mm_srai_epi32(v[11], 2);
+ u[12] = _mm_srai_epi32(v[12], 2);
+ u[13] = _mm_srai_epi32(v[13], 2);
+ u[14] = _mm_srai_epi32(v[14], 2);
+ u[15] = _mm_srai_epi32(v[15], 2);
+
+ out[ 5] = _mm_packs_epi32(u[0], u[1]);
+ out[21] = _mm_packs_epi32(u[2], u[3]);
+ out[13] = _mm_packs_epi32(u[4], u[5]);
+ out[29] = _mm_packs_epi32(u[6], u[7]);
+ out[ 3] = _mm_packs_epi32(u[8], u[9]);
+ out[19] = _mm_packs_epi32(u[10], u[11]);
+ out[11] = _mm_packs_epi32(u[12], u[13]);
+ out[27] = _mm_packs_epi32(u[14], u[15]);
+ }
+ }
+#endif
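
Editor's note: the branch above, like its 16-bit counterpart, repeats one pipeline: _mm_unpack{lo,hi}_epi32 interleaves two inputs a and b, k_madd_epi32 multiplies each pair by a pair_set_epi32 constant and accumulates into 64-bit lanes, k_packs_epi64 narrows back to 32 bits, and the add/srai pair is the DCT rounding shift. Per lane, that is the scalar butterfly sketched below -- an illustration only, with a hypothetical helper name and vp9's fixed-point parameters restated so it stands alone:

    #include <stdint.h>

    /* Restated from vp9's common DCT definitions, for a self-contained sketch. */
    #define SK_DCT_CONST_BITS 14
    #define SK_DCT_CONST_ROUNDING (1 << (SK_DCT_CONST_BITS - 1))

    /* One lane of k_madd_epi32 + k_packs_epi64 + rounding shift:
     * round((a * cA + b * cB) / 2^14), accumulated in 64 bits. */
    static int32_t butterfly_lane(int32_t a, int32_t b, int32_t cA, int32_t cB) {
      const int64_t sum = (int64_t)a * cA + (int64_t)b * cB;
      return (int32_t)((sum + SK_DCT_CONST_ROUNDING) >> SK_DCT_CONST_BITS);
    }

For instance, each lane of out[1] above is butterfly_lane(a, b, cospi_31_64, cospi_1_64) with a drawn from lstep1[32..33] and b from lstep1[62..63], then scaled by (x + 1 + (x < 0)) >> 2 via the _mm_cmplt_epi32 / _mm_sub_epi32 / _mm_srai_epi32 sequence.
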
+  // Transpose the results as four 8x8 transposes.
+ {
+ int transpose_block;
+ int16_t *output;
+ if (0 == pass) {
+ output = &intermediate[column_start * 32];
+ } else {
+ output = &output_org[column_start * 32];
+ }
+ for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+ __m128i *this_out = &out[8 * transpose_block];
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ if (0 == pass) {
+ // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+ // TODO(cd): see quality impact of only doing
+ // output[j] = (output[j] + 1) >> 2;
+ // which would remove the code between here ...
+ __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
+ __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
+ __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
+ __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
+ __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
+ __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
+ __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
+ __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
+ tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
+ tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
+ tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
+ tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
+ tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
+ tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
+ tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
+ tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
+ // ... and here.
+ // PS: also change code in vp9/encoder/vp9_dct.c
+ tr2_0 = _mm_add_epi16(tr2_0, kOne);
+ tr2_1 = _mm_add_epi16(tr2_1, kOne);
+ tr2_2 = _mm_add_epi16(tr2_2, kOne);
+ tr2_3 = _mm_add_epi16(tr2_3, kOne);
+ tr2_4 = _mm_add_epi16(tr2_4, kOne);
+ tr2_5 = _mm_add_epi16(tr2_5, kOne);
+ tr2_6 = _mm_add_epi16(tr2_6, kOne);
+ tr2_7 = _mm_add_epi16(tr2_7, kOne);
+ tr2_0 = _mm_srai_epi16(tr2_0, 2);
+ tr2_1 = _mm_srai_epi16(tr2_1, 2);
+ tr2_2 = _mm_srai_epi16(tr2_2, 2);
+ tr2_3 = _mm_srai_epi16(tr2_3, 2);
+ tr2_4 = _mm_srai_epi16(tr2_4, 2);
+ tr2_5 = _mm_srai_epi16(tr2_5, 2);
+ tr2_6 = _mm_srai_epi16(tr2_6, 2);
+ tr2_7 = _mm_srai_epi16(tr2_7, 2);
+ }
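
Editor's note: a scalar sketch of the pass-0 rounding just applied (hypothetical helper; assumes an arithmetic right shift, which is what _mm_srai_epi16 performs). _mm_cmpgt_epi16 yields an all-ones mask (-1) exactly in the lanes where the value is positive, so the _mm_sub_epi16 adds 1 there, matching the commented formula:

    #include <stdint.h>

    /* (x + 1 + (x > 0)) >> 2: divide by 4, rounding to nearest,
     * ties away from zero. */
    static int16_t pass0_round(int16_t x) {
      return (int16_t)((x + 1 + (x > 0)) >> 2);
    }
    /* pass0_round(6) == 2, pass0_round(-6) == -2: +/-1.5 rounds away from 0. */
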
+      // Note: even though all these stores are aligned, using the aligned
+      // intrinsic makes the code slightly slower.
+ _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0);
+ _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1);
+ _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2);
+ _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3);
+ _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4);
+ _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5);
+ _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6);
+ _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7);
+ // Process next 8x8
+ output += 8;
+ }
+ }
+ }
+ }
+}
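
Editor's note: the ASCII layouts in the transpose above document the standard SSE2 idiom: one round of 16-bit unpacks merges row pairs, one round of 32-bit unpacks merges quads, and one round of 64-bit unpacks finishes the 8x8 transpose. The same idiom one size down, as a self-contained 4x4 sketch over 32-bit lanes (illustrative, not code from this patch):

    #include <emmintrin.h>

    /* Transpose a 4x4 tile of 32-bit lanes, one row per register. */
    static void transpose_4x4(__m128i *r0, __m128i *r1, __m128i *r2, __m128i *r3) {
      const __m128i t0 = _mm_unpacklo_epi32(*r0, *r1);  /* 00 10 01 11 */
      const __m128i t1 = _mm_unpacklo_epi32(*r2, *r3);  /* 20 30 21 31 */
      const __m128i t2 = _mm_unpackhi_epi32(*r0, *r1);  /* 02 12 03 13 */
      const __m128i t3 = _mm_unpackhi_epi32(*r2, *r3);  /* 22 32 23 33 */
      *r0 = _mm_unpacklo_epi64(t0, t1);                 /* 00 10 20 30 */
      *r1 = _mm_unpackhi_epi64(t0, t1);                 /* 01 11 21 31 */
      *r2 = _mm_unpacklo_epi64(t2, t3);                 /* 02 12 22 32 */
      *r3 = _mm_unpackhi_epi64(t2, t3);                 /* 03 13 23 33 */
    }
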
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
index bf09c7a..eb271fe 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -2572,1224 +2572,14 @@ void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
write_buffer_16x16(output, in0, in1, 16);
}
-void vp9_short_fdct32x32_rd_sse2(int16_t *input,
- int16_t *output_org, int pitch) {
- // Calculate pre-multiplied strides
- const int str1 = pitch >> 1;
- const int str2 = pitch;
- const int str3 = pitch + str1;
- // We need an intermediate buffer between passes.
- DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
- const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
- const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
- const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
- const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
- const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
- const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
- const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
- const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
- const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
- const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
- const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
- const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
- const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
- const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
- const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
- const __m128i kOne = _mm_set1_epi16(1);
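
Editor's note: all of these constants share one pattern. pair_set_epi16(c0, c1) repeats the 16-bit pair across the register so that _mm_madd_epi16, applied to an _mm_unpack{lo,hi}_epi16 interleave of inputs a and b, produces a * c0 + b * c1 in every 32-bit lane. The cospi_N_64 values are vp9's fixed-point cosine table, approximately round(16384 * cos(N * pi / 64)). A one-lane scalar restatement (illustrative helper, not library code):

    #include <stdint.h>

    /* One 32-bit lane of _mm_madd_epi16(unpack(a, b), pair_set_epi16(c0, c1)). */
    static int32_t madd_lane(int16_t a, int16_t b, int16_t c0, int16_t c1) {
      return (int32_t)a * c0 + (int32_t)b * c1;
    }
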
- // Do the two transform/transpose passes
- int pass;
- for (pass = 0; pass < 2; ++pass) {
- // We process eight columns (transposed rows in second pass) at a time.
- int column_start;
- for (column_start = 0; column_start < 32; column_start += 8) {
- __m128i step1[32];
- __m128i step2[32];
- __m128i step3[32];
- __m128i out[32];
- // Stage 1
-      // Note: even though all the loads below are aligned, using the aligned
-      // intrinsic makes the code slightly slower.
- if (0 == pass) {
- int16_t *in = &input[column_start];
- // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
- // Note: the next four blocks could be in a loop. That would help the
- // instruction cache but is actually slower.
- {
- int16_t *ina = in + 0 * str1;
- int16_t *inb = in + 31 * str1;
- __m128i *step1a = &step1[ 0];
- __m128i *step1b = &step1[31];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[ 0] = _mm_add_epi16(ina0, inb0);
- step1a[ 1] = _mm_add_epi16(ina1, inb1);
- step1a[ 2] = _mm_add_epi16(ina2, inb2);
- step1a[ 3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
- step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
- step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
- step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- {
- int16_t *ina = in + 4 * str1;
- int16_t *inb = in + 27 * str1;
- __m128i *step1a = &step1[ 4];
- __m128i *step1b = &step1[27];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[ 0] = _mm_add_epi16(ina0, inb0);
- step1a[ 1] = _mm_add_epi16(ina1, inb1);
- step1a[ 2] = _mm_add_epi16(ina2, inb2);
- step1a[ 3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
- step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
- step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
- step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- {
- int16_t *ina = in + 8 * str1;
- int16_t *inb = in + 23 * str1;
- __m128i *step1a = &step1[ 8];
- __m128i *step1b = &step1[23];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[ 0] = _mm_add_epi16(ina0, inb0);
- step1a[ 1] = _mm_add_epi16(ina1, inb1);
- step1a[ 2] = _mm_add_epi16(ina2, inb2);
- step1a[ 3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
- step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
- step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
- step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- {
- int16_t *ina = in + 12 * str1;
- int16_t *inb = in + 19 * str1;
- __m128i *step1a = &step1[12];
- __m128i *step1b = &step1[19];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[ 0] = _mm_add_epi16(ina0, inb0);
- step1a[ 1] = _mm_add_epi16(ina1, inb1);
- step1a[ 2] = _mm_add_epi16(ina2, inb2);
- step1a[ 3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
- step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
- step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
- step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- } else {
- int16_t *in = &intermediate[column_start];
- // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
- // Note: using the same approach as above to have common offset is
- // counter-productive as all offsets can be calculated at compile
- // time.
- // Note: the next four blocks could be in a loop. That would help the
- // instruction cache but is actually slower.
- {
- __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
- __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
- __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
- __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
- __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
- __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
- __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
- __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
- step1[ 0] = _mm_add_epi16(in00, in31);
- step1[ 1] = _mm_add_epi16(in01, in30);
- step1[ 2] = _mm_add_epi16(in02, in29);
- step1[ 3] = _mm_add_epi16(in03, in28);
- step1[28] = _mm_sub_epi16(in03, in28);
- step1[29] = _mm_sub_epi16(in02, in29);
- step1[30] = _mm_sub_epi16(in01, in30);
- step1[31] = _mm_sub_epi16(in00, in31);
- }
- {
- __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
- __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
- __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
- __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
- __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
- __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
- __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
- __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
- step1[ 4] = _mm_add_epi16(in04, in27);
- step1[ 5] = _mm_add_epi16(in05, in26);
- step1[ 6] = _mm_add_epi16(in06, in25);
- step1[ 7] = _mm_add_epi16(in07, in24);
- step1[24] = _mm_sub_epi16(in07, in24);
- step1[25] = _mm_sub_epi16(in06, in25);
- step1[26] = _mm_sub_epi16(in05, in26);
- step1[27] = _mm_sub_epi16(in04, in27);
- }
- {
- __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
- __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
- __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
- __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
- __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
- __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
- __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
- __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
- step1[ 8] = _mm_add_epi16(in08, in23);
- step1[ 9] = _mm_add_epi16(in09, in22);
- step1[10] = _mm_add_epi16(in10, in21);
- step1[11] = _mm_add_epi16(in11, in20);
- step1[20] = _mm_sub_epi16(in11, in20);
- step1[21] = _mm_sub_epi16(in10, in21);
- step1[22] = _mm_sub_epi16(in09, in22);
- step1[23] = _mm_sub_epi16(in08, in23);
- }
- {
- __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
- __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
- __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
- __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
- __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
- __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
- __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
- __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
- step1[12] = _mm_add_epi16(in12, in19);
- step1[13] = _mm_add_epi16(in13, in18);
- step1[14] = _mm_add_epi16(in14, in17);
- step1[15] = _mm_add_epi16(in15, in16);
- step1[16] = _mm_sub_epi16(in15, in16);
- step1[17] = _mm_sub_epi16(in14, in17);
- step1[18] = _mm_sub_epi16(in13, in18);
- step1[19] = _mm_sub_epi16(in12, in19);
- }
- }
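
Editor's note: stage 1 differs between the two passes only in addressing and scale: pass 0 walks columns of the caller's input with the pixel stride and pre-scales by 4 (the << 2), while pass 1 walks rows of the 32x32 intermediate buffer (stride 32) with no scaling. A scalar sketch of both cases (hypothetical function name, 16 butterflies per 32-sample column or row):

    #include <stdint.h>

    static void fdct32_stage1_sketch(const int16_t *in, int stride,
                                     int pass, int32_t *step1) {
      int i;
      for (i = 0; i < 16; ++i) {
        const int32_t a = in[i * stride];
        const int32_t b = in[(31 - i) * stride];
        step1[i]      = pass == 0 ? (a + b) * 4 : a + b;
        step1[31 - i] = pass == 0 ? (a - b) * 4 : a - b;
      }
    }
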
- // Stage 2
- {
- step2[ 0] = _mm_add_epi16(step1[0], step1[15]);
- step2[ 1] = _mm_add_epi16(step1[1], step1[14]);
- step2[ 2] = _mm_add_epi16(step1[2], step1[13]);
- step2[ 3] = _mm_add_epi16(step1[3], step1[12]);
- step2[ 4] = _mm_add_epi16(step1[4], step1[11]);
- step2[ 5] = _mm_add_epi16(step1[5], step1[10]);
- step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]);
- step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]);
- step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]);
- step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]);
- step2[10] = _mm_sub_epi16(step1[5], step1[10]);
- step2[11] = _mm_sub_epi16(step1[4], step1[11]);
- step2[12] = _mm_sub_epi16(step1[3], step1[12]);
- step2[13] = _mm_sub_epi16(step1[2], step1[13]);
- step2[14] = _mm_sub_epi16(step1[1], step1[14]);
- step2[15] = _mm_sub_epi16(step1[0], step1[15]);
- }
- {
- const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
- const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
- const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
- const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
- const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
- const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
- const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
- const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
- const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
- const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
- const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
- const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
- const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
- const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
- const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
- const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
- const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
- const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
- const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
- const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
- const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
- const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
- const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
- const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
- const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
- const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
- const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
- const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
- const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
- const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
- const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
- const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
- const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
- const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
- const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
- const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
- const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
- const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
- const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
- // Combine
- step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
- step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
- step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
- step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
- step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
- step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
- step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
- step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
- }
- // Stage 3
- {
- step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
- step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
- step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
- step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
- step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
- step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
- step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
- step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
- }
- {
- const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
- const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
- const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
- const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
- const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
- const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
- const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
- const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
- const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
- const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
- const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
- const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
- const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
- const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
- const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
- const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
- const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
- const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
- const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
- // Combine
- step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
- step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
- step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
- step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
- }
- {
- step3[16] = _mm_add_epi16(step2[23], step1[16]);
- step3[17] = _mm_add_epi16(step2[22], step1[17]);
- step3[18] = _mm_add_epi16(step2[21], step1[18]);
- step3[19] = _mm_add_epi16(step2[20], step1[19]);
- step3[20] = _mm_sub_epi16(step1[19], step2[20]);
- step3[21] = _mm_sub_epi16(step1[18], step2[21]);
- step3[22] = _mm_sub_epi16(step1[17], step2[22]);
- step3[23] = _mm_sub_epi16(step1[16], step2[23]);
- step3[24] = _mm_sub_epi16(step1[31], step2[24]);
- step3[25] = _mm_sub_epi16(step1[30], step2[25]);
- step3[26] = _mm_sub_epi16(step1[29], step2[26]);
- step3[27] = _mm_sub_epi16(step1[28], step2[27]);
- step3[28] = _mm_add_epi16(step2[27], step1[28]);
- step3[29] = _mm_add_epi16(step2[26], step1[29]);
- step3[30] = _mm_add_epi16(step2[25], step1[30]);
- step3[31] = _mm_add_epi16(step2[24], step1[31]);
- }
-    // Scale the magnitude down by a factor of 4 (with rounding) so the
-    // intermediate values stay within the range of 16 bits.
- if (1 == pass) {
- __m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero);
- __m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero);
- __m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero);
- __m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero);
- __m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero);
- __m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero);
- __m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero);
- __m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero);
- __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
- __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
- __m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero);
- __m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero);
- __m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero);
- __m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero);
- __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
- __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
- __m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero);
- __m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero);
- __m128i s3_18_0 = _mm_cmplt_epi16(step3[18], kZero);
- __m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero);
- __m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero);
- __m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero);
- __m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero);
- __m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero);
- __m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero);
- __m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero);
- __m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero);
- __m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero);
- __m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero);
- __m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero);
- __m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero);
- __m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero);
- step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0);
- step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0);
- step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0);
- step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0);
- step3[ 4] = _mm_sub_epi16(step3[ 4], s3_04_0);
- step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0);
- step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0);
- step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0);
- step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
- step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
- step3[10] = _mm_sub_epi16(step3[10], s3_10_0);
- step3[11] = _mm_sub_epi16(step3[11], s3_11_0);
- step3[12] = _mm_sub_epi16(step3[12], s3_12_0);
- step3[13] = _mm_sub_epi16(step3[13], s3_13_0);
- step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
- step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
- step3[16] = _mm_sub_epi16(step3[16], s3_16_0);
- step3[17] = _mm_sub_epi16(step3[17], s3_17_0);
- step3[18] = _mm_sub_epi16(step3[18], s3_18_0);
- step3[19] = _mm_sub_epi16(step3[19], s3_19_0);
- step3[20] = _mm_sub_epi16(step3[20], s3_20_0);
- step3[21] = _mm_sub_epi16(step3[21], s3_21_0);
- step3[22] = _mm_sub_epi16(step3[22], s3_22_0);
- step3[23] = _mm_sub_epi16(step3[23], s3_23_0);
- step3[24] = _mm_sub_epi16(step3[24], s3_24_0);
- step3[25] = _mm_sub_epi16(step3[25], s3_25_0);
- step3[26] = _mm_sub_epi16(step3[26], s3_26_0);
- step3[27] = _mm_sub_epi16(step3[27], s3_27_0);
- step3[28] = _mm_sub_epi16(step3[28], s3_28_0);
- step3[29] = _mm_sub_epi16(step3[29], s3_29_0);
- step3[30] = _mm_sub_epi16(step3[30], s3_30_0);
- step3[31] = _mm_sub_epi16(step3[31], s3_31_0);
- step3[ 0] = _mm_add_epi16(step3[ 0], kOne);
- step3[ 1] = _mm_add_epi16(step3[ 1], kOne);
- step3[ 2] = _mm_add_epi16(step3[ 2], kOne);
- step3[ 3] = _mm_add_epi16(step3[ 3], kOne);
- step3[ 4] = _mm_add_epi16(step3[ 4], kOne);
- step3[ 5] = _mm_add_epi16(step3[ 5], kOne);
- step3[ 6] = _mm_add_epi16(step3[ 6], kOne);
- step3[ 7] = _mm_add_epi16(step3[ 7], kOne);
- step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
- step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
- step3[10] = _mm_add_epi16(step3[10], kOne);
- step3[11] = _mm_add_epi16(step3[11], kOne);
- step3[12] = _mm_add_epi16(step3[12], kOne);
- step3[13] = _mm_add_epi16(step3[13], kOne);
- step2[14] = _mm_add_epi16(step2[14], kOne);
- step2[15] = _mm_add_epi16(step2[15], kOne);
- step3[16] = _mm_add_epi16(step3[16], kOne);
- step3[17] = _mm_add_epi16(step3[17], kOne);
- step3[18] = _mm_add_epi16(step3[18], kOne);
- step3[19] = _mm_add_epi16(step3[19], kOne);
- step3[20] = _mm_add_epi16(step3[20], kOne);
- step3[21] = _mm_add_epi16(step3[21], kOne);
- step3[22] = _mm_add_epi16(step3[22], kOne);
- step3[23] = _mm_add_epi16(step3[23], kOne);
- step3[24] = _mm_add_epi16(step3[24], kOne);
- step3[25] = _mm_add_epi16(step3[25], kOne);
- step3[26] = _mm_add_epi16(step3[26], kOne);
- step3[27] = _mm_add_epi16(step3[27], kOne);
- step3[28] = _mm_add_epi16(step3[28], kOne);
- step3[29] = _mm_add_epi16(step3[29], kOne);
- step3[30] = _mm_add_epi16(step3[30], kOne);
- step3[31] = _mm_add_epi16(step3[31], kOne);
- step3[ 0] = _mm_srai_epi16(step3[ 0], 2);
- step3[ 1] = _mm_srai_epi16(step3[ 1], 2);
- step3[ 2] = _mm_srai_epi16(step3[ 2], 2);
- step3[ 3] = _mm_srai_epi16(step3[ 3], 2);
- step3[ 4] = _mm_srai_epi16(step3[ 4], 2);
- step3[ 5] = _mm_srai_epi16(step3[ 5], 2);
- step3[ 6] = _mm_srai_epi16(step3[ 6], 2);
- step3[ 7] = _mm_srai_epi16(step3[ 7], 2);
- step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
- step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
- step3[10] = _mm_srai_epi16(step3[10], 2);
- step3[11] = _mm_srai_epi16(step3[11], 2);
- step3[12] = _mm_srai_epi16(step3[12], 2);
- step3[13] = _mm_srai_epi16(step3[13], 2);
- step2[14] = _mm_srai_epi16(step2[14], 2);
- step2[15] = _mm_srai_epi16(step2[15], 2);
- step3[16] = _mm_srai_epi16(step3[16], 2);
- step3[17] = _mm_srai_epi16(step3[17], 2);
- step3[18] = _mm_srai_epi16(step3[18], 2);
- step3[19] = _mm_srai_epi16(step3[19], 2);
- step3[20] = _mm_srai_epi16(step3[20], 2);
- step3[21] = _mm_srai_epi16(step3[21], 2);
- step3[22] = _mm_srai_epi16(step3[22], 2);
- step3[23] = _mm_srai_epi16(step3[23], 2);
- step3[24] = _mm_srai_epi16(step3[24], 2);
- step3[25] = _mm_srai_epi16(step3[25], 2);
- step3[26] = _mm_srai_epi16(step3[26], 2);
- step3[27] = _mm_srai_epi16(step3[27], 2);
- step3[28] = _mm_srai_epi16(step3[28], 2);
- step3[29] = _mm_srai_epi16(step3[29], 2);
- step3[30] = _mm_srai_epi16(step3[30], 2);
- step3[31] = _mm_srai_epi16(step3[31], 2);
- }
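
Editor's note: a scalar sketch of the scaling just applied (hypothetical helper; assumes an arithmetic right shift, as _mm_srai_epi16 provides). The _mm_cmplt_epi16 masks are -1 exactly where a lane is negative, so the _mm_sub_epi16 round adds 1 there before the +1 bias and the shift. Note the tie-breaking differs from the (x > 0) variant used on the pass-0 output elsewhere in this patch: here ties go toward zero.

    #include <stdint.h>

    /* (x + 1 + (x < 0)) >> 2: divide by 4, rounding to nearest,
     * ties toward zero. */
    static int16_t pass1_scale(int16_t x) {
      return (int16_t)((x + 1 + (x < 0)) >> 2);
    }
    /* pass1_scale(6) == 1, pass1_scale(-6) == -1: +/-1.5 rounds toward 0. */
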
- // Stage 4
- {
- step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]);
- step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]);
- step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]);
- step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]);
- step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]);
- step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]);
- step1[10] = _mm_sub_epi16(step2[ 9], step3[10]);
- step1[11] = _mm_sub_epi16(step2[ 8], step3[11]);
- step1[12] = _mm_sub_epi16(step2[15], step3[12]);
- step1[13] = _mm_sub_epi16(step2[14], step3[13]);
- step1[14] = _mm_add_epi16(step3[13], step2[14]);
- step1[15] = _mm_add_epi16(step3[12], step2[15]);
- }
- {
- const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
- const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
- const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
- const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
- const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
- const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
- const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
- const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
- const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
- // Combine
- step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
- step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
- }
- {
- const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
- const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
- const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
- const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
- const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
- const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
- const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
- const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
- const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
- const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
- const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
- const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
- const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
- const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
- const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
- const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
- const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
- const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
- const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
- const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
- const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
- const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
- const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
- const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
- const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
- const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
- const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
- const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
- const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
- const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
- const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
- const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
- const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
- const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
- const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
- const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
- const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
- const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
- const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
- // Combine
- step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
- step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
- step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
- step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
- step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
- step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
- step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
- step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
- }
- // Stage 5
- {
- step2[4] = _mm_add_epi16(step1[5], step3[4]);
- step2[5] = _mm_sub_epi16(step3[4], step1[5]);
- step2[6] = _mm_sub_epi16(step3[7], step1[6]);
- step2[7] = _mm_add_epi16(step1[6], step3[7]);
- }
- {
- const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
- const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
- const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
- const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
- const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
- const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
- const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
- const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
- const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
- const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
- const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
- const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
- const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
- const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
- const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
- const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
- const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
- const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
- const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
- const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
- const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
- const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
- const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
- const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
- const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
- const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
- const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
- // Combine
- out[ 0] = _mm_packs_epi32(out_00_6, out_00_7);
- out[16] = _mm_packs_epi32(out_16_6, out_16_7);
- out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
- out[24] = _mm_packs_epi32(out_24_6, out_24_7);
- }
- {
- const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]);
- const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]);
- const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
- const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
- const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
- const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
- const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
- const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
- const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
- const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
- const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
- const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
- const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
- const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
- const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
- const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
- const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
- const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
- const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
- // Combine
- step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7);
- step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
- step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
- step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
- }
- {
- step2[16] = _mm_add_epi16(step1[19], step3[16]);
- step2[17] = _mm_add_epi16(step1[18], step3[17]);
- step2[18] = _mm_sub_epi16(step3[17], step1[18]);
- step2[19] = _mm_sub_epi16(step3[16], step1[19]);
- step2[20] = _mm_sub_epi16(step3[23], step1[20]);
- step2[21] = _mm_sub_epi16(step3[22], step1[21]);
- step2[22] = _mm_add_epi16(step1[21], step3[22]);
- step2[23] = _mm_add_epi16(step1[20], step3[23]);
- step2[24] = _mm_add_epi16(step1[27], step3[24]);
- step2[25] = _mm_add_epi16(step1[26], step3[25]);
- step2[26] = _mm_sub_epi16(step3[25], step1[26]);
- step2[27] = _mm_sub_epi16(step3[24], step1[27]);
- step2[28] = _mm_sub_epi16(step3[31], step1[28]);
- step2[29] = _mm_sub_epi16(step3[30], step1[29]);
- step2[30] = _mm_add_epi16(step1[29], step3[30]);
- step2[31] = _mm_add_epi16(step1[28], step3[31]);
- }
- // Stage 6
- {
- const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
- const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
- const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
- const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
- const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
- const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
- const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
- const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
- const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
- const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
- const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
- const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
- const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
- const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
- const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
- const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
- // dct_const_round_shift
- const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
- const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
- const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
- const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
- const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
- const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
- const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
- const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
- const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
- const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
- const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
- const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
- const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
- const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
- const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
- const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
- // Combine
- out[ 4] = _mm_packs_epi32(out_04_6, out_04_7);
- out[20] = _mm_packs_epi32(out_20_6, out_20_7);
- out[12] = _mm_packs_epi32(out_12_6, out_12_7);
- out[28] = _mm_packs_epi32(out_28_6, out_28_7);
- }
- {
- step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]);
- step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]);
- step3[10] = _mm_sub_epi16(step1[11], step2[10]);
- step3[11] = _mm_add_epi16(step2[10], step1[11]);
- step3[12] = _mm_add_epi16(step2[13], step1[12]);
- step3[13] = _mm_sub_epi16(step1[12], step2[13]);
- step3[14] = _mm_sub_epi16(step1[15], step2[14]);
- step3[15] = _mm_add_epi16(step2[14], step1[15]);
- }
- {
- const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
- const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
- const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
- const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
- const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
- const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
- const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
- const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
- const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
- const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
- const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
- const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
- const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
- const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
- const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
- const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
- const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
- const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
- const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
- const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
- const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
- const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
- const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
- const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
- // dct_const_round_shift
- const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
- const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
- const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
- const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
- const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
- const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
- const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
- const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
- const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
- const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
- const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
- const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
- const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
- const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
- const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
- const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
- // Combine
- step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
- step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
- step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
- step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
- // Combine
- step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
- step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
- step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
- step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
- }
- // Stage 7
- {
- const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]);
- const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]);
- const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]);
- const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]);
- const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
- const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
- const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
- const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
- const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
- const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
- const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
- const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
- const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
- const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
- const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
- const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
- const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
- const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
- const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
- const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
- const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
- const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
- const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
- const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
- // dct_const_round_shift
- const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
- const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
- const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
- const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
- const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
- const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
- const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
- const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
- const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
- const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
- const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
- const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
- const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
- const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
- const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
- const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
- const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
- const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
- const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
- const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
- const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
- const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
- const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
- const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
- const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
- const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
- const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
- const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
- const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
- const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
- const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
- const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
- // Combine
- out[ 2] = _mm_packs_epi32(out_02_6, out_02_7);
- out[18] = _mm_packs_epi32(out_18_6, out_18_7);
- out[10] = _mm_packs_epi32(out_10_6, out_10_7);
- out[26] = _mm_packs_epi32(out_26_6, out_26_7);
- out[ 6] = _mm_packs_epi32(out_06_6, out_06_7);
- out[22] = _mm_packs_epi32(out_22_6, out_22_7);
- out[14] = _mm_packs_epi32(out_14_6, out_14_7);
- out[30] = _mm_packs_epi32(out_30_6, out_30_7);
- }
- {
- step1[16] = _mm_add_epi16(step3[17], step2[16]);
- step1[17] = _mm_sub_epi16(step2[16], step3[17]);
- step1[18] = _mm_sub_epi16(step2[19], step3[18]);
- step1[19] = _mm_add_epi16(step3[18], step2[19]);
- step1[20] = _mm_add_epi16(step3[21], step2[20]);
- step1[21] = _mm_sub_epi16(step2[20], step3[21]);
- step1[22] = _mm_sub_epi16(step2[23], step3[22]);
- step1[23] = _mm_add_epi16(step3[22], step2[23]);
- step1[24] = _mm_add_epi16(step3[25], step2[24]);
- step1[25] = _mm_sub_epi16(step2[24], step3[25]);
- step1[26] = _mm_sub_epi16(step2[27], step3[26]);
- step1[27] = _mm_add_epi16(step3[26], step2[27]);
- step1[28] = _mm_add_epi16(step3[29], step2[28]);
- step1[29] = _mm_sub_epi16(step2[28], step3[29]);
- step1[30] = _mm_sub_epi16(step2[31], step3[30]);
- step1[31] = _mm_add_epi16(step3[30], step2[31]);
- }
-  // Final stage: output indices are bit-reversed.
- {
- const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
- const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
- const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
- const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
- const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
- const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
- const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
- const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
- const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
- const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
- const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
- const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
- const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
- const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
- const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
- const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
- const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
- const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
- const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
- const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
- const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
- const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
- const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
- const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
- // dct_const_round_shift
- const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
- const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
- const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
- const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
- const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
- const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
- const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
- const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
- const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
- const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
- const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
- const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
- const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
- const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
- const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
- const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
- const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
- const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
- const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
- const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
- const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
- const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
- const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
- const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
- const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
- const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
- const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
- const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
- const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
- const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
- const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
- const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
- // Combine
- out[ 1] = _mm_packs_epi32(out_01_6, out_01_7);
- out[17] = _mm_packs_epi32(out_17_6, out_17_7);
- out[ 9] = _mm_packs_epi32(out_09_6, out_09_7);
- out[25] = _mm_packs_epi32(out_25_6, out_25_7);
- out[ 7] = _mm_packs_epi32(out_07_6, out_07_7);
- out[23] = _mm_packs_epi32(out_23_6, out_23_7);
- out[15] = _mm_packs_epi32(out_15_6, out_15_7);
- out[31] = _mm_packs_epi32(out_31_6, out_31_7);
- }
- {
- const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
- const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
- const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
- const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
- const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
- const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
- const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
- const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
- const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
- const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
- const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
- const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
- const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
- const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
- const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
- const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
- const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
- const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
- const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
- const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
- const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
- const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
- const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
- const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
- // dct_const_round_shift
- const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
- const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
- const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
- const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
- const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
- const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
- const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
- const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
- const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
- const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
- const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
- const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
- const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
- const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
- const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
- const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
- const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
- const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
- const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
- const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
- const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
- const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
- const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
- const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
- const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
- const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
- const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
- const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
- const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
- const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
- const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
- const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
- // Combine
- out[ 5] = _mm_packs_epi32(out_05_6, out_05_7);
- out[21] = _mm_packs_epi32(out_21_6, out_21_7);
- out[13] = _mm_packs_epi32(out_13_6, out_13_7);
- out[29] = _mm_packs_epi32(out_29_6, out_29_7);
- out[ 3] = _mm_packs_epi32(out_03_6, out_03_7);
- out[19] = _mm_packs_epi32(out_19_6, out_19_7);
- out[11] = _mm_packs_epi32(out_11_6, out_11_7);
- out[27] = _mm_packs_epi32(out_27_6, out_27_7);
- }
-  // Transpose the results; do it as four 8x8 transposes.
- {
- int transpose_block;
- int16_t *output;
- if (0 == pass) {
- output = &intermediate[column_start * 32];
- } else {
- output = &output_org[column_start * 32];
- }
- for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
- __m128i *this_out = &out[8 * transpose_block];
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
-        // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
-        // 04 14 24 34 05 15 25 35
-        // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- if (0 == pass) {
- // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
- // TODO(cd): see quality impact of only doing
- // output[j] = (output[j] + 1) >> 2;
- // which would remove the code between here ...
- __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
- __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
- __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
- __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
- __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
- __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
- __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
- __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
- tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
- tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
- tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
- tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
- tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
- tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
- tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
- tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
- // ... and here.
- // PS: also change code in vp9/encoder/vp9_dct.c
- tr2_0 = _mm_add_epi16(tr2_0, kOne);
- tr2_1 = _mm_add_epi16(tr2_1, kOne);
- tr2_2 = _mm_add_epi16(tr2_2, kOne);
- tr2_3 = _mm_add_epi16(tr2_3, kOne);
- tr2_4 = _mm_add_epi16(tr2_4, kOne);
- tr2_5 = _mm_add_epi16(tr2_5, kOne);
- tr2_6 = _mm_add_epi16(tr2_6, kOne);
- tr2_7 = _mm_add_epi16(tr2_7, kOne);
- tr2_0 = _mm_srai_epi16(tr2_0, 2);
- tr2_1 = _mm_srai_epi16(tr2_1, 2);
- tr2_2 = _mm_srai_epi16(tr2_2, 2);
- tr2_3 = _mm_srai_epi16(tr2_3, 2);
- tr2_4 = _mm_srai_epi16(tr2_4, 2);
- tr2_5 = _mm_srai_epi16(tr2_5, 2);
- tr2_6 = _mm_srai_epi16(tr2_6, 2);
- tr2_7 = _mm_srai_epi16(tr2_7, 2);
- }
- // Note: even though all these stores are aligned, using the aligned
-        // intrinsic makes the code slightly slower.
- _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0);
- _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1);
- _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2);
- _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3);
- _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4);
- _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5);
- _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6);
- _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7);
- // Process next 8x8
- output += 8;
- }
- }
- }
- }
-}
+#define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D vp9_short_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
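The bulk deletion above moves the 32x32 forward DCT out of vp9_dct_sse2.c: the file now instantiates the shared template in vp9_dct32x32_sse2.c twice, once per rounding mode, by defining FDCT32x32_2D (the emitted function name) and FDCT32x32_HIGH_PRECISION before each #include. A minimal sketch of the same include-twice pattern, with hypothetical file and macro names:

    /* scale_impl.inc -- hypothetical template; the includer defines
     * SCALE_FN (function name) and SCALE_HIGH_PRECISION (0 or 1). */
    static int SCALE_FN(int x) {
    #if SCALE_HIGH_PRECISION
      return (x + 2) >> 2;   /* round to nearest */
    #else
      return x >> 2;         /* truncate: cheaper, less accurate */
    #endif
    }

    /* scale.c -- instantiate both variants from the one template. */
    #define SCALE_FN scale_rd
    #define SCALE_HIGH_PRECISION 0
    #include "scale_impl.inc"
    #undef SCALE_FN
    #undef SCALE_HIGH_PRECISION

    #define SCALE_FN scale
    #define SCALE_HIGH_PRECISION 1
    #include "scale_impl.inc"
    #undef SCALE_FN
    #undef SCALE_HIGH_PRECISION

Each #undef resets the parameters so the next instantiation starts clean.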
diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm
index 60f7991..db30660 100644
--- a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -36,6 +36,14 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pshufd m4, m4, 0
mova m2, [quantq] ; m2 = quant
paddw m0, m4 ; m0 = zbin + zbin_oq
+%ifidn %1, b_32x32
+ pcmpeqw m5, m5
+ psrlw m5, 15
+ paddw m0, m5
+ paddw m1, m5
+ psrlw m0, 1 ; m0 = (m0 + 1) / 2
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
mova m3, [r2q] ; m3 = dequant
psubw m0, [pw_1]
mov r2, shiftmp
@@ -43,6 +51,9 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
mova m4, [r2] ; m4 = shift
mov r4, dqcoeffmp
mov r5, iscanmp
+%ifidn %1, b_32x32
+ psllw m4, 1
+%endif
pxor m5, m5 ; m5 = dedicated zero
DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
lea coeffq, [ coeffq+ncoeffq*2]
@@ -56,16 +67,12 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
-%ifidn %1, b_32x32
- paddw m6, m6
- paddw m11, m11
-%endif
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
punpckhqdq m0, m0
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
- paddw m6, m1 ; m6 += round
+ paddsw m6, m1 ; m6 += round
punpckhqdq m1, m1
- paddw m11, m1 ; m11 += round
+ paddsw m11, m1 ; m11 += round
pmulhw m8, m6, m2 ; m8 = m6*q>>16
punpckhqdq m2, m2
pmulhw m13, m11, m2 ; m13 = m11*q>>16
@@ -112,10 +119,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
-%ifidn %1, b_32x32
- paddw m6, m6
- paddw m11, m11
-%endif
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
@@ -124,8 +127,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
or r6, r2
jz .skip_iter
%endif
- paddw m6, m1 ; m6 += round
- paddw m11, m1 ; m11 += round
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
pmulhw m14, m6, m2 ; m14 = m6*q>>16
pmulhw m13, m11, m2 ; m13 = m11*q>>16
paddw m14, m6 ; m14 += m6
@@ -164,6 +167,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pmaxsw m8, m13
add ncoeffq, mmsize
jl .ac_only_loop
+
%ifidn %1, b_32x32
jmp .accumulate_eob
.skip_iter:
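For the b_32x32 variant, the old code doubled the coefficient magnitudes (paddw m6, m6) before the zbin compare and rounding, which risks wrapping the signed 16-bit lanes. The rework keeps the coefficients as-is and instead halves zbin and round on entry (the pcmpeqw/psrlw pair builds a vector of ones for the (x + 1) >> 1 rounding) and doubles the quant shift (psllw m4, 1); the paddw to paddsw change makes the remaining round add saturate rather than wrap. A scalar sketch of the reworked path, with hypothetical names and a simplified final multiply:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sketch only: mirrors the 32x32 rescaling idea, not the exact
     * vp9 quantizer. zbin/round are halved, shift is doubled. */
    static int16_t quantize_32x32_sketch(int16_t coeff, int16_t zbin,
                                         int16_t round, int16_t quant,
                                         int16_t shift) {
      const int32_t abs_coeff = abs(coeff);
      int32_t tmp;
      if (abs_coeff < ((zbin + 1) >> 1))     /* halved zbin threshold */
        return 0;
      tmp = abs_coeff + ((round + 1) >> 1);  /* halved round ... */
      if (tmp > INT16_MAX) tmp = INT16_MAX;  /* ... added with saturation */
      tmp += (tmp * quant) >> 16;            /* pmulhw + add step */
      return (int16_t)((tmp * (shift << 1)) >> 16);  /* doubled shift */
    }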
diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
index 19e2feb..533456b 100644
--- a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
+++ b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
@@ -270,8 +270,13 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%if mmsize == 16
movhps m2, [srcq+src_strideq*2]
%else ; mmsize == 8
+%if %1 == 4
+ movh m1, [srcq+src_strideq*2]
+ punpckldq m2, m1
+%else
punpckldq m2, [srcq+src_strideq*2]
%endif
+%endif
movh m1, [dstq]
%if mmsize == 16
movlhps m0, m2
@@ -542,9 +547,16 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
movhps m2, [srcq+src_strideq]
movhps m3, [srcq+src_strideq+1]
%else
+%if %1 == 4
+ movh m1, [srcq+src_strideq]
+ punpckldq m2, m1
+ movh m1, [srcq+src_strideq+1]
+ punpckldq m3, m1
+%else
punpckldq m2, [srcq+src_strideq]
punpckldq m3, [srcq+src_strideq+1]
%endif
+%endif
pavgb m2, m3
%if mmsize == 16
movlhps m0, m2
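Both hunks above fix the mmsize == 8 path for 4-pixel-wide blocks: punpckldq with a memory operand performs a full 8-byte load, so on a 4-byte row it reads past the data the caller owns. Loading the 4 bytes into a register first (movh) and interleaving register-to-register keeps the access in bounds. A small intrinsics sketch of the same idea, with hypothetical names:

    #include <emmintrin.h>
    #include <stdint.h>
    #include <string.h>

    /* Interleave two 4-byte rows without an 8-byte memory load
     * (the register-to-register equivalent of the movh fix). */
    static __m128i load_two_rows_4wide(const uint8_t *src, int stride) {
      int32_t r0, r1;
      memcpy(&r0, src, 4);           /* exactly the 4 valid bytes */
      memcpy(&r1, src + stride, 4);
      return _mm_unpacklo_epi32(_mm_cvtsi32_si128(r0),
                                _mm_cvtsi32_si128(r1));
    }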
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm
index d3dbefe..3501cf1 100644
--- a/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm
+++ b/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm
@@ -342,8 +342,8 @@ sym(vp9_get4x4var_mmx):
movsxd rdx, dword ptr arg(3) ;[recon_stride]
; Row 1
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm1, [rbx] ; Copy eight bytes to mm1
+ movd mm0, [rax] ; Copy 4 bytes to mm0
+ movd mm1, [rbx] ; Copy 4 bytes to mm1
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
@@ -351,12 +351,12 @@ sym(vp9_get4x4var_mmx):
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
+ movd mm1, [rbx] ; Copy 4 bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 2
- movq mm0, [rax] ; Copy eight bytes to mm0
+ movd mm0, [rax] ; Copy 4 bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
@@ -365,11 +365,11 @@ sym(vp9_get4x4var_mmx):
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
+ movd mm1, [rbx] ; Copy 4 bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 3
- movq mm0, [rax] ; Copy eight bytes to mm0
+ movd mm0, [rax] ; Copy 4 bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
@@ -378,11 +378,11 @@ sym(vp9_get4x4var_mmx):
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
+ movd mm1, [rbx] ; Copy 4 bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 4
- movq mm0, [rax] ; Copy eight bytes to mm0
+ movd mm0, [rax] ; Copy 4 bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
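vp9_get4x4var_mmx works on 4-pixel rows, so the movq (8-byte) loads were reading 4 bytes past the end of every row; movd loads exactly the 4 bytes the block owns. For reference, a scalar sketch of what the routine accumulates (hypothetical name):

    #include <stdint.h>

    /* Sum of differences and sum of squared differences over a
     * 4x4 block -- each row is only 4 bytes wide. */
    static void get4x4var_sketch(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 int *sse, int *sum) {
      int r, c;
      *sse = 0;
      *sum = 0;
      for (r = 0; r < 4; ++r) {
        for (c = 0; c < 4; ++c) {
          const int d = src[c] - ref[c];
          *sum += d;
          *sse += d * d;
        }
        src += src_stride;
        ref += ref_stride;
      }
    }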
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
index b4ff850..cea934d 100644
--- a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
@@ -244,7 +244,7 @@ unsigned int vp9_variance16x16_sse2
return (var - (((unsigned int)avg * avg) >> 8));
}
-unsigned int vp9_mse16x16_wmt(
+unsigned int vp9_mse16x16_sse2(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
@@ -500,7 +500,7 @@ FNS(ssse3, ssse3);
#undef FNS
#undef FN
-unsigned int vp9_variance_halfpixvar16x16_h_wmt(
+unsigned int vp9_variance_halfpixvar16x16_h_sse2(
const unsigned char *src_ptr,
int src_pixels_per_line,
const unsigned char *dst_ptr,
@@ -519,7 +519,7 @@ unsigned int vp9_variance_halfpixvar16x16_h_wmt(
}
-unsigned int vp9_variance_halfpixvar16x16_v_wmt(
+unsigned int vp9_variance_halfpixvar16x16_v_sse2(
const unsigned char *src_ptr,
int src_pixels_per_line,
const unsigned char *dst_ptr,
@@ -537,7 +537,7 @@ unsigned int vp9_variance_halfpixvar16x16_v_wmt(
}
-unsigned int vp9_variance_halfpixvar16x16_hv_wmt(
+unsigned int vp9_variance_halfpixvar16x16_hv_sse2(
const unsigned char *src_ptr,
int src_pixels_per_line,
const unsigned char *dst_ptr,