ref: 7105e66d19811add930055959598beb68f21df29
parent: 75653b70325dfa1093bd3794ca78d7d06773f13e
author: Urvang Joshi <urvang@google.com>
date: Fri Jul 28 11:57:22 EDT 2017
Remove the DP version of vp9_optimize_b(). The greedy version was already enabled by default in https://chromium-review.googlesource.com/c/546848/ and the speed and compression gains from the greedy version were already reported in https://chromium-review.googlesource.com/c/531675/ Change-Id: Iad9f7d03490c845ad1e230af028c9d39edddca97
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -53,10 +53,6 @@
{ 10, 6 }, { 8, 5 },
};
-#define USE_GREEDY_OPTIMIZE_B 1
-
-#if USE_GREEDY_OPTIMIZE_B
-
// 'num' can be negative, but 'shift' must be non-negative.
#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \
((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift))
@@ -305,285 +301,6 @@
return final_eob;
}
#undef RIGHT_SHIFT_POSSIBLY_NEGATIVE
-
-#else
-
-#define UPDATE_RD_COST() \
- { \
- rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \
- rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \
- }
-
-// This function is a place holder for now but may ultimately need
-// to scan previous tokens to work out the correct context.
-static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb,
- int idx, int token, uint8_t *token_cache) {
- int bak = token_cache[scan[idx]], pt;
- token_cache[scan[idx]] = vp9_pt_energy_class[token];
- pt = get_coef_context(nb, token_cache, idx + 1);
- token_cache[scan[idx]] = bak;
- return pt;
-}
-
-static const int16_t band_count_table[TX_SIZES][8] = {
- { 1, 2, 3, 4, 3, 16 - 13, 0 },
- { 1, 2, 3, 4, 11, 64 - 21, 0 },
- { 1, 2, 3, 4, 11, 256 - 21, 0 },
- { 1, 2, 3, 4, 11, 1024 - 21, 0 },
-};
-static const int16_t band_cum_count_table[TX_SIZES][8] = {
- { 0, 1, 3, 6, 10, 13, 16, 0 },
- { 0, 1, 3, 6, 10, 21, 64, 0 },
- { 0, 1, 3, 6, 10, 21, 256, 0 },
- { 0, 1, 3, 6, 10, 21, 1024, 0 },
-};
-
-typedef struct vp9_token_state {
- int64_t error;
- int rate;
- int16_t next;
- int16_t token;
- tran_low_t qc;
- tran_low_t dqc;
- uint8_t best_index;
-} vp9_token_state;
-
-int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
- int ctx) {
- MACROBLOCKD *const xd = &mb->e_mbd;
- struct macroblock_plane *const p = &mb->plane[plane];
- struct macroblockd_plane *const pd = &xd->plane[plane];
- const int ref = is_inter_block(xd->mi[0]);
- vp9_token_state tokens[1025][2];
- uint8_t token_cache[1024];
- const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
- tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
- tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- const int eob = p->eobs[block];
- const PLANE_TYPE type = get_plane_type(plane);
- const int default_eob = 16 << (tx_size << 1);
- const int shift = (tx_size == TX_32X32);
- const int16_t *const dequant_ptr = pd->dequant;
- const uint8_t *const band_translate = get_band_translate(tx_size);
- const scan_order *const so = get_scan(xd, tx_size, type, block);
- const int16_t *const scan = so->scan;
- const int16_t *const nb = so->neighbors;
- const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
- int next = eob, sz = 0;
- const int64_t rdmult = ((int64_t)mb->rdmult * plane_rd_mult[ref][type]) >> 1;
- const int64_t rddiv = mb->rddiv;
- int64_t rd_cost0, rd_cost1;
- int rate0, rate1;
- int64_t error0, error1;
- int16_t t0, t1;
- int best, band = (eob < default_eob) ? band_translate[eob]
- : band_translate[eob - 1];
- int pt, i, final_eob;
-#if CONFIG_VP9_HIGHBITDEPTH
- const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
-#else
- const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8);
-#endif
- unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
- mb->token_costs[tx_size][type][ref];
- const int16_t *band_counts = &band_count_table[tx_size][band];
- int16_t band_left = eob - band_cum_count_table[tx_size][band] + 1;
-
- token_costs += band;
-
- assert((!type && !plane) || (type && plane));
- assert(eob <= default_eob);
-
- /* Now set up a Viterbi trellis to evaluate alternative roundings. */
- /* Initialize the sentinel node of the trellis. */
- tokens[eob][0].rate = 0;
- tokens[eob][0].error = 0;
- tokens[eob][0].next = default_eob;
- tokens[eob][0].token = EOB_TOKEN;
- tokens[eob][0].qc = 0;
- tokens[eob][1] = tokens[eob][0];
-
- for (i = 0; i < eob; i++) {
- const int rc = scan[i];
- token_cache[rc] = vp9_pt_energy_class[vp9_get_token(qcoeff[rc])];
- }
-
- for (i = eob; i-- > 0;) {
- int base_bits, d2, dx;
- const int rc = scan[i];
- int x = qcoeff[rc];
- /* Only add a trellis state for non-zero coefficients. */
- if (x) {
- error0 = tokens[next][0].error;
- error1 = tokens[next][1].error;
- /* Evaluate the first possibility for this state. */
- rate0 = tokens[next][0].rate;
- rate1 = tokens[next][1].rate;
- base_bits = vp9_get_token_cost(x, &t0, cat6_high_cost);
- /* Consider both possible successor states. */
- if (next < default_eob) {
- pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
- rate0 += (*token_costs)[0][pt][tokens[next][0].token];
- rate1 += (*token_costs)[0][pt][tokens[next][1].token];
- }
- UPDATE_RD_COST();
- /* And pick the best. */
- best = rd_cost1 < rd_cost0;
- dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- dx >>= xd->bd - 8;
- }
-#endif // CONFIG_VP9_HIGHBITDEPTH
- d2 = dx * dx;
- tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
- tokens[i][0].error = d2 + (best ? error1 : error0);
- tokens[i][0].next = next;
- tokens[i][0].token = t0;
- tokens[i][0].qc = x;
- tokens[i][0].dqc = dqcoeff[rc];
- tokens[i][0].best_index = best;
-
- /* Evaluate the second possibility for this state. */
- rate0 = tokens[next][0].rate;
- rate1 = tokens[next][1].rate;
-
- if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
- (abs(x) * dequant_ptr[rc != 0] <
- (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) {
- sz = -(x < 0);
- x -= 2 * sz + 1;
- } else {
- tokens[i][1] = tokens[i][0];
- next = i;
-
- if (!(--band_left)) {
- --band_counts;
- band_left = *band_counts;
- --token_costs;
- }
- continue;
- }
-
- /* Consider both possible successor states. */
- if (!x) {
- /* If we reduced this coefficient to zero, check to see if
- * we need to move the EOB back here.
- */
- t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
- t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
- base_bits = 0;
- } else {
- base_bits = vp9_get_token_cost(x, &t0, cat6_high_cost);
- t1 = t0;
- }
- if (next < default_eob) {
- if (t0 != EOB_TOKEN) {
- pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
- rate0 += (*token_costs)[!x][pt][tokens[next][0].token];
- }
- if (t1 != EOB_TOKEN) {
- pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
- rate1 += (*token_costs)[!x][pt][tokens[next][1].token];
- }
- }
-
- UPDATE_RD_COST();
- /* And pick the best. */
- best = rd_cost1 < rd_cost0;
-
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
- } else {
- dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
- }
-#else
- dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-#endif // CONFIG_VP9_HIGHBITDEPTH
- d2 = dx * dx;
-
- tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
- tokens[i][1].error = d2 + (best ? error1 : error0);
- tokens[i][1].next = next;
- tokens[i][1].token = best ? t1 : t0;
- tokens[i][1].qc = x;
-
- if (x) {
- tran_low_t offset = dq_step[rc != 0];
- // The 32x32 transform coefficient uses half quantization step size.
- // Account for the rounding difference in the dequantized coefficeint
- // value when the quantization index is dropped from an even number
- // to an odd number.
- if (shift & x) offset += (dequant_ptr[rc != 0] & 0x01);
-
- if (sz == 0)
- tokens[i][1].dqc = dqcoeff[rc] - offset;
- else
- tokens[i][1].dqc = dqcoeff[rc] + offset;
- } else {
- tokens[i][1].dqc = 0;
- }
-
- tokens[i][1].best_index = best;
- /* Finally, make this the new head of the trellis. */
- next = i;
- } else {
- /* There's no choice to make for a zero coefficient, so we don't
- * add a new trellis node, but we do need to update the costs.
- */
- pt = get_coef_context(nb, token_cache, i + 1);
- t0 = tokens[next][0].token;
- t1 = tokens[next][1].token;
- /* Update the cost of each path if we're past the EOB token. */
- if (t0 != EOB_TOKEN) {
- tokens[next][0].rate += (*token_costs)[1][pt][t0];
- tokens[next][0].token = ZERO_TOKEN;
- }
- if (t1 != EOB_TOKEN) {
- tokens[next][1].rate += (*token_costs)[1][pt][t1];
- tokens[next][1].token = ZERO_TOKEN;
- }
- tokens[i][0].best_index = tokens[i][1].best_index = 0;
- /* Don't update next, because we didn't add a new node. */
- }
-
- if (!(--band_left)) {
- --band_counts;
- band_left = *band_counts;
- --token_costs;
- }
- }
-
- /* Now pick the best path through the whole trellis. */
- rate0 = tokens[next][0].rate;
- rate1 = tokens[next][1].rate;
- error0 = tokens[next][0].error;
- error1 = tokens[next][1].error;
- t0 = tokens[next][0].token;
- t1 = tokens[next][1].token;
- rate0 += (*token_costs)[0][ctx][t0];
- rate1 += (*token_costs)[0][ctx][t1];
- UPDATE_RD_COST();
- best = rd_cost1 < rd_cost0;
- final_eob = -1;
-
- for (i = next; i < eob; i = next) {
- const int x = tokens[i][best].qc;
- const int rc = scan[i];
- if (x) final_eob = i;
- qcoeff[rc] = x;
- dqcoeff[rc] = tokens[i][best].dqc;
- next = tokens[i][best].next;
- best = tokens[i][best].best_index;
- }
- final_eob++;
-
- mb->plane[plane].eobs[block] = final_eob;
- return final_eob;
-}
-
-#endif // USE_GREEDY_OPTIMIZE_B
static INLINE void fdct32x32(int rd_transform, const int16_t *src,
tran_low_t *dst, int src_stride) {