ref: 26b6318de83761dd268a589f0b1324153e9d0923
parent: c8defcfdeea614a780af9a2405f59c60cab876ad
author: Ronald S. Bultje <rbultje@google.com>
date: Mon Jul 1 06:40:00 EDT 2013
Make get_coef_context() branchless. This should significantly speed up cost_coeffs(). Basically what the patch does is to make the neighbour arrays padded by one item to prevent an eob check in get_coef_context(), then it populates each col/row scan and left/top edge coefficient with two times the same neighbour - this prevents a single/double context branch in get_coef_context(). Lastly, it populates neighbour arrays in pixel order (rather than scan order), so we don't have to dereference the scantable to get the correct neighbours. Total encoding time of first 50 frames of bus (speed 0) at 1500kbps goes from 2min10.1 to 2min5.3, i.e. a 2.6% overall speed increase. Change-Id: I42bcd2210fd7bec03767ef0e2945a665b851df56
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -461,25 +461,25 @@
// for each position in raster scan order.
// -1 indicates the neighbor does not exist.
DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
+ vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
+ vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
+ vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
+ vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
+ vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
+ vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
+ vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
+ vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
+ vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_32x32_neighbors[1024 * MAX_NEIGHBORS]);
+ vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
@@ -504,15 +504,17 @@
}
static void init_scan_neighbors(const int16_t *scan,
int16_t *iscan,
- int l, int16_t *neighbors,
- int max_neighbors) {
+ int l, int16_t *neighbors) {
int l2 = l * l;
int n, i, j;
- for (n = 0; n < l2; n++) {
+ // dc doesn't use this type of prediction
+ neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
+ neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
+ iscan[0] = find_in_scan(scan, l, 0);
+ for (n = 1; n < l2; n++) {
int rc = scan[n];
iscan[n] = find_in_scan(scan, l, n);
- assert(max_neighbors == MAX_NEIGHBORS);
i = rc / l;
j = rc % l;
if (i > 0 && j > 0) {
@@ -524,93 +526,84 @@
// Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
// as a context. If ADST or DCT is used in both directions, we
// use the combination of the two as a context.
- int a = find_in_scan(scan, l, (i - 1) * l + j);
- int b = find_in_scan(scan, l, i * l + j - 1);
+ int a = (i - 1) * l + j;
+ int b = i * l + j - 1;
if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||
scan == vp9_col_scan_16x16) {
- neighbors[max_neighbors * n + 0] = a;
- neighbors[max_neighbors * n + 1] = -1;
+ // in the col/row scan cases (as well as left/top edge cases), we set
+ // both contexts to the same value, so we can branchlessly do a+b+1>>1
+ // which automatically becomes a if a == b
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = a;
} else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||
scan == vp9_row_scan_16x16) {
- neighbors[max_neighbors * n + 0] = b;
- neighbors[max_neighbors * n + 1] = -1;
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = b;
} else {
- neighbors[max_neighbors * n + 0] = a;
- neighbors[max_neighbors * n + 1] = b;
+ neighbors[MAX_NEIGHBORS * n + 0] = a;
+ neighbors[MAX_NEIGHBORS * n + 1] = b;
}
} else if (i > 0) {
- neighbors[max_neighbors * n + 0] = find_in_scan(scan, l, (i - 1) * l + j);
- neighbors[max_neighbors * n + 1] = -1;
- } else if (j > 0) {
- neighbors[max_neighbors * n + 0] =
- find_in_scan(scan, l, i * l + j - 1);
- neighbors[max_neighbors * n + 1] = -1;
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j;
} else {
- assert(n == 0);
- // dc predictor doesn't use previous tokens
- neighbors[max_neighbors * n + 0] = -1;
+ assert(j > 0);
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1;
}
- assert(neighbors[max_neighbors * n + 0] < n);
+ assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n);
}
+ // one padding item so we don't have to add branches in code to handle
+ // calls to get_coef_context() for the token after the last scan position
+ neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
+ neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
}
void vp9_init_neighbors() {
init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
- vp9_default_scan_4x4_neighbors, MAX_NEIGHBORS);
+ vp9_default_scan_4x4_neighbors);
init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
- vp9_row_scan_4x4_neighbors, MAX_NEIGHBORS);
+ vp9_row_scan_4x4_neighbors);
init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
- vp9_col_scan_4x4_neighbors, MAX_NEIGHBORS);
+ vp9_col_scan_4x4_neighbors);
init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
- vp9_default_scan_8x8_neighbors, MAX_NEIGHBORS);
+ vp9_default_scan_8x8_neighbors);
init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
- vp9_row_scan_8x8_neighbors, MAX_NEIGHBORS);
+ vp9_row_scan_8x8_neighbors);
init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
- vp9_col_scan_8x8_neighbors, MAX_NEIGHBORS);
+ vp9_col_scan_8x8_neighbors);
init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
- vp9_default_scan_16x16_neighbors, MAX_NEIGHBORS);
+ vp9_default_scan_16x16_neighbors);
init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
- vp9_row_scan_16x16_neighbors, MAX_NEIGHBORS);
+ vp9_row_scan_16x16_neighbors);
init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
- vp9_col_scan_16x16_neighbors, MAX_NEIGHBORS);
+ vp9_col_scan_16x16_neighbors);
init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
- vp9_default_scan_32x32_neighbors, MAX_NEIGHBORS);
+ vp9_default_scan_32x32_neighbors);
}
-const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan, int *pad) {
+const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) {
if (scan == vp9_default_scan_4x4) {
- *pad = MAX_NEIGHBORS;
return vp9_default_scan_4x4_neighbors;
} else if (scan == vp9_row_scan_4x4) {
- *pad = MAX_NEIGHBORS;
return vp9_row_scan_4x4_neighbors;
} else if (scan == vp9_col_scan_4x4) {
- *pad = MAX_NEIGHBORS;
return vp9_col_scan_4x4_neighbors;
} else if (scan == vp9_default_scan_8x8) {
- *pad = MAX_NEIGHBORS;
return vp9_default_scan_8x8_neighbors;
} else if (scan == vp9_row_scan_8x8) {
- *pad = 2;
return vp9_row_scan_8x8_neighbors;
} else if (scan == vp9_col_scan_8x8) {
- *pad = 2;
return vp9_col_scan_8x8_neighbors;
} else if (scan == vp9_default_scan_16x16) {
- *pad = MAX_NEIGHBORS;
return vp9_default_scan_16x16_neighbors;
} else if (scan == vp9_row_scan_16x16) {
- *pad = 2;
return vp9_row_scan_16x16_neighbors;
} else if (scan == vp9_col_scan_16x16) {
- *pad = 2;
return vp9_col_scan_16x16_neighbors;
- } else if (scan == vp9_default_scan_32x32) {
- *pad = MAX_NEIGHBORS;
- return vp9_default_scan_32x32_neighbors;
} else {
- assert(0);
- return NULL;
+ assert(scan == vp9_default_scan_32x32);
+ return vp9_default_scan_32x32_neighbors;
}
}
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -166,28 +166,14 @@
}
#define MAX_NEIGHBORS 2
-static INLINE int get_coef_context(const int16_t *scan,
- const int16_t *neighbors,
- int nb_pad, uint8_t *token_cache,
- int c, int l) {
- int eob = l;
- assert(nb_pad == MAX_NEIGHBORS);
- if (c == eob) {
- return 0;
- } else {
- int ctx;
- assert(neighbors[MAX_NEIGHBORS * c + 0] >= 0);
- if (neighbors[MAX_NEIGHBORS * c + 1] >= 0) {
- ctx = (1 + token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]] +
- token_cache[scan[neighbors[MAX_NEIGHBORS * c + 1]]]) >> 1;
- } else {
- ctx = token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]];
- }
- return ctx;
- }
+static INLINE int get_coef_context(const int16_t *neighbors,
+ uint8_t *token_cache,
+ int c) {
+ return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
+ token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
}
-const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan, int *pad);
+const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan);
// 128 lists of probabilities are stored for the following ONE node probs:
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -97,7 +97,7 @@
TX_SIZE txfm_size, const int16_t *dq,
ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
ENTROPY_CONTEXT above_ec, left_ec;
- int pt, c = 0, pad, default_eob;
+ int pt, c = 0;
int band;
vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES];
vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
@@ -130,7 +130,6 @@
scan = get_scan_4x4(tx_type);
above_ec = A[0] != 0;
left_ec = L[0] != 0;
- default_eob = 16;
band_translate = vp9_coefband_trans_4x4;
break;
}
@@ -140,7 +139,6 @@
scan = get_scan_8x8(tx_type);
above_ec = (A[0] + A[1]) != 0;
left_ec = (L[0] + L[1]) != 0;
- default_eob = 64;
band_translate = vp9_coefband_trans_8x8plus;
break;
}
@@ -150,7 +148,6 @@
scan = get_scan_16x16(tx_type);
above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
- default_eob = 256;
band_translate = vp9_coefband_trans_8x8plus;
break;
}
@@ -158,13 +155,12 @@
scan = vp9_default_scan_32x32;
above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
- default_eob = 1024;
band_translate = vp9_coefband_trans_8x8plus;
break;
}
pt = combine_entropy_contexts(above_ec, left_ec);
- nb = vp9_get_coef_neighbors_handle(scan, &pad);
+ nb = vp9_get_coef_neighbors_handle(scan);
while (1) {
int val;
@@ -172,8 +168,7 @@
if (c >= seg_eob)
break;
if (c)
- pt = get_coef_context(scan, nb, pad, token_cache,
- c, default_eob);
+ pt = get_coef_context(nb, token_cache, c);
band = get_coef_band(band_translate, c);
prob = coef_probs[band][pt];
#if !CONFIG_BALANCED_COEFTREE
@@ -186,8 +181,7 @@
if (c >= seg_eob)
break;
if (c)
- pt = get_coef_context(scan, nb, pad, token_cache,
- c, default_eob);
+ pt = get_coef_context(nb, token_cache, c);
band = get_coef_band(band_translate, c);
prob = coef_probs[band][pt];
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -112,11 +112,10 @@
static int trellis_get_coeff_context(const int16_t *scan,
const int16_t *nb,
int idx, int token,
- uint8_t *token_cache,
- int pad, int l) {
+ uint8_t *token_cache) {
int bak = token_cache[scan[idx]], pt;
token_cache[scan[idx]] = vp9_pt_energy_class[token];
- pt = get_coef_context(scan, nb, pad, token_cache, idx + 1, l);
+ pt = get_coef_context(nb, token_cache, idx + 1);
token_cache[scan[idx]] = bak;
return pt;
}
@@ -141,7 +140,7 @@
int best, band, pt;
PLANE_TYPE type = xd->plane[plane].plane_type;
int err_mult = plane_rd_mult[type];
- int default_eob, pad;
+ int default_eob;
const int16_t *scan, *nb;
const int mul = 1 + (tx_size == TX_32X32);
uint8_t token_cache[1024];
@@ -201,7 +200,7 @@
for (i = 0; i < eob; i++)
token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
qcoeff_ptr[scan[i]]].token];
- nb = vp9_get_coef_neighbors_handle(scan, &pad);
+ nb = vp9_get_coef_neighbors_handle(scan);
for (i = eob; i-- > i0;) {
int base_bits, d2, dx;
@@ -220,8 +219,7 @@
/* Consider both possible successor states. */
if (next < default_eob) {
band = get_coef_band(band_translate, i + 1);
- pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
- pad, default_eob);
+ pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
rate0 +=
mb->token_costs[tx_size][type][ref][0][band][pt]
[tokens[next][0].token];
@@ -273,14 +271,12 @@
if (next < default_eob) {
band = get_coef_band(band_translate, i + 1);
if (t0 != DCT_EOB_TOKEN) {
- pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
- pad, default_eob);
+ pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
rate0 += mb->token_costs[tx_size][type][ref][!x][band][pt]
[tokens[next][0].token];
}
if (t1 != DCT_EOB_TOKEN) {
- pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache,
- pad, default_eob);
+ pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
rate1 += mb->token_costs[tx_size][type][ref][!x][band][pt]
[tokens[next][1].token];
}
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -304,7 +304,7 @@
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
int pt;
int c = 0;
- int cost = 0, pad;
+ int cost = 0;
const int16_t *scan, *nb;
const int eob = xd->plane[plane].eobs[block];
const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
@@ -314,7 +314,7 @@
ENTROPY_CONTEXT above_ec, left_ec;
TX_TYPE tx_type = DCT_DCT;
const int segment_id = xd->mode_info_context->mbmi.segment_id;
- int seg_eob, default_eob;
+ int seg_eob;
uint8_t token_cache[1024];
const uint8_t * band_translate;
@@ -372,8 +372,7 @@
assert(eob <= seg_eob);
pt = combine_entropy_contexts(above_ec, left_ec);
- nb = vp9_get_coef_neighbors_handle(scan, &pad);
- default_eob = seg_eob;
+ nb = vp9_get_coef_neighbors_handle(scan);
if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
seg_eob = 0;
@@ -402,7 +401,7 @@
v = qcoeff_ptr[rc];
t = vp9_dct_value_tokens_ptr[v].token;
- pt = get_coef_context(scan, nb, pad, token_cache, c, default_eob);
+ pt = get_coef_context(nb, token_cache, c);
cost += token_costs[!prev_t][band][pt][t] + vp9_dct_value_cost_ptr[v];
token_cache[rc] = vp9_pt_energy_class[t];
prev_t = t;
@@ -410,7 +409,7 @@
// eob token
if (c < seg_eob) {
- pt = get_coef_context(scan, nb, pad, token_cache, c, default_eob);
+ pt = get_coef_context(nb, token_cache, c);
cost += token_costs[0][get_coef_band(band_translate, c)][pt]
[DCT_EOB_TOKEN];
}
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -123,7 +123,7 @@
const int loff = (off >> mod) << tx_size;
ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff;
ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff;
- int seg_eob, default_eob, pad;
+ int seg_eob;
const int segment_id = mbmi->segment_id;
const int16_t *scan, *nb;
vp9_coeff_count *counts;
@@ -178,8 +178,7 @@
}
pt = combine_entropy_contexts(above_ec, left_ec);
- nb = vp9_get_coef_neighbors_handle(scan, &pad);
- default_eob = seg_eob;
+ nb = vp9_get_coef_neighbors_handle(scan);
if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
seg_eob = 0;
@@ -191,7 +190,7 @@
int v = 0;
rc = scan[c];
if (c)
- pt = get_coef_context(scan, nb, pad, token_cache, c, default_eob);
+ pt = get_coef_context(nb, token_cache, c);
if (c < eob) {
v = qcoeff_ptr[rc];
assert(-DCT_MAX_VALUE <= v && v < DCT_MAX_VALUE);
--
⑨