ref: fdf780f72a77545e64736019732f38638c84da5b
parent: c6e66595a8f553f348f19b60b8326aac536f00ad
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Tue Oct 23 06:06:08 EDT 2018
Rewrite cdef_dir C code
--- a/src/cdef_tmpl.c
+++ b/src/cdef_tmpl.c
@@ -201,94 +201,72 @@
cdef_fn(4, 8);
cdef_fn(8, 8);
-/*
- * <code copied from libaom>
- */
-
-/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
- The search minimizes the weighted variance along all the lines in a
- particular direction, i.e. the squared error between the input and a
- "predicted" block where each pixel is replaced by the average along a line
- in a particular direction. Since each direction have the same sum(x^2) term,
- that term is never computed. See Section 2, step 2, of:
- http://jmvalin.ca/notes/intra_paint.pdf */
-static const uint16_t div_table[] = {
- 0, 840, 420, 280, 210, 168, 140, 120, 105
-};
static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,
unsigned *const var)
{
- int i;
- int32_t cost[8] = { 0 };
- int partial[8][15] = { { 0 } };
- int32_t best_cost = 0;
- int best_dir = 0;
- /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
- The output is then 840 times larger, but we don't care for finding
- the max. */
- for (i = 0; i < 8; i++) {
- int j;
- for (j = 0; j < 8; j++) {
- int x;
- /* We subtract 128 here to reduce the maximum range of the squared
- partial sums. */
- x = (img[i * PXSTRIDE(stride) + j] >> (BITDEPTH - 8)) - 128;
- partial[0][i + j] += x;
- partial[1][i + j / 2] += x;
- partial[2][i] += x;
- partial[3][3 + i - j / 2] += x;
- partial[4][7 + i - j] += x;
- partial[5][3 - i / 2 + j] += x;
- partial[6][j] += x;
- partial[7][i / 2 + j] += x;
+ int partial_sum_hv[2][8] = { { 0 } };
+ int partial_sum_diag[2][15] = { { 0 } };
+ int partial_sum_alt[4][11] = { { 0 } };
+
+ for (int y = 0; y < 8; y++) {
+ for (int x = 0; x < 8; x++) {
+ const int px = (img[x] >> (BITDEPTH - 8)) - 128;
+
+ partial_sum_diag[0][ y + x ] += px;
+ partial_sum_alt [0][ y + (x >> 1)] += px;
+ partial_sum_hv [0][ y ] += px;
+ partial_sum_alt [1][3 + y - (x >> 1)] += px;
+ partial_sum_diag[1][7 + y - x ] += px;
+ partial_sum_alt [2][3 - (y >> 1) + x ] += px;
+ partial_sum_hv [1][ x ] += px;
+ partial_sum_alt [3][ (y >> 1) + x ] += px;
}
+ img += PXSTRIDE(stride);
}
- for (i = 0; i < 8; i++) {
- cost[2] += partial[2][i] * partial[2][i];
- cost[6] += partial[6][i] * partial[6][i];
+
+ unsigned cost[8] = { 0 };
+ for (int n = 0; n < 8; n++) {
+ cost[2] += partial_sum_hv[0][n] * partial_sum_hv[0][n];
+ cost[6] += partial_sum_hv[1][n] * partial_sum_hv[1][n];
}
- cost[2] *= div_table[8];
- cost[6] *= div_table[8];
- for (i = 0; i < 7; i++) {
- cost[0] += (partial[0][i] * partial[0][i] +
- partial[0][14 - i] * partial[0][14 - i]) *
- div_table[i + 1];
- cost[4] += (partial[4][i] * partial[4][i] +
- partial[4][14 - i] * partial[4][14 - i]) *
- div_table[i + 1];
+ cost[2] *= 105;
+ cost[6] *= 105;
+
+ static const uint16_t div_table[7] = { 840, 420, 280, 210, 168, 140, 120 };
+ for (int n = 0; n < 7; n++) {
+ const int d = div_table[n];
+ cost[0] += (partial_sum_diag[0][n] * partial_sum_diag[0][n] +
+ partial_sum_diag[0][14 - n] * partial_sum_diag[0][14 - n]) * d;
+ cost[4] += (partial_sum_diag[1][n] * partial_sum_diag[1][n] +
+ partial_sum_diag[1][14 - n] * partial_sum_diag[1][14 - n]) * d;
}
- cost[0] += partial[0][7] * partial[0][7] * div_table[8];
- cost[4] += partial[4][7] * partial[4][7] * div_table[8];
- for (i = 1; i < 8; i += 2) {
- int j;
- for (j = 0; j < 4 + 1; j++) {
- cost[i] += partial[i][3 + j] * partial[i][3 + j];
+ cost[0] += partial_sum_diag[0][7] * partial_sum_diag[0][7] * 105;
+ cost[4] += partial_sum_diag[1][7] * partial_sum_diag[1][7] * 105;
+
+ for (int n = 0; n < 4; n++) {
+ unsigned *const cost_ptr = &cost[n * 2 + 1];
+ for (int m = 0; m < 5; m++)
+ *cost_ptr += partial_sum_alt[n][3 + m] * partial_sum_alt[n][3 + m];
+ *cost_ptr *= 105;
+ for (int m = 0; m < 3; m++) {
+ const int d = div_table[2 * m + 1];
+ *cost_ptr += (partial_sum_alt[n][m] * partial_sum_alt[n][m] +
+ partial_sum_alt[n][10 - m] * partial_sum_alt[n][10 - m]) * d;
}
- cost[i] *= div_table[8];
- for (j = 0; j < 4 - 1; j++) {
- cost[i] += (partial[i][j] * partial[i][j] +
- partial[i][10 - j] * partial[i][10 - j]) *
- div_table[2 * j + 2];
- }
}
- for (i = 0; i < 8; i++) {
- if (cost[i] > best_cost) {
- best_cost = cost[i];
- best_dir = i;
+
+ int best_dir = 0;
+ unsigned best_cost = cost[0];
+ for (int n = 1; n < 8; n++) {
+ if (cost[n] > best_cost) {
+ best_cost = cost[n];
+ best_dir = n;
}
}
- /* Difference between the optimal variance and the variance along the
- orthogonal direction. Again, the sum(x^2) terms cancel out. */
- *var = best_cost - cost[(best_dir + 4) & 7];
- /* We'd normally divide by 840, but dividing by 1024 is close enough
- for what we're going to do with this. */
- *var >>= 10;
+
+ *var = (best_cost - (cost[best_dir ^ 4])) >> 10;
return best_dir;
}
-
-/*
- * </code copied from libaom>
- */
void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
c->dir = cdef_find_dir_c;