ref: 1d7754830ec78b9124c4c8be198aa802669675db
parent: acd90b71657b2e8810c71321a057063fbb18fd28
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Fri Oct 12 09:12:06 EDT 2018
Refactor left-edge copying to reduce data copies by 50%. Also copy 4 pixels per row (instead of 3) so SIMD can use a padded write (movd).
--- a/src/looprestoration.c
+++ b/src/looprestoration.c
@@ -40,6 +40,7 @@
// TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
// TODO Chroma only requires 2 rows of padding.
static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
+ const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
{
@@ -84,7 +85,7 @@
// Inner UNIT_WxSTRIPE_H
for (int j = 0; j < stripe_h; j++) {
- pixel_copy(dst_tl, p, unit_w);
+ pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
dst_tl += REST_UNIT_STRIDE;
p += PXSTRIDE(p_stride);
}
@@ -107,6 +108,12 @@
dst += REST_UNIT_STRIDE;
dst_l += REST_UNIT_STRIDE;
}
+ } else {
+ dst += 3 * REST_UNIT_STRIDE;
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst, &left[j][1], 3);
+ dst += REST_UNIT_STRIDE;
+ }
}
}
@@ -115,6 +122,7 @@
// FIXME Could implement a version that requires less temporary memory
// (should be possible to implement with only 6 rows of temp storage)
static void wiener_c(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
const int16_t filterh[7], const int16_t filterv[7],
@@ -125,7 +133,7 @@
pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
pixel *tmp_ptr = tmp;
- padding(tmp, p, p_stride, lpf, lpf_stride, w, h, edges);
+ padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
// Values stored between horizontal and vertical filtering don't
// fit in a uint8_t.
@@ -489,6 +497,7 @@
}
static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int sgr_idx,
const int16_t sgr_w[2], const enum LrEdgeFlags edges)
@@ -497,7 +506,7 @@
// of padding above and below
pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
- padding(tmp, p, p_stride, lpf, lpf_stride, w, h, edges);
+ padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
// Selfguided filter outputs to a maximum stripe height of 64 and a
// maximum restoration width of 384 (256 * 1.5)
--- a/src/looprestoration.h
+++ b/src/looprestoration.h
@@ -45,11 +45,13 @@
// * w is constrained by the restoration unit size (w <= 256)
// * h is constrained by the stripe height (h <= 64)
typedef void (*wienerfilter_fn)(pixel *dst, ptrdiff_t dst_stride,
+ const void *left /*const pixel (*left)[4]*/,
const pixel *lpf, ptrdiff_t lpf_stride,
int w, int h, const int16_t filterh[7],
const int16_t filterv[7], enum LrEdgeFlags edges);
typedef void (*selfguided_fn)(pixel *dst, ptrdiff_t dst_stride,
+ const void *left /*const pixel (*left)[4]*/,
const pixel *lpf, ptrdiff_t lpf_stride,
int w, int h, int sgr_idx, const int16_t sgr_w[2],
const enum LrEdgeFlags edges);
--- a/src/lr_apply.c
+++ b/src/lr_apply.c
@@ -124,7 +124,8 @@
}
-static void lr_stripe(const Dav1dFrameContext *const f, pixel *p, int x, int y,
+static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
+ const pixel (*left)[4], int x, int y,
const int plane, const int unit_w, const int row_h,
const Av1RestorationUnit *const lr, enum LrEdgeFlags edges)
{
@@ -161,14 +162,15 @@
edges |= LR_HAVE_BOTTOM;
}
if (lr->type == RESTORATION_WIENER) {
- dsp->lr.wiener(p, p_stride, lpf, lpf_stride, unit_w, stripe_h,
+ dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
filterh, filterv, edges);
} else {
assert(lr->type == RESTORATION_SGRPROJ);
- dsp->lr.selfguided(p, p_stride, lpf, lpf_stride, unit_w, stripe_h,
+ dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
lr->sgr_idx, lr->sgr_weights, edges);
}
+ left += stripe_h;
y += stripe_h;
if (y + stripe_h > row_h && sbrow_has_bottom) break;
p += stripe_h * PXSTRIDE(p_stride);
@@ -179,20 +181,13 @@
}
}
-static void backup3xU(pixel *dst, const pixel *src, const ptrdiff_t src_stride,
+static void backup4xU(pixel (*dst)[4], const pixel *src, const ptrdiff_t src_stride,
int u)
{
- for (; u > 0; u--, dst += 3, src += PXSTRIDE(src_stride))
- pixel_copy(dst, src, 3);
+ for (; u > 0; u--, dst++, src += PXSTRIDE(src_stride))
+ pixel_copy(dst, src, 4);
}
-static void restore3xU(pixel *dst, const ptrdiff_t dst_stride, const pixel *src,
- int u)
-{
- for (; u > 0; u--, dst += PXSTRIDE(dst_stride), src += 3)
- pixel_copy(dst, src, 3);
-}
-
static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
const int w, const int h, const int row_h, const int plane)
{
@@ -227,15 +222,14 @@
const int filter_h =
imin(((1 << (6 + f->seq_hdr.sb128)) - 8 * !y) >> ss_ver, h - y);
- pixel pre_lr_border[128 /* maximum sbrow height is 128 */ * 3];
- pixel post_lr_border[128 /* maximum sbrow height is 128 */ * 3];
+ pixel pre_lr_border[2][128 /* maximum sbrow height is 128 */][4];
- int unit_w = unit_size;
+ int unit_w = unit_size, bit = 0;
enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) |
(row_h < h ? LR_HAVE_BOTTOM : 0);
- for (int x = 0, rux = 0; x < w; x+= unit_w, rux++, edges |= LR_HAVE_LEFT) {
+ for (int x = 0, rux = 0; x < w; x+= unit_w, rux++, edges |= LR_HAVE_LEFT, bit ^= 1) {
// TODO Clean up this if statement.
if (x + max_unit_size > w) {
unit_w = w - x;
@@ -251,22 +245,13 @@
&f->lf.mask[(((ruy << (unit_size_log2)) >> shift_ver) * f->sb128w) +
(x >> shift_hor)].lr[plane][unit_idx];
- if (edges & LR_HAVE_LEFT) {
- restore3xU(p - 3, p_stride, pre_lr_border, filter_h);
- }
// FIXME Don't backup if the next restoration unit is RESTORE_NONE
// This also requires not restoring in the same conditions.
if (edges & LR_HAVE_RIGHT) {
- backup3xU(pre_lr_border, p + unit_w - 3, p_stride, filter_h);
+ backup4xU(pre_lr_border[bit], p + unit_w - 4, p_stride, filter_h);
}
if (lr->type != RESTORATION_NONE) {
- lr_stripe(f, p, x, y, plane, unit_w, row_h, lr, edges);
- }
- if (edges & LR_HAVE_LEFT) {
- restore3xU(p - 3, p_stride, post_lr_border, filter_h);
- }
- if (edges & LR_HAVE_RIGHT) {
- backup3xU(post_lr_border, p + unit_w - 3, p_stride, filter_h);
+ lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr, edges);
}
p += unit_w;
}