shithub: dav1d

Download patch

ref: 1d7754830ec78b9124c4c8be198aa802669675db
parent: acd90b71657b2e8810c71321a057063fbb18fd28
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Fri Oct 12 09:12:06 EDT 2018

Refactor left edge copying to reduce data copies by 50%.

Also back up 4 pixels per row instead of 3, so SIMD can use a padded write (movd).

--- a/src/looprestoration.c
+++ b/src/looprestoration.c
@@ -40,6 +40,7 @@
 // TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
 // TODO Chroma only requires 2 rows of padding.
 static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
+                    const pixel (*left)[4],
                     const pixel *lpf, const ptrdiff_t lpf_stride,
                     int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
 {
@@ -84,7 +85,7 @@
 
     // Inner UNIT_WxSTRIPE_H
     for (int j = 0; j < stripe_h; j++) {
-        pixel_copy(dst_tl, p, unit_w);
+        pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
         dst_tl += REST_UNIT_STRIDE;
         p += PXSTRIDE(p_stride);
     }
@@ -107,6 +108,12 @@
             dst += REST_UNIT_STRIDE;
             dst_l += REST_UNIT_STRIDE;
         }
+    } else {
+        dst += 3 * REST_UNIT_STRIDE;
+        for (int j = 0; j < stripe_h; j++) {
+            pixel_copy(dst, &left[j][1], 3);
+            dst += REST_UNIT_STRIDE;
+        }
     }
 }
 
@@ -115,6 +122,7 @@
 // FIXME Could implement a version that requires less temporary memory
 // (should be possible to implement with only 6 rows of temp storage)
 static void wiener_c(pixel *p, const ptrdiff_t p_stride,
+                     const pixel (*const left)[4],
                      const pixel *lpf, const ptrdiff_t lpf_stride,
                      const int w, const int h,
                      const int16_t filterh[7], const int16_t filterv[7],
@@ -125,7 +133,7 @@
     pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
     pixel *tmp_ptr = tmp;
 
-    padding(tmp, p, p_stride, lpf, lpf_stride, w, h, edges);
+    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
 
     // Values stored between horizontal and vertical filtering don't
     // fit in a uint8_t.
@@ -489,6 +497,7 @@
 }
 
 static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
+                         const pixel (*const left)[4],
                          const pixel *lpf, const ptrdiff_t lpf_stride,
                          const int w, const int h, const int sgr_idx,
                          const int16_t sgr_w[2], const enum LrEdgeFlags edges)
@@ -497,7 +506,7 @@
     // of padding above and below
     pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
 
-    padding(tmp, p, p_stride, lpf, lpf_stride, w, h, edges);
+    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
 
     // Selfguided filter outputs to a maximum stripe height of 64 and a
     // maximum restoration width of 384 (256 * 1.5)
--- a/src/looprestoration.h
+++ b/src/looprestoration.h
@@ -45,11 +45,13 @@
 //    * w is constrained by the restoration unit size (w <= 256)
 //    * h is constrained by the stripe height (h <= 64)
 typedef void (*wienerfilter_fn)(pixel *dst, ptrdiff_t dst_stride,
+                                const void *left /*const pixel (*left)[4]*/,
                                 const pixel *lpf, ptrdiff_t lpf_stride,
                                 int w, int h, const int16_t filterh[7],
                                 const int16_t filterv[7], enum LrEdgeFlags edges);
 
 typedef void (*selfguided_fn)(pixel *dst, ptrdiff_t dst_stride,
+                              const void *left /*const pixel (*left)[4]*/,
                               const pixel *lpf, ptrdiff_t lpf_stride,
                               int w, int h, int sgr_idx, const int16_t sgr_w[2],
                               const enum LrEdgeFlags edges);
--- a/src/lr_apply.c
+++ b/src/lr_apply.c
@@ -124,7 +124,8 @@
 }
 
 
-static void lr_stripe(const Dav1dFrameContext *const f, pixel *p, int x, int y,
+static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
+                      const pixel (*left)[4], int x, int y,
                       const int plane, const int unit_w, const int row_h,
                       const Av1RestorationUnit *const lr, enum LrEdgeFlags edges)
 {
@@ -161,14 +162,15 @@
             edges |= LR_HAVE_BOTTOM;
         }
         if (lr->type == RESTORATION_WIENER) {
-            dsp->lr.wiener(p, p_stride, lpf, lpf_stride, unit_w, stripe_h,
+            dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
                            filterh, filterv, edges);
         } else {
             assert(lr->type == RESTORATION_SGRPROJ);
-            dsp->lr.selfguided(p, p_stride, lpf, lpf_stride, unit_w, stripe_h,
+            dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
                                lr->sgr_idx, lr->sgr_weights, edges);
         }
 
+        left += stripe_h;
         y += stripe_h;
         if (y + stripe_h > row_h && sbrow_has_bottom) break;
         p += stripe_h * PXSTRIDE(p_stride);
@@ -179,20 +181,13 @@
     }
 }
 
-static void backup3xU(pixel *dst, const pixel *src, const ptrdiff_t src_stride,
+static void backup4xU(pixel (*dst)[4], const pixel *src, const ptrdiff_t src_stride,
                       int u)
 {
-    for (; u > 0; u--, dst += 3, src += PXSTRIDE(src_stride))
-        pixel_copy(dst, src, 3);
+    for (; u > 0; u--, dst++, src += PXSTRIDE(src_stride))
+        pixel_copy(dst, src, 4);
 }
 
-static void restore3xU(pixel *dst, const ptrdiff_t dst_stride, const pixel *src,
-                       int u)
-{
-    for (; u > 0; u--, dst += PXSTRIDE(dst_stride), src += 3)
-        pixel_copy(dst, src, 3);
-}
-
 static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
                      const int w, const int h, const int row_h, const int plane)
 {
@@ -227,15 +222,14 @@
     const int filter_h =
         imin(((1 << (6 + f->seq_hdr.sb128)) - 8 * !y) >> ss_ver, h - y);
 
-    pixel pre_lr_border[128 /* maximum sbrow height is 128 */ * 3];
-    pixel post_lr_border[128 /* maximum sbrow height is 128 */ * 3];
+    pixel pre_lr_border[2][128 /* maximum sbrow height is 128 */][4];
 
-    int unit_w = unit_size;
+    int unit_w = unit_size, bit = 0;
 
     enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) |
                              (row_h < h ? LR_HAVE_BOTTOM : 0);
 
-    for (int x = 0, rux = 0; x < w; x+= unit_w, rux++, edges |= LR_HAVE_LEFT) {
+    for (int x = 0, rux = 0; x < w; x+= unit_w, rux++, edges |= LR_HAVE_LEFT, bit ^= 1) {
         // TODO Clean up this if statement.
         if (x + max_unit_size > w) {
             unit_w = w - x;
@@ -251,22 +245,13 @@
             &f->lf.mask[(((ruy << (unit_size_log2)) >> shift_ver) * f->sb128w) +
                         (x >> shift_hor)].lr[plane][unit_idx];
 
-        if (edges & LR_HAVE_LEFT) {
-            restore3xU(p - 3, p_stride, pre_lr_border, filter_h);
-        }
         // FIXME Don't backup if the next restoration unit is RESTORE_NONE
         // This also requires not restoring in the same conditions.
         if (edges & LR_HAVE_RIGHT) {
-            backup3xU(pre_lr_border, p + unit_w - 3, p_stride, filter_h);
+            backup4xU(pre_lr_border[bit], p + unit_w - 4, p_stride, filter_h);
         }
         if (lr->type != RESTORATION_NONE) {
-            lr_stripe(f, p, x, y, plane, unit_w, row_h, lr, edges);
-        }
-        if (edges & LR_HAVE_LEFT) {
-            restore3xU(p - 3, p_stride, post_lr_border, filter_h);
-        }
-        if (edges & LR_HAVE_RIGHT) {
-            backup3xU(post_lr_border, p + unit_w - 3, p_stride, filter_h);
+            lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr, edges);
         }
         p += unit_w;
     }