shithub: dav1d

Download patch

ref: e413c8ed877cf4f09004fc07171d9dc4339293df
parent: 9057d286bedbbf9e3b73e0e3d7e056c4cd149fd7
author: Luc Trudeau <ltrudeau@twoorioles.com>
date: Wed Oct 21 08:08:15 EDT 2020

Combine boxsum and boxsumsqr in SGR C code

Makes the C code more closely resemble the ASM

--- a/src/looprestoration_tmpl.c
+++ b/src/looprestoration_tmpl.c
@@ -208,15 +208,19 @@
 // i: Pixel summed and stored (between loops)
 // c: Pixel summed not stored
 // x: Pixel not summed not stored
-static void boxsum3(coef *dst, const pixel *src, const int w, const int h) {
+static void boxsum3(int32_t *sumsq, coef *sum, const pixel *src,
+                    const int w, const int h)
+{
     // We skip the first row, as it is never used
     src += REST_UNIT_STRIDE;
 
     // We skip the first and last columns, as they are never used
     for (int x = 1; x < w - 1; x++) {
-        coef *ds = dst + x;
+        coef *sum_v = sum + x;
+        int32_t *sumsq_v = sumsq + x;
         const pixel *s = src + x;
-        int a = s[0], b = s[REST_UNIT_STRIDE];
+        int a = s[0], a2 = a * a;
+        int b = s[REST_UNIT_STRIDE], b2 = b * b;
 
         // We skip the first 2 rows, as they are skipped in the next loop and
         // we don't need the last 2 row as it is skipped in the next loop
@@ -223,28 +227,39 @@
         for (int y = 2; y < h - 2; y++) {
             s += REST_UNIT_STRIDE;
             const int c = s[REST_UNIT_STRIDE];
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c;
+            const int c2 = c * c;
+            sum_v += REST_UNIT_STRIDE;
+            sumsq_v += REST_UNIT_STRIDE;
+            *sum_v = a + b + c;
+            *sumsq_v = a2 + b2 + c2;
             a = b;
+            a2 = b2;
             b = c;
+            b2 = c2;
         }
      }
 
     // We skip the first row as it is never read
-    dst += REST_UNIT_STRIDE;
+    sum += REST_UNIT_STRIDE;
+    sumsq += REST_UNIT_STRIDE;
     // We skip the last 2 rows as it is never read
     for (int y = 2; y < h - 2; y++) {
-        int a = dst[1], b = dst[2];
+        int a = sum[1], a2 = sumsq[1];
+        int b = sum[2], b2 = sumsq[2];
 
         // We don't store the first column as it is never read and
         // we don't store the last 2 columns as they are never read
         for (int x = 2; x < w - 2; x++) {
-            const int c = dst[x + 1];
-            dst[x] = a + b + c;
+            const int c = sum[x + 1], c2 = sumsq[x + 1];
+            sum[x] = a + b + c;
+            sumsq[x] = a2 + b2 + c2;
             a = b;
+            a2 = b2;
             b = c;
+            b2 = c2;
         }
-        dst += REST_UNIT_STRIDE;
+        sum += REST_UNIT_STRIDE;
+        sumsq += REST_UNIT_STRIDE;
     }
 }
 
@@ -270,134 +285,62 @@
 // i: Pixel summed and stored (between loops)
 // c: Pixel summed not stored
 // x: Pixel not summed not stored
-static void boxsum5(coef *dst, const pixel *const src, const int w, const int h) {
-    for (int x = 0; x < w; x++) {
-        coef *ds = dst + x;
-        const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
-        int a = s[-3 * REST_UNIT_STRIDE];
-        int b = s[-2 * REST_UNIT_STRIDE];
-        int c = s[-1 * REST_UNIT_STRIDE];
-        int d = s[0];
-
-        // We skip the first 2 rows, as they are skipped in the next loop and
-        // we don't need the last 2 row as it is skipped in the next loop
-        for (int y = 2; y < h - 2; y++) {
-            s += REST_UNIT_STRIDE;
-            const int e = *s;
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c + d + e;
-            a = b;
-            b = c;
-            c = d;
-            d = e;
-        }
-    }
-
-    // We skip the first row as it is never read
-    dst += REST_UNIT_STRIDE;
-    for (int y = 2; y < h - 2; y++) {
-        int a = dst[0];
-        int b = dst[1];
-        int c = dst[2];
-        int d = dst[3];
-
-        for (int x = 2; x < w - 2; x++) {
-            const int e = dst[x + 2];
-            dst[x] = a + b + c + d + e;
-            a = b;
-            b = c;
-            c = d;
-            d = e;
-        }
-        dst += REST_UNIT_STRIDE;
-    }
-}
-
-// See boxsum3 function comments for details on row and column skipping
-static void boxsum3sqr(int32_t *dst, const pixel *src, const int w, const int h) {
-    // We skip the first row, as it is never used
-    src += REST_UNIT_STRIDE;
-
-    // We skip the first and last columns, as they are never used
-    for (int x = 1; x < w - 1; x++) {
-        int32_t *ds = dst + x;
-        const pixel *s = src + x;
-        int a = s[0] * s[0];
-        int b = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
-
-        // We skip the first row, as it is skipped in the next loop and
-        // we don't need the last row as it is skipped in the next loop
-        for (int y = 2; y < h - 2; y++) {
-            s += REST_UNIT_STRIDE;
-            const int c = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c;
-            a = b;
-            b = c;
-        }
-     }
-
-    // We skip the first row as it is never read
-    dst += REST_UNIT_STRIDE;
-    // We skip the last row as it is never read
-    for (int y = 2; y < h - 2; y++) {
-        int a = dst[1], b = dst[2];
-
-        // We don't store the first column as it is never read and
-        // we don't store the last 2 columns as they are never read
-        for (int x = 2; x < w - 2; x++) {
-            const int c = dst[x + 1];
-            dst[x] = a + b + c;
-            a = b;
-            b = c;
-        }
-        dst += REST_UNIT_STRIDE;
-    }
-}
-
-// See boxsum5 function comments for details on row and column skipping
-static void boxsum5sqr(int32_t *dst, const pixel *const src, const int w,
-                       const int h)
+static void boxsum5(int32_t *sumsq, coef *sum, const pixel *const src,
+                    const int w, const int h)
 {
     for (int x = 0; x < w; x++) {
-        int32_t *ds = dst + x;
+        coef *sum_v = sum + x;
+        int32_t *sumsq_v = sumsq + x;
         const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
-        int a = s[-3 * REST_UNIT_STRIDE] * s[-3 * REST_UNIT_STRIDE];
-        int b = s[-2 * REST_UNIT_STRIDE] * s[-2 * REST_UNIT_STRIDE];
-        int c = s[-1 * REST_UNIT_STRIDE] * s[-1 * REST_UNIT_STRIDE];
-        int d = s[0] * s[0];
+        int a = s[-3 * REST_UNIT_STRIDE], a2 = a * a;
+        int b = s[-2 * REST_UNIT_STRIDE], b2 = b * b;
+        int c = s[-1 * REST_UNIT_STRIDE], c2 = c * c;
+        int d = s[0], d2 = d * d;
 
         // We skip the first 2 rows, as they are skipped in the next loop and
         // we don't need the last 2 row as it is skipped in the next loop
         for (int y = 2; y < h - 2; y++) {
             s += REST_UNIT_STRIDE;
-            const int e = s[0] * s[0];
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c + d + e;
+            const int e = *s, e2 = e * e;
+            sum_v += REST_UNIT_STRIDE;
+            sumsq_v += REST_UNIT_STRIDE;
+            *sum_v = a + b + c + d + e;
+            *sumsq_v = a2 + b2 + c2 + d2 + e2;
             a = b;
             b = c;
             c = d;
             d = e;
+            a2 = b2;
+            b2 = c2;
+            c2 = d2;
+            d2 = e2;
         }
     }
 
     // We skip the first row as it is never read
-    dst += REST_UNIT_STRIDE;
+    sum += REST_UNIT_STRIDE;
+    sumsq += REST_UNIT_STRIDE;
     for (int y = 2; y < h - 2; y++) {
-        int a = dst[0];
-        int b = dst[1];
-        int c = dst[2];
-        int d = dst[3];
+        int a = sum[0], a2 = sumsq[0];
+        int b = sum[1], b2 = sumsq[1];
+        int c = sum[2], c2 = sumsq[2];
+        int d = sum[3], d2 = sumsq[3];
 
         for (int x = 2; x < w - 2; x++) {
-            const int e = dst[x + 2];
-            dst[x] = a + b + c + d + e;
+            const int e = sum[x + 2], e2 = sumsq[x + 2];
+            sum[x] = a + b + c + d + e;
+            sumsq[x] = a2 + b2 + c2 + d2 + e2;
             a = b;
             b = c;
             c = d;
             d = e;
+            a2 = b2;
+            b2 = c2;
+            c2 = d2;
+            d2 = e2;
         }
-        dst += REST_UNIT_STRIDE;
+        sum += REST_UNIT_STRIDE;
+        sumsq += REST_UNIT_STRIDE;
     }
 }
 
@@ -410,21 +353,18 @@
 
     // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
     // of padding above and below
-    int32_t A_[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE];
-    int32_t *A = A_ + 2 * REST_UNIT_STRIDE + 3;
+    int32_t sumsq[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE];
+    int32_t *A = sumsq + 2 * REST_UNIT_STRIDE + 3;
     // By inverting A and B after the boxsums, B can be of size coef instead
     // of int32_t
-    coef B_[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE];
-    coef *B = B_ + 2 * REST_UNIT_STRIDE + 3;
+    coef sum[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE];
+    coef *B = sum + 2 * REST_UNIT_STRIDE + 3;
 
     const int step = (n == 25) + 1;
-    if (n == 25) {
-        boxsum5(B_, src, w + 6, h + 6);
-        boxsum5sqr(A_, src, w + 6, h + 6);
-    } else {
-        boxsum3(B_, src, w + 6, h + 6);
-        boxsum3sqr(A_, src, w + 6, h + 6);
-    }
+    if (n == 25)
+        boxsum5(sumsq, sum, src, w + 6, h + 6);
+    else
+        boxsum3(sumsq, sum, src, w + 6, h + 6);
     const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
 
     int32_t *AA = A - REST_UNIT_STRIDE;