shithub: dav1d

Download patch

ref: eaaf2218689d5942a050f9f9081ec473becb77e0
parent: fad6db2053b69b3dff9d1f9bd9ee68a7a2080dc2
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Jan 27 12:42:14 EST 2020

Optimize the cdef_filter C implementation

The main feature is splitting the main filter code into three
different code paths depending on the strength values.

Clipping is only required when both the primary and secondary
strengths are non-zero, which is an uncommon case. Being able to
skip that complexity in the common cases is significantly faster.

--- a/src/cdef_tmpl.c
+++ b/src/cdef_tmpl.c
@@ -32,27 +32,28 @@
 #include "common/intops.h"
 
 #include "src/cdef.h"
+#include "src/tables.h"
 
 static inline int constrain(const int diff, const int threshold,
-                            const int damping)
+                            const int shift)
 {
-    if (!threshold) return 0;
-    const int shift = imax(0, damping - ulog2(threshold));
-    return apply_sign(imin(abs(diff), imax(0, threshold - (abs(diff) >> shift))),
-                      diff);
+    const int adiff = abs(diff);
+    return apply_sign(imin(adiff, imax(0, threshold - (adiff >> shift))), diff);
 }
 
-static inline void fill(uint16_t *tmp, const ptrdiff_t stride,
+static inline void fill(int16_t *tmp, const ptrdiff_t stride,
                         const int w, const int h)
 {
+    /* Use a value that's a large positive number when interpreted as unsigned,
+     * and a large negative number when interpreted as signed. */
     for (int y = 0; y < h; y++) {
         for (int x = 0; x < w; x++)
-            tmp[x] = INT16_MAX;
+            tmp[x] = INT16_MIN;
         tmp += stride;
     }
 }
 
-static void padding(uint16_t *tmp, const ptrdiff_t tmp_stride,
+static void padding(int16_t *tmp, const ptrdiff_t tmp_stride,
                     const pixel *src, const ptrdiff_t src_stride,
                     const pixel (*left)[2], pixel *const top[2],
                     const int w, const int h,
@@ -94,74 +95,112 @@
 static NOINLINE void
 cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
                     const pixel (*left)[2], /*const*/ pixel *const top[2],
-                    const int w, const int h, const int pri_strength,
-                    const int sec_strength, const int dir,
-                    const int damping, const enum CdefEdgeFlags edges
-                    HIGHBD_DECL_SUFFIX)
+                    const int pri_strength, const int sec_strength,
+                    const int dir, const int damping, const int w, int h,
+                    const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
 {
-    static const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
-        { -1 * 12 + 1, -2 * 12 + 2 },
-        {  0 * 12 + 1, -1 * 12 + 2 },
-        {  0 * 12 + 1,  0 * 12 + 2 },
-        {  0 * 12 + 1,  1 * 12 + 2 },
-        {  1 * 12 + 1,  2 * 12 + 2 },
-        {  1 * 12 + 0,  2 * 12 + 1 },
-        {  1 * 12 + 0,  2 * 12 + 0 },
-        {  1 * 12 + 0,  2 * 12 - 1 }
-    };
     const ptrdiff_t tmp_stride = 12;
     assert((w == 4 || w == 8) && (h == 4 || h == 8));
-    uint16_t tmp_buf[144];  // 12*12 is the maximum value of tmp_stride * (h + 4)
-    uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2;
-    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
-    const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
+    int16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4)
+    int16_t *tmp = tmp_buf + 2 * tmp_stride + 2;
 
     padding(tmp, tmp_stride, dst, dst_stride, left, top, w, h, edges);
 
-    // run actual filter
-    for (int y = 0; y < h; y++) {
-        for (int x = 0; x < w; x++) {
-            int sum = 0;
-            const int px = dst[x];
-            int max = px, min = px;
-            int pri_tap_k = pri_tap;
-            for (int k = 0; k < 2; k++) {
-                const int off1 = cdef_directions[dir][k];
-                const int p0 = tmp[x + off1];
-                const int p1 = tmp[x - off1];
-                sum += pri_tap_k * constrain(p0 - px, pri_strength, damping);
-                sum += pri_tap_k * constrain(p1 - px, pri_strength, damping);
-                // if pri_tap_k == 4 then it becomes 2 else it remains 3
-                pri_tap_k -= (pri_tap_k << 1) - 6;
-                if (p0 != INT16_MAX) max = imax(p0, max);
-                if (p1 != INT16_MAX) max = imax(p1, max);
-                min = imin(p0, min);
-                min = imin(p1, min);
-                const int off2 = cdef_directions[(dir + 2) & 7][k];
-                const int s0 = tmp[x + off2];
-                const int s1 = tmp[x - off2];
-                const int off3 = cdef_directions[(dir + 6) & 7][k];
-                const int s2 = tmp[x + off3];
-                const int s3 = tmp[x - off3];
-                if (s0 != INT16_MAX) max = imax(s0, max);
-                if (s1 != INT16_MAX) max = imax(s1, max);
-                if (s2 != INT16_MAX) max = imax(s2, max);
-                if (s3 != INT16_MAX) max = imax(s3, max);
-                min = imin(s0, min);
-                min = imin(s1, min);
-                min = imin(s2, min);
-                min = imin(s3, min);
-                // sec_tap starts at 2 and becomes 1
-                const int sec_tap = 2 - k;
-                sum += sec_tap * constrain(s0 - px, sec_strength, damping);
-                sum += sec_tap * constrain(s1 - px, sec_strength, damping);
-                sum += sec_tap * constrain(s2 - px, sec_strength, damping);
-                sum += sec_tap * constrain(s3 - px, sec_strength, damping);
-            }
-            dst[x] = iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);
+    if (pri_strength) {
+        const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+        const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
+        const int pri_shift = imax(0, damping - ulog2(pri_strength));
+        if (sec_strength) {
+            const int sec_shift = imax(0, damping - ulog2(sec_strength));
+            do {
+                for (int x = 0; x < w; x++) {
+                    const int px = dst[x];
+                    int sum = 0;
+                    int max = px, min = px;
+                    int pri_tap_k = pri_tap;
+                    for (int k = 0; k < 2; k++) {
+                        const int off1 = dav1d_cdef_directions[dir + 2][k]; // dir
+                        const int p0 = tmp[x + off1];
+                        const int p1 = tmp[x - off1];
+                        sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
+                        sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
+                        // if pri_tap_k == 4 then it becomes 2 else it remains 3
+                        pri_tap_k = (pri_tap_k & 3) | 2;
+                        min = umin(p0, min);
+                        max = imax(p0, max);
+                        min = umin(p1, min);
+                        max = imax(p1, max);
+                        const int off2 = dav1d_cdef_directions[dir + 4][k]; // dir + 2
+                        const int off3 = dav1d_cdef_directions[dir + 0][k]; // dir - 2
+                        const int s0 = tmp[x + off2];
+                        const int s1 = tmp[x - off2];
+                        const int s2 = tmp[x + off3];
+                        const int s3 = tmp[x - off3];
+                        // sec_tap starts at 2 and becomes 1
+                        const int sec_tap = 2 - k;
+                        sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
+                        sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
+                        sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
+                        sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
+                        min = umin(s0, min);
+                        max = imax(s0, max);
+                        min = umin(s1, min);
+                        max = imax(s1, max);
+                        min = umin(s2, min);
+                        max = imax(s2, max);
+                        min = umin(s3, min);
+                        max = imax(s3, max);
+                    }
+                    dst[x] = iclip(px + ((sum - (sum < 0) + 8) >> 4), min, max);
+                }
+                dst += PXSTRIDE(dst_stride);
+                tmp += tmp_stride;
+            } while (--h);
+        } else { // pri_strength only
+            do {
+                for (int x = 0; x < w; x++) {
+                    const int px = dst[x];
+                    int sum = 0;
+                    int pri_tap_k = pri_tap;
+                    for (int k = 0; k < 2; k++) {
+                        const int off = dav1d_cdef_directions[dir + 2][k]; // dir
+                        const int p0 = tmp[x + off];
+                        const int p1 = tmp[x - off];
+                        sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
+                        sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
+                        pri_tap_k = (pri_tap_k & 3) | 2;
+                    }
+                    dst[x] = px + ((sum - (sum < 0) + 8) >> 4);
+                }
+                dst += PXSTRIDE(dst_stride);
+                tmp += tmp_stride;
+            } while (--h);
         }
-        dst += PXSTRIDE(dst_stride);
-        tmp += tmp_stride;
+    } else { // sec_strength only
+        assert(sec_strength);
+        const int sec_shift = imax(0, damping - ulog2(sec_strength));
+        do {
+            for (int x = 0; x < w; x++) {
+                const int px = dst[x];
+                int sum = 0;
+                for (int k = 0; k < 2; k++) {
+                    const int off1 = dav1d_cdef_directions[dir + 4][k]; // dir + 2
+                    const int off2 = dav1d_cdef_directions[dir + 0][k]; // dir - 2
+                    const int s0 = tmp[x + off1];
+                    const int s1 = tmp[x - off1];
+                    const int s2 = tmp[x + off2];
+                    const int s3 = tmp[x - off2];
+                    const int sec_tap = 2 - k;
+                    sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
+                    sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
+                    sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
+                    sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
+                }
+                dst[x] = px + ((sum - (sum < 0) + 8) >> 4);
+            }
+            dst += PXSTRIDE(dst_stride);
+            tmp += tmp_stride;
+        } while (--h);
     }
 }
 
@@ -177,8 +216,8 @@
                                             const enum CdefEdgeFlags edges \
                                             HIGHBD_DECL_SUFFIX) \
 { \
-    cdef_filter_block_c(dst, stride, left, top, w, h, pri_strength, sec_strength, \
-                        dir, damping, edges HIGHBD_TAIL_SUFFIX); \
+    cdef_filter_block_c(dst, stride, left, top, pri_strength, sec_strength, \
+                        dir, damping, w, h, edges HIGHBD_TAIL_SUFFIX); \
 }
 
 cdef_fn(4, 4);
--- a/src/tables.c
+++ b/src/tables.c
@@ -397,6 +397,21 @@
     .delta = 0,
 };
 
+const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {
+    {  1 * 12 + 0,  2 * 12 + 0 }, // 6
+    {  1 * 12 + 0,  2 * 12 - 1 }, // 7
+    { -1 * 12 + 1, -2 * 12 + 2 }, // 0
+    {  0 * 12 + 1, -1 * 12 + 2 }, // 1
+    {  0 * 12 + 1,  0 * 12 + 2 }, // 2
+    {  0 * 12 + 1,  1 * 12 + 2 }, // 3
+    {  1 * 12 + 1,  2 * 12 + 2 }, // 4
+    {  1 * 12 + 0,  2 * 12 + 1 }, // 5
+    {  1 * 12 + 0,  2 * 12 + 0 }, // 6
+    {  1 * 12 + 0,  2 * 12 - 1 }, // 7
+    { -1 * 12 + 1, -2 * 12 + 2 }, // 0
+    {  0 * 12 + 1, -1 * 12 + 2 }, // 1
+};
+
 const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
     { 2, 1, 140, 3236 }, { 2, 1, 112, 2158 }, { 2, 1, 93, 1618 },
     { 2, 1,  80, 1438 }, { 2, 1,  70, 1295 }, { 2, 1, 58, 1177 },
--- a/src/tables.h
+++ b/src/tables.h
@@ -105,6 +105,8 @@
 
 extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
 
+extern const int8_t dav1d_cdef_directions[12][2];
+
 extern const int16_t dav1d_sgr_params[16][4];
 extern const uint8_t dav1d_sgr_x_by_x[256];