shithub: dav1d

Download patch

ref: 36647aaa1aec66983d2d470c3346e0102a5a240b
parent: 76646c7d94199f4e15dddd4c7e6151dc51103ed7
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Fri Oct 5 13:44:18 EDT 2018

Vertical loopfilter simplifications

- remove the have_b edge condition in maxifzero(), since it is always
  true at every call site;
- make the subsampled level array packed;
- handle an SB's worth of 4px blocks per DSP function call, so that
  SIMD will be more efficient.

--- a/src/lf_apply.c
+++ b/src/lf_apply.c
@@ -34,12 +34,11 @@
 
 #include "src/lf_apply.h"
 
-static inline int maxifzero(const uint8_t (*const a)[4], const int have_b,
+static inline int maxifzero(const uint8_t (*const a)[4],
                             const uint8_t (*const b)[4], const int diridx)
 {
     const int a_val = (*a)[diridx];
     if (a_val) return a_val;
-    if (!have_b) return a_val;
     return (*b)[diridx];
 }
 
@@ -64,7 +63,7 @@
 
         for (unsigned x = 1; hm & ~(x - 1); l++, x <<= 1, ptr += 4) {
             if ((have_left || x > 1) && (hm & x)) {
-                const int L = maxifzero(l, have_left || x > 1, &l[-1], 0);
+                const int L = maxifzero(l, &l[-1], 0);
                 if (!L) continue;
                 const int H = L >> 4;
                 const int E = f->lf.lim_lut.e[L], I = f->lf.lim_lut.i[L];
@@ -82,6 +81,7 @@
                                        const ptrdiff_t b4_stride,
                                        const uint32_t (*const mask)[3],
                                        pixel *dst, const ptrdiff_t ls,
+                                       const int w,
                                        const int starty4, const int endy4)
 {
     const Dav1dDSPContext *const dsp = f->dsp;
@@ -92,22 +92,10 @@
     for (int y = starty4; y < endy4;
          y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
     {
-        pixel *ptr = dst;
-        const uint8_t (*l)[4] = lvl;
-        const uint32_t *const vmask = mask[y];
-        const unsigned vm = vmask[0] | vmask[1] | vmask[2];
-
-        for (unsigned x = 1; vm & ~(x - 1); x <<= 1, ptr += 4, l++) {
-            if ((have_top || y) && (vm & x)) {
-                const int L = maxifzero(l, have_top || y, &l[-b4_stride], 1);
-                if (!L) continue;
-                const int H = L >> 4;
-                const int E = f->lf.lim_lut.e[L], I = f->lf.lim_lut.i[L];
-                const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x);
-
-                dsp->lf.loop_filter[idx][1](ptr, ls, E, I, H);
-            }
-        }
+        if (!have_top && !y) continue;
+        dsp->lf.loop_filter_sb128y(dst, ls, mask[y],
+                                   (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
+                                   &f->lf.lim_lut, w);
     }
 }
 
@@ -125,12 +113,10 @@
     ptrdiff_t off_l;
     const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int hstep = 1 << ss_hor;
 
     // filter edges between columns (e.g. block1 | block2)
-    lvl += ss_hor + ss_ver * b4_stride;
     for (off_l = 0, y = starty4; y < endy4;
-         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride << ss_ver)
+         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
     {
         ptrdiff_t off = off_l;
         const uint8_t (*l)[4] = lvl;
@@ -137,11 +123,11 @@
         const uint32_t *const hmask = mask[y];
         const unsigned hm = hmask[0] | hmask[1];
 
-        for (unsigned x = 1; hm & ~(x - 1); l += hstep, x <<= 1, off += 4) {
+        for (unsigned x = 1; hm & ~(x - 1); l++, x <<= 1, off += 4) {
             if ((have_left || x > 1) && (hm & x)) {
                 const int idx = !!(hmask[1] & x);
 
-                const int Lu = maxifzero(l, have_left || x > 1, &l[-hstep], 2);
+                const int Lu = maxifzero(l, &l[-1], 2);
                 if (Lu) {
                     const int H = Lu >> 4;
                     const int E = f->lf.lim_lut.e[Lu], I = f->lf.lim_lut.i[Lu];
@@ -149,7 +135,7 @@
                     dsp->lf.loop_filter_uv[idx][0](&u[off], ls, E, I, H);
                 }
 
-                const int Lv = maxifzero(l, have_left || x > 1, &l[-hstep], 3);
+                const int Lv = maxifzero(l, &l[-1], 3);
                 if (Lv) {
                     const int H = Lv >> 4;
                     const int E = f->lf.lim_lut.e[Lv], I = f->lf.lim_lut.i[Lv];
@@ -167,51 +153,26 @@
                                         const ptrdiff_t b4_stride,
                                         const uint32_t (*const mask)[2],
                                         pixel *const u, pixel *const v,
-                                        const ptrdiff_t ls,
+                                        const ptrdiff_t ls, const int w,
                                         const int starty4, const int endy4)
 {
     const Dav1dDSPContext *const dsp = f->dsp;
     int y;
     ptrdiff_t off_l;
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int hstep = 1 << ss_hor;
 
     //                                 block1
     // filter edges between rows (e.g. ------)
     //                                 block2
-    lvl += ss_ver * b4_stride + ss_hor;
     for (off_l = 0, y = starty4; y < endy4;
-         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride << ss_ver)
+         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
     {
-        ptrdiff_t off = off_l;
-        const uint8_t (*l)[4] = lvl;
-        const uint32_t *const vmask = mask[y];
-        const unsigned vm = vmask[0] | vmask[1];
-
-        for (unsigned x = 1; vm & ~(x - 1); x <<= 1, off += 4, l += hstep) {
-            if ((have_top || y) && (vm & x)) {
-                const int idx = !!(vmask[1] & x);
-
-                const int Lu = maxifzero(l, have_top || y,
-                                         &l[-(b4_stride << ss_ver)], 2);
-                if (Lu) {
-                    const int H = Lu >> 4;
-                    const int E = f->lf.lim_lut.e[Lu], I = f->lf.lim_lut.i[Lu];
-
-                    dsp->lf.loop_filter_uv[idx][1](&u[off], ls, E, I, H);
-                }
-
-                const int Lv = maxifzero(l, have_top || y,
-                                         &l[-(b4_stride << ss_ver)], 3);
-                if (Lv) {
-                    const int H = Lv >> 4;
-                    const int E = f->lf.lim_lut.e[Lv], I = f->lf.lim_lut.i[Lv];
-
-                    dsp->lf.loop_filter_uv[idx][1](&v[off], ls, E, I, H);
-                }
-            }
-        }
+        if (!have_top && !y) continue;
+        dsp->lf.loop_filter_sb128uv(&u[off_l], ls, mask[y],
+                                    (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
+                                    &f->lf.lim_lut, w);
+        dsp->lf.loop_filter_sb128uv(&v[off_l], ls, mask[y],
+                                    (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
+                                    &f->lf.lim_lut, w);
     }
 }
 
@@ -303,8 +264,8 @@
     level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
     for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
         filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
-                            lflvl[x].filter_y[1],
-                            ptr, f->cur.p.stride[0], starty4, endy4);
+                            lflvl[x].filter_y[1], ptr, f->cur.p.stride[0],
+                            imin(32, f->bw - x * 32), starty4, endy4);
     }
 
     if (!f->frame_hdr.loopfilter.level_u && !f->frame_hdr.loopfilter.level_v)
@@ -311,9 +272,9 @@
         return;
 
     ptrdiff_t uv_off;
-    level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
+    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
     for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w;
-         x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32)
+         x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
     {
         filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
                              lflvl[x].filter_uv[0],
@@ -321,13 +282,14 @@
                              starty4 >> ss_ver, uv_endy4);
     }
 
-    level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
+    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
     for (uv_off = 0, x = 0; x < f->sb128w;
-         x++, uv_off += 128 >> ss_hor, level_ptr += 32)
+         x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
     {
         filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
                              lflvl[x].filter_uv[1],
                              &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
+                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
                              starty4 >> ss_ver, uv_endy4);
     }
 }
--- a/src/lf_mask.c
+++ b/src/lf_mask.c
@@ -233,7 +233,7 @@
 }
 
 void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
-                                uint8_t (*level_cache)[4],
+                                uint8_t (*const level_cache)[4],
                                 const ptrdiff_t b4_stride,
                                 const Av1FrameHeader *const hdr,
                                 const uint8_t (*filter_level)[8][2],
@@ -255,15 +255,13 @@
     const int bx4 = bx & 31;
     const int by4 = by & 31;
 
-    level_cache += by * b4_stride + bx;
+    uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx;
     for (int y = 0; y < bh4; y++) {
         for (int x = 0; x < bw4; x++) {
-            level_cache[x][0] = filter_level[0][0][0];
-            level_cache[x][1] = filter_level[1][0][0];
-            level_cache[x][2] = filter_level[2][0][0];
-            level_cache[x][3] = filter_level[3][0][0];
+            level_cache_ptr[x][0] = filter_level[0][0][0];
+            level_cache_ptr[x][1] = filter_level[1][0][0];
         }
-        level_cache += b4_stride;
+        level_cache_ptr += b4_stride;
     }
 
     mask_edges_intra(lflvl->filter_y, by4, bx4, bw4, bh4, ytx, ay, ly);
@@ -277,11 +275,20 @@
     const int cbx4 = bx4 >> ss_hor;
     const int cby4 = by4 >> ss_ver;
 
+    level_cache_ptr = level_cache + (by >> ss_ver) * b4_stride + (bx >> ss_hor);
+    for (int y = 0; y < cbh4; y++) {
+        for (int x = 0; x < cbw4; x++) {
+            level_cache_ptr[x][2] = filter_level[2][0][0];
+            level_cache_ptr[x][3] = filter_level[3][0][0];
+        }
+        level_cache_ptr += b4_stride;
+    }
+
     mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, 0, uvtx, auv, luv);
 }
 
 void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
-                                uint8_t (*level_cache)[4],
+                                uint8_t (*const level_cache)[4],
                                 const ptrdiff_t b4_stride,
                                 const Av1FrameHeader *const hdr,
                                 const uint8_t (*filter_level)[8][2],
@@ -303,15 +310,13 @@
     const int bx4 = bx & 31;
     const int by4 = by & 31;
 
-    level_cache += by * b4_stride + bx;
+    uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx;
     for (int y = 0; y < bh4; y++) {
         for (int x = 0; x < bw4; x++) {
-            level_cache[x][0] = filter_level[0][0][0];
-            level_cache[x][1] = filter_level[1][0][0];
-            level_cache[x][2] = filter_level[2][0][0];
-            level_cache[x][3] = filter_level[3][0][0];
+            level_cache_ptr[x][0] = filter_level[0][0][0];
+            level_cache_ptr[x][1] = filter_level[1][0][0];
         }
-        level_cache += b4_stride;
+        level_cache_ptr += b4_stride;
     }
 
     mask_edges_inter(lflvl->filter_y, by4, bx4, bw4, bh4, skip,
@@ -325,6 +330,15 @@
     const int cbh4 = (bh4 + ss_ver) >> ss_ver;
     const int cbx4 = bx4 >> ss_hor;
     const int cby4 = by4 >> ss_ver;
+
+    level_cache_ptr = level_cache + (by >> ss_ver) * b4_stride + (bx >> ss_hor);
+    for (int y = 0; y < cbh4; y++) {
+        for (int x = 0; x < cbw4; x++) {
+            level_cache_ptr[x][2] = filter_level[2][0][0];
+            level_cache_ptr[x][3] = filter_level[3][0][0];
+        }
+        level_cache_ptr += b4_stride;
+    }
 
     mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, skip, uvtx, auv, luv);
 }
--- a/src/loopfilter.c
+++ b/src/loopfilter.c
@@ -67,7 +67,6 @@
                 fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;
             }
         }
-
         if (!fm) continue;
 
         if (wd >= 16) {
@@ -181,6 +180,42 @@
 #undef lf_4_fn
 #undef lf_4_fns
 
+static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
+                                   const uint32_t *const vmask,
+                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                   const Av1FilterLUT *lut, const int w)
+{
+    const unsigned vm = vmask[0] | vmask[1] | vmask[2];
+    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
+        if (vm & x) {
+            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
+            if (!L) continue;
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x);
+            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx);
+        }
+    }
+}
+
+static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
+                                    const uint32_t *const vmask,
+                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                    const Av1FilterLUT *lut, const int w)
+{
+    const unsigned vm = vmask[0] | vmask[1];
+    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
+        if (vm & x) {
+            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
+            if (!L) continue;
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = !!(vmask[1] & x);
+            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx);
+        }
+    }
+}
+
 void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
     c->loop_filter[0][0] = loop_filter_h_4wd_4px_c;
     c->loop_filter[0][1] = loop_filter_v_4wd_4px_c;
@@ -193,4 +228,7 @@
     c->loop_filter_uv[0][1] = loop_filter_v_4wd_4px_c;
     c->loop_filter_uv[1][0] = loop_filter_h_6wd_4px_c;
     c->loop_filter_uv[1][1] = loop_filter_v_6wd_4px_c;
+
+    c->loop_filter_sb128y = loop_filter_v_sb128y_c;
+    c->loop_filter_sb128uv = loop_filter_v_sb128uv_c;
 }
--- a/src/loopfilter.h
+++ b/src/loopfilter.h
@@ -34,11 +34,18 @@
 #include "common/bitdepth.h"
 
 #include "src/levels.h"
+#include "src/lf_mask.h"
 
 #define decl_loopfilter_fn(name) \
 void (name)(pixel *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
 typedef decl_loopfilter_fn(*loopfilter_fn);
 
+#define decl_loopfilter_sb_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const uint32_t *mask, \
+            const uint8_t (*lvl)[4], ptrdiff_t lvl_stride, \
+            const Av1FilterLUT *lut, int w)
+typedef decl_loopfilter_sb_fn(*loopfilter_sb_fn);
+
 typedef struct Dav1dLoopFilterDSPContext {
     /*
      * dimension 1: filter taps (0=4, 1=8, 2=16 for luma; 0=4, 1=6 for chroma)
@@ -48,6 +55,8 @@
      */
     loopfilter_fn loop_filter[3][2];
     loopfilter_fn loop_filter_uv[2][2];
+    loopfilter_sb_fn loop_filter_sb128y;
+    loopfilter_sb_fn loop_filter_sb128uv;
 } Dav1dLoopFilterDSPContext;
 
 void dav1d_loop_filter_dsp_init_8bpc(Dav1dLoopFilterDSPContext *c);