shithub: dav1d

--- a/src/lf_apply.c

+++ b/src/lf_apply.c

@@ -34,14 +34,6 @@

 #include "src/lf_apply.h"

-static inline int maxifzero(const uint8_t (*const a)[4],

-                            const uint8_t (*const b)[4], const int diridx)

-{

-    const int a_val = (*a)[diridx];

-    if (a_val) return a_val;

-    return (*b)[diridx];

-}

 static inline void filter_plane_cols_y(const Dav1dFrameContext *const f,

                                        const int have_left,

                                        const uint8_t (*lvl)[4],

@@ -48,30 +40,22 @@

                                        const ptrdiff_t b4_stride,

                                        const uint32_t (*const mask)[3],

                                        pixel *dst, const ptrdiff_t ls,

+                                       const int w,

                                        const int starty4, const int endy4)

     const Dav1dDSPContext *const dsp = f->dsp;

     // filter edges between columns (e.g. block1 | block2)

-    for (int y = starty4; y < endy4;

-         y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)

-    {

-        pixel *ptr = dst;

-        const uint8_t (*l)[4] = lvl;

-        const uint32_t *const hmask = mask[y];

-        const unsigned hm = hmask[0] | hmask[1] | hmask[2];

-        for (unsigned x = 1; hm & ~(x - 1); l++, x <<= 1, ptr += 4) {

-            if ((have_left || x > 1) && (hm & x)) {

-                const int L = maxifzero(l, &l[-1], 0);

-                if (!L) continue;

-                const int H = L >> 4;

-                const int E = f->lf.lim_lut.e[L], I = f->lf.lim_lut.i[L];

-                const int idx = (hmask[2] & x) ? 2 : !!(hmask[1] & x);

-                dsp->lf.loop_filter[idx][0](ptr, ls, E, I, H);

-            }

-        }

+    for (int x = 0; x < w; x++) {

+        if (!have_left && !x) continue;

+        dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls,

+                                     starty4 ? (const uint32_t[3]) {

+                                         mask[x][0] >> starty4,

+                                         mask[x][1] >> starty4,

+                                         mask[x][2] >> starty4,

+                                     } : mask[x],

+                                     (const uint8_t(*)[4]) &lvl[x][0], b4_stride,

+                                     &f->lf.lim_lut, endy4 - starty4);

@@ -93,9 +77,9 @@

          y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)

         if (!have_top && !y) continue;

-        dsp->lf.loop_filter_sb128y(dst, ls, mask[y],

-                                   (const uint8_t(*)[4]) &lvl[0][1], b4_stride,

-                                   &f->lf.lim_lut, w);

+        dsp->lf.loop_filter_sb[0][1](dst, ls, mask[y],

+                                     (const uint8_t(*)[4]) &lvl[0][1], b4_stride,

+                                     &f->lf.lim_lut, w);

@@ -105,45 +89,28 @@

                                         const ptrdiff_t b4_stride,

                                         const uint32_t (*const mask)[2],

                                         pixel *const u, pixel *const v,

-                                        const ptrdiff_t ls,

+                                        const ptrdiff_t ls, const int w,

                                         const int starty4, const int endy4)

     const Dav1dDSPContext *const dsp = f->dsp;

-    int y;

-    ptrdiff_t off_l;

-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;

-    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;

     // filter edges between columns (e.g. block1 | block2)

-    for (off_l = 0, y = starty4; y < endy4;

-         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)

-    {

-        ptrdiff_t off = off_l;

-        const uint8_t (*l)[4] = lvl;

-        const uint32_t *const hmask = mask[y];

-        const unsigned hm = hmask[0] | hmask[1];

-        for (unsigned x = 1; hm & ~(x - 1); l++, x <<= 1, off += 4) {

-            if ((have_left || x > 1) && (hm & x)) {

-                const int idx = !!(hmask[1] & x);

-                const int Lu = maxifzero(l, &l[-1], 2);

-                if (Lu) {

-                    const int H = Lu >> 4;

-                    const int E = f->lf.lim_lut.e[Lu], I = f->lf.lim_lut.i[Lu];

-                    dsp->lf.loop_filter_uv[idx][0](&u[off], ls, E, I, H);

-                }

-                const int Lv = maxifzero(l, &l[-1], 3);

-                if (Lv) {

-                    const int H = Lv >> 4;

-                    const int E = f->lf.lim_lut.e[Lv], I = f->lf.lim_lut.i[Lv];

-                    dsp->lf.loop_filter_uv[idx][0](&v[off], ls, E, I, H);

-                }

-            }

-        }

+    for (int x = 0; x < w; x++) {

+        if (!have_left && !x) continue;

+        dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls,

+                                     starty4 ? (const uint32_t[2]) {

+                                         mask[x][0] >> starty4,

+                                         mask[x][1] >> starty4,

+                                     } : mask[x],

+                                     (const uint8_t(*)[4]) &lvl[x][2], b4_stride,

+                                     &f->lf.lim_lut, endy4 - starty4);

+        dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls,

+                                     starty4 ? (const uint32_t[2]) {

+                                         mask[x][0] >> starty4,

+                                         mask[x][1] >> starty4,

+                                     } : mask[x],

+                                     (const uint8_t(*)[4]) &lvl[x][3], b4_stride,

+                                     &f->lf.lim_lut, endy4 - starty4);

@@ -167,12 +134,12 @@

          y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)

         if (!have_top && !y) continue;

-        dsp->lf.loop_filter_sb128uv(&u[off_l], ls, mask[y],

-                                    (const uint8_t(*)[4]) &lvl[0][2], b4_stride,

-                                    &f->lf.lim_lut, w);

-        dsp->lf.loop_filter_sb128uv(&v[off_l], ls, mask[y],

-                                    (const uint8_t(*)[4]) &lvl[0][3], b4_stride,

-                                    &f->lf.lim_lut, w);

+        dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, mask[y],

+                                     (const uint8_t(*)[4]) &lvl[0][2], b4_stride,

+                                     &f->lf.lim_lut, w);

+        dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, mask[y],

+                                     (const uint8_t(*)[4]) &lvl[0][3], b4_stride,

+                                     &f->lf.lim_lut, w);

@@ -200,22 +167,23 @@

     for (int tile_col = 1;; tile_col++) {

         x = f->frame_hdr.tiling.col_start_sb[tile_col];

         if ((x << sbl2) >= f->bw) break;

-        const int mask = x & is_sb64 ? 1 << 16 : 1;

-        const int uv_mask = x & is_sb64 ? 1 << (16 >> ss_hor) : 1;

+        const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;

         x >>= is_sb64;

-        for (int y = starty4; y < endy4; y++) {

-            const int idx = 2 * !!(lflvl[x].filter_y[0][y][2] & mask) +

-                                !!(lflvl[x].filter_y[0][y][1] & mask);

-            lflvl[x].filter_y[0][y][2] &= ~mask;

-            lflvl[x].filter_y[0][y][1] &= ~mask;

-            lflvl[x].filter_y[0][y][0] &= ~mask;

-            lflvl[x].filter_y[0][y][imin(idx, lpf_y[y - starty4])] |= mask;

+        for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {

+            const int idx = 2 * !!(lflvl[x].filter_y[0][bx4][2] & mask) +

+                                !!(lflvl[x].filter_y[0][bx4][1] & mask);

+            lflvl[x].filter_y[0][bx4][2] &= ~mask;

+            lflvl[x].filter_y[0][bx4][1] &= ~mask;

+            lflvl[x].filter_y[0][bx4][0] &= ~mask;

+            lflvl[x].filter_y[0][bx4][imin(idx, lpf_y[y - starty4])] |= mask;

-        for (int y = starty4 >> ss_ver; y < uv_endy4; y++) {

-            const int idx = !!(lflvl[x].filter_uv[0][y][1] & uv_mask);

-            lflvl[x].filter_uv[0][y][1] &= ~uv_mask;

-            lflvl[x].filter_uv[0][y][0] &= ~uv_mask;

-            lflvl[x].filter_uv[0][y][imin(idx, lpf_uv[y - (starty4 >> ss_ver)])] |= uv_mask;

+        for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;

+             y++, uv_mask <<= 1)

+        {

+            const int idx = !!(lflvl[x].filter_uv[0][cbx4][1] & uv_mask);

+            lflvl[x].filter_uv[0][cbx4][1] &= ~uv_mask;

+            lflvl[x].filter_uv[0][cbx4][0] &= ~uv_mask;

+            lflvl[x].filter_uv[0][cbx4][imin(idx, lpf_uv[y - (starty4 >> ss_ver)])] |= uv_mask;

         lpf_y  += halign;

         lpf_uv += halign >> ss_ver;

@@ -257,8 +225,8 @@

          x++, have_left = 1, ptr += 128, level_ptr += 32)

         filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,

-                            lflvl[x].filter_y[0],

-                            ptr, f->cur.p.stride[0], starty4, endy4);

+                            lflvl[x].filter_y[0], ptr, f->cur.p.stride[0],

+                            imin(32, f->bw - x * 32), starty4, endy4);

     level_ptr = f->lf.level + f->b4_stride * sby * sbsz;

@@ -279,6 +247,7 @@

         filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,

                              lflvl[x].filter_uv[0],

                              &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],

+                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,

                              starty4 >> ss_ver, uv_endy4);

--- a/src/lf_mask.c

+++ b/src/lf_mask.c

@@ -98,30 +98,24 @@

                       max_tx, 0, y_off, x_off, tx_masks);

     // left block edge

-    unsigned mask = 1U << bx4;

-    for (y = 0; y < h4; y++)

-        masks[0][by4 + y][imin(txa[0][0][y][0], l[y])] |= mask;

+    unsigned mask = 1U << by4;

+    for (y = 0; y < h4; y++, mask <<= 1)

+        masks[0][bx4][imin(txa[0][0][y][0], l[y])] |= mask;

     // top block edge

-    for (x = 0; x < w4; x++, mask <<= 1)

+    for (mask = 1U << bx4, x = 0; x < w4; x++, mask <<= 1)

         masks[1][by4][imin(txa[1][0][0][x], a[x])] |= mask;

     if (!skip) {

         // inner (tx) left|right edges

-        for (y = 0; y < h4; y++) {

+        for (y = 0, mask = 1U << by4; y < h4; y++, mask <<= 1) {

             int ltx = txa[0][0][y][0];

             int step = txa[0][1][y][0];

-            if (step < w4) {

-                x = step;

-                mask = 1U << (bx4 + step);

-                do {

-                    const int rtx = txa[0][0][y][x];

-                    masks[0][by4 + y][imin(rtx, ltx)] |= mask;

-                    ltx = rtx;

-                    step = txa[0][1][y][x];

-                    x += step;

-                    mask <<= step;

-                } while (x < w4);

+            for (x = step; x < w4; x += step) {

+                const int rtx = txa[0][0][y][x];

+                masks[0][bx4 + x][imin(rtx, ltx)] |= mask;

+                ltx = rtx;

+                step = txa[0][1][y][x];

@@ -157,29 +151,27 @@

     int y, x;

     // left block edge

-    unsigned mask = 1U << bx4;

-    for (y = 0; y < h4; y++)

-        masks[0][by4 + y][imin(twl4c, l[y])] |= mask;

+    unsigned mask = 1U << by4;

+    for (y = 0; y < h4; y++, mask <<= 1)

+        masks[0][bx4][imin(twl4c, l[y])] |= mask;

     // top block edge

     for (mask = 1U << bx4, x = 0; x < w4; x++, mask <<= 1)

         masks[1][by4][imin(thl4c, a[x])] |= mask;

-    static const uint32_t hstep[] = {

-        0xffffffff, 0x55555555, 0x11111111, 0x01010101, 0x00010001

-    };

     // inner (tx) left|right edges

-    const unsigned t = 1U << bx4;

-    const unsigned inner = (((uint64_t) t) << w4) - t;

-    mask = (inner - t) & hstep[twl4];

-    for (y = 0; y < h4; y++)

-        masks[0][by4 + y][twl4c] |= mask;

+    const int hstep = t_dim->w;

+    unsigned t = 1U << by4;

+    unsigned inner = (((uint64_t) t) << h4) - t;

+    for (x = hstep; x < w4; x += hstep)

+        masks[0][bx4 + x][twl4c] |= inner;

     //            top

     // inner (tx) --- edges

     //           bottom

     const int vstep = t_dim->h;

+    t = 1U << bx4;

+    inner = (((uint64_t) t) << w4) - t;

     for (y = vstep; y < h4; y += vstep)

         masks[1][by4 + y][thl4c] |= inner;

@@ -200,9 +192,9 @@

     int y, x;

     // left block edge

-    unsigned mask = 1U << cbx4;

-    for (y = 0; y < ch4; y++)

-        masks[0][cby4 + y][imin(twl4c, l[y])] |= mask;

+    unsigned mask = 1U << cby4;

+    for (y = 0; y < ch4; y++, mask <<= 1)

+        masks[0][cbx4][imin(twl4c, l[y])] |= mask;

     // top block edge

     for (mask = 1U << cbx4, x = 0; x < cw4; x++, mask <<= 1)

@@ -209,21 +201,19 @@

         masks[1][cby4][imin(thl4c, a[x])] |= mask;

     if (!skip_inter) {

-        static const uint32_t hstep[] = {

-            0xffffffff, 0x55555555, 0x11111111, 0x01010101

-        };

         // inner (tx) left|right edges

-        const int t = 1U << cbx4;

-        const unsigned inner = (((uint64_t) t) << cw4) - t;

-        mask = (inner - t) & hstep[twl4];

-        for (y = 0; y < ch4; y++)

-            masks[0][cby4 + y][twl4c] |= mask;

+        const int hstep = t_dim->w;

+        int t = 1U << cby4;

+        unsigned inner = (((uint64_t) t) << ch4) - t;

+        for (x = hstep; x < cw4; x += hstep)

+            masks[0][cbx4 + x][twl4c] |= inner;

         //            top

         // inner (tx) --- edges

         //           bottom

         const int vstep = t_dim->h;

+        t = 1U << cbx4;

+        inner = (((uint64_t) t) << cw4) - t;

         for (y = vstep; y < ch4; y += vstep)

             masks[1][cby4 + y][thl4c] |= inner;

--- a/src/loopfilter.c

+++ b/src/loopfilter.c

@@ -159,27 +159,26 @@

-#define lf_4_fn(dir, wd, stridea, strideb) \

-static void loop_filter_##dir##_##wd##wd_4px_c(pixel *const dst, \

-                                               const ptrdiff_t stride, \

-                                               const int E, const int I, \

-                                               const int H) \

-{ \

-    loop_filter(dst, E, I, H, stridea, strideb, wd); \

+static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,

+                                   const uint32_t *const vmask,

+                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,

+                                   const Av1FilterLUT *lut, const int h)

+{

+    const unsigned vm = (vmask[0] | vmask[1] | vmask[2]) & ((1ULL << h) - 1);

+    for (unsigned y = 1; vm & ~(y - 1);

+         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)

+    {

+        if (vm & y) {

+            const int L = l[0][0] ? l[0][0] : l[-1][0];

+            if (!L) continue;

+            const int H = L >> 4;

+            const int E = lut->e[L], I = lut->i[L];

+            const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y);

+            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx);

+        }

+    }

-#define lf_4_fns(wd) \

-lf_4_fn(h, wd, PXSTRIDE(stride), 1) \

-lf_4_fn(v, wd, 1, PXSTRIDE(stride))

-lf_4_fns(4)

-lf_4_fns(6)

-lf_4_fns(8)

-lf_4_fns(16)

-#undef lf_4_fn

-#undef lf_4_fns

 static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,

                                    const uint32_t *const vmask,

                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,

@@ -198,6 +197,26 @@

+static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,

+                                    const uint32_t *const vmask,

+                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,

+                                    const Av1FilterLUT *lut, const int h)

+{

+    const unsigned vm = (vmask[0] | vmask[1]) & ((1ULL << h) - 1);

+    for (unsigned y = 1; vm & ~(y - 1);

+         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)

+    {

+        if (vm & y) {

+            const int L = l[0][0] ? l[0][0] : l[-1][0];

+            if (!L) continue;

+            const int H = L >> 4;

+            const int E = lut->e[L], I = lut->i[L];

+            const int idx = !!(vmask[1] & y);

+            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx);

+        }

+    }

+}

 static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,

                                     const uint32_t *const vmask,

                                     const uint8_t (*l)[4], ptrdiff_t b4_stride,

@@ -217,20 +236,10 @@

 void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {

-    c->loop_filter[0][0] = loop_filter_h_4wd_4px_c;

-    c->loop_filter[0][1] = loop_filter_v_4wd_4px_c;

-    c->loop_filter[1][0] = loop_filter_h_8wd_4px_c;

-    c->loop_filter[1][1] = loop_filter_v_8wd_4px_c;

-    c->loop_filter[2][0] = loop_filter_h_16wd_4px_c;

-    c->loop_filter[2][1] = loop_filter_v_16wd_4px_c;

-    c->loop_filter_uv[0][0] = loop_filter_h_4wd_4px_c;

-    c->loop_filter_uv[0][1] = loop_filter_v_4wd_4px_c;

-    c->loop_filter_uv[1][0] = loop_filter_h_6wd_4px_c;

-    c->loop_filter_uv[1][1] = loop_filter_v_6wd_4px_c;

-    c->loop_filter_sb128y = loop_filter_v_sb128y_c;

-    c->loop_filter_sb128uv = loop_filter_v_sb128uv_c;

+    c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;

+    c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;

+    c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;

+    c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;

 #if HAVE_ASM && ARCH_X86

     bitfn(dav1d_loop_filter_dsp_init_x86)(c);

--- a/src/loopfilter.h

+++ b/src/loopfilter.h

@@ -36,10 +36,6 @@

 #include "src/levels.h"

 #include "src/lf_mask.h"

-#define decl_loopfilter_fn(name) \

-void (name)(pixel *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)

-typedef decl_loopfilter_fn(*loopfilter_fn);

 #define decl_loopfilter_sb_fn(name) \

 void (name)(pixel *dst, ptrdiff_t stride, const uint32_t *mask, \

             const uint8_t (*lvl)[4], ptrdiff_t lvl_stride, \

@@ -48,15 +44,12 @@

 typedef struct Dav1dLoopFilterDSPContext {

/*

-     * dimension 1: filter taps (0=4, 1=8, 2=16 for luma; 0=4, 1=6 for chroma)

+     * dimension 1: plane (0=luma, 1=chroma)

      * dimension 2: 0=col-edge filter (h), 1=row-edge filter (v)

-     * dst/stride are aligned by 4

+     * dst/stride are aligned by 32

*/

-    loopfilter_fn loop_filter[3][2];

-    loopfilter_fn loop_filter_uv[2][2];

-    loopfilter_sb_fn loop_filter_sb128y;

-    loopfilter_sb_fn loop_filter_sb128uv;

+    loopfilter_sb_fn loop_filter_sb[2][2];

 } Dav1dLoopFilterDSPContext;

 void dav1d_loop_filter_dsp_init_8bpc(Dav1dLoopFilterDSPContext *c);

--- a/src/x86/loopfilter_init.c

+++ b/src/x86/loopfilter_init.c

@@ -37,7 +37,7 @@

     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

 #if BITDEPTH == 8 && ARCH_X86_64

-    c->loop_filter_sb128y = dav1d_lpf_v_sb128y_avx2;

-    c->loop_filter_sb128uv = dav1d_lpf_v_sb128uv_avx2;

+    c->loop_filter_sb[0][1] = dav1d_lpf_v_sb128y_avx2;

+    c->loop_filter_sb[1][1] = dav1d_lpf_v_sb128uv_avx2;

 #endif

--- a/tests/checkasm/loopfilter.c

+++ b/tests/checkasm/loopfilter.c

@@ -161,6 +161,6 @@

     bitfn(dav1d_loop_filter_dsp_init)(&c);

-    check_lpf_sb(c.loop_filter_sb128y, "lpf_v_sb128y", 3, 32, 1);

-    check_lpf_sb(c.loop_filter_sb128uv, "lpf_v_sb128uv", 2, 16, 2);

+    check_lpf_sb(c.loop_filter_sb[0][1], "lpf_v_sb128y", 3, 32, 1);

+    check_lpf_sb(c.loop_filter_sb[1][1], "lpf_v_sb128uv", 2, 16, 2);

--

⑨