shithub: dav1d

--- a/src/lf_apply.c

+++ b/src/lf_apply.c

@@ -34,12 +34,11 @@

 #include "src/lf_apply.h"

-static inline int maxifzero(const uint8_t (*const a)[4], const int have_b,

+static inline int maxifzero(const uint8_t (*const a)[4],

                             const uint8_t (*const b)[4], const int diridx)

     const int a_val = (*a)[diridx];

     if (a_val) return a_val;

-    if (!have_b) return a_val;

     return (*b)[diridx];

@@ -64,7 +63,7 @@

         for (unsigned x = 1; hm & ~(x - 1); l++, x <<= 1, ptr += 4) {

             if ((have_left || x > 1) && (hm & x)) {

-                const int L = maxifzero(l, have_left || x > 1, &l[-1], 0);

+                const int L = maxifzero(l, &l[-1], 0);

                 if (!L) continue;

                 const int H = L >> 4;

                 const int E = f->lf.lim_lut.e[L], I = f->lf.lim_lut.i[L];

@@ -82,6 +81,7 @@

                                        const ptrdiff_t b4_stride,

                                        const uint32_t (*const mask)[3],

                                        pixel *dst, const ptrdiff_t ls,

+                                       const int w,

                                        const int starty4, const int endy4)

     const Dav1dDSPContext *const dsp = f->dsp;

@@ -92,22 +92,10 @@

     for (int y = starty4; y < endy4;

          y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)

-        pixel *ptr = dst;

-        const uint8_t (*l)[4] = lvl;

-        const uint32_t *const vmask = mask[y];

-        const unsigned vm = vmask[0] | vmask[1] | vmask[2];

-        for (unsigned x = 1; vm & ~(x - 1); x <<= 1, ptr += 4, l++) {

-            if ((have_top || y) && (vm & x)) {

-                const int L = maxifzero(l, have_top || y, &l[-b4_stride], 1);

-                if (!L) continue;

-                const int H = L >> 4;

-                const int E = f->lf.lim_lut.e[L], I = f->lf.lim_lut.i[L];

-                const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x);

-                dsp->lf.loop_filter[idx][1](ptr, ls, E, I, H);

-            }

-        }

+        if (!have_top && !y) continue;

+        dsp->lf.loop_filter_sb128y(dst, ls, mask[y],

+                                   (const uint8_t(*)[4]) &lvl[0][1], b4_stride,

+                                   &f->lf.lim_lut, w);

@@ -125,12 +113,10 @@

     ptrdiff_t off_l;

     const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;

     const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;

-    const int hstep = 1 << ss_hor;

     // filter edges between columns (e.g. block1 | block2)

-    lvl += ss_hor + ss_ver * b4_stride;

     for (off_l = 0, y = starty4; y < endy4;

-         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride << ss_ver)

+         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)

         ptrdiff_t off = off_l;

         const uint8_t (*l)[4] = lvl;

@@ -137,11 +123,11 @@

         const uint32_t *const hmask = mask[y];

         const unsigned hm = hmask[0] | hmask[1];

-        for (unsigned x = 1; hm & ~(x - 1); l += hstep, x <<= 1, off += 4) {

+        for (unsigned x = 1; hm & ~(x - 1); l++, x <<= 1, off += 4) {

             if ((have_left || x > 1) && (hm & x)) {

                 const int idx = !!(hmask[1] & x);

-                const int Lu = maxifzero(l, have_left || x > 1, &l[-hstep], 2);

+                const int Lu = maxifzero(l, &l[-1], 2);

                 if (Lu) {

                     const int H = Lu >> 4;

                     const int E = f->lf.lim_lut.e[Lu], I = f->lf.lim_lut.i[Lu];

@@ -149,7 +135,7 @@

                     dsp->lf.loop_filter_uv[idx][0](&u[off], ls, E, I, H);

-                const int Lv = maxifzero(l, have_left || x > 1, &l[-hstep], 3);

+                const int Lv = maxifzero(l, &l[-1], 3);

                 if (Lv) {

                     const int H = Lv >> 4;

                     const int E = f->lf.lim_lut.e[Lv], I = f->lf.lim_lut.i[Lv];

@@ -167,51 +153,26 @@

                                         const ptrdiff_t b4_stride,

                                         const uint32_t (*const mask)[2],

                                         pixel *const u, pixel *const v,

-                                        const ptrdiff_t ls,

+                                        const ptrdiff_t ls, const int w,

                                         const int starty4, const int endy4)

     const Dav1dDSPContext *const dsp = f->dsp;

     int y;

     ptrdiff_t off_l;

-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;

-    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;

-    const int hstep = 1 << ss_hor;

     //                                 block1

     // filter edges between rows (e.g. ------)

     //                                 block2

-    lvl += ss_ver * b4_stride + ss_hor;

     for (off_l = 0, y = starty4; y < endy4;

-         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride << ss_ver)

+         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)

-        ptrdiff_t off = off_l;

-        const uint8_t (*l)[4] = lvl;

-        const uint32_t *const vmask = mask[y];

-        const unsigned vm = vmask[0] | vmask[1];

-        for (unsigned x = 1; vm & ~(x - 1); x <<= 1, off += 4, l += hstep) {

-            if ((have_top || y) && (vm & x)) {

-                const int idx = !!(vmask[1] & x);

-                const int Lu = maxifzero(l, have_top || y,

-                                         &l[-(b4_stride << ss_ver)], 2);

-                if (Lu) {

-                    const int H = Lu >> 4;

-                    const int E = f->lf.lim_lut.e[Lu], I = f->lf.lim_lut.i[Lu];

-                    dsp->lf.loop_filter_uv[idx][1](&u[off], ls, E, I, H);

-                }

-                const int Lv = maxifzero(l, have_top || y,

-                                         &l[-(b4_stride << ss_ver)], 3);

-                if (Lv) {

-                    const int H = Lv >> 4;

-                    const int E = f->lf.lim_lut.e[Lv], I = f->lf.lim_lut.i[Lv];

-                    dsp->lf.loop_filter_uv[idx][1](&v[off], ls, E, I, H);

-                }

-            }

-        }

+        if (!have_top && !y) continue;

+        dsp->lf.loop_filter_sb128uv(&u[off_l], ls, mask[y],

+                                    (const uint8_t(*)[4]) &lvl[0][2], b4_stride,

+                                    &f->lf.lim_lut, w);

+        dsp->lf.loop_filter_sb128uv(&v[off_l], ls, mask[y],

+                                    (const uint8_t(*)[4]) &lvl[0][3], b4_stride,

+                                    &f->lf.lim_lut, w);

@@ -303,8 +264,8 @@

     level_ptr = f->lf.level + f->b4_stride * sby * sbsz;

     for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {

         filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,

-                            lflvl[x].filter_y[1],

-                            ptr, f->cur.p.stride[0], starty4, endy4);

+                            lflvl[x].filter_y[1], ptr, f->cur.p.stride[0],

+                            imin(32, f->bw - x * 32), starty4, endy4);

     if (!f->frame_hdr.loopfilter.level_u && !f->frame_hdr.loopfilter.level_v)

@@ -311,9 +272,9 @@

         return;

     ptrdiff_t uv_off;

-    level_ptr = f->lf.level + f->b4_stride * sby * sbsz;

+    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);

     for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w;

-         x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32)

+         x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)

         filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,

                              lflvl[x].filter_uv[0],

@@ -321,13 +282,14 @@

                              starty4 >> ss_ver, uv_endy4);

-    level_ptr = f->lf.level + f->b4_stride * sby * sbsz;

+    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);

     for (uv_off = 0, x = 0; x < f->sb128w;

-         x++, uv_off += 128 >> ss_hor, level_ptr += 32)

+         x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)

         filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,

                              lflvl[x].filter_uv[1],

                              &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],

+                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,

                              starty4 >> ss_ver, uv_endy4);

--- a/src/lf_mask.c

+++ b/src/lf_mask.c

@@ -233,7 +233,7 @@

 void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,

-                                uint8_t (*level_cache)[4],

+                                uint8_t (*const level_cache)[4],

                                 const ptrdiff_t b4_stride,

                                 const Av1FrameHeader *const hdr,

                                 const uint8_t (*filter_level)[8][2],

@@ -255,15 +255,13 @@

     const int bx4 = bx & 31;

     const int by4 = by & 31;

-    level_cache += by * b4_stride + bx;

+    uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx;

     for (int y = 0; y < bh4; y++) {

         for (int x = 0; x < bw4; x++) {

-            level_cache[x][0] = filter_level[0][0][0];

-            level_cache[x][1] = filter_level[1][0][0];

-            level_cache[x][2] = filter_level[2][0][0];

-            level_cache[x][3] = filter_level[3][0][0];

+            level_cache_ptr[x][0] = filter_level[0][0][0];

+            level_cache_ptr[x][1] = filter_level[1][0][0];

-        level_cache += b4_stride;

+        level_cache_ptr += b4_stride;

     mask_edges_intra(lflvl->filter_y, by4, bx4, bw4, bh4, ytx, ay, ly);

@@ -277,11 +275,20 @@

     const int cbx4 = bx4 >> ss_hor;

     const int cby4 = by4 >> ss_ver;

+    level_cache_ptr = level_cache + (by >> ss_ver) * b4_stride + (bx >> ss_hor);

+    for (int y = 0; y < cbh4; y++) {

+        for (int x = 0; x < cbw4; x++) {

+            level_cache_ptr[x][2] = filter_level[2][0][0];

+            level_cache_ptr[x][3] = filter_level[3][0][0];

+        }

+        level_cache_ptr += b4_stride;

+    }

     mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, 0, uvtx, auv, luv);

 void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,

-                                uint8_t (*level_cache)[4],

+                                uint8_t (*const level_cache)[4],

                                 const ptrdiff_t b4_stride,

                                 const Av1FrameHeader *const hdr,

                                 const uint8_t (*filter_level)[8][2],

@@ -303,15 +310,13 @@

     const int bx4 = bx & 31;

     const int by4 = by & 31;

-    level_cache += by * b4_stride + bx;

+    uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx;

     for (int y = 0; y < bh4; y++) {

         for (int x = 0; x < bw4; x++) {

-            level_cache[x][0] = filter_level[0][0][0];

-            level_cache[x][1] = filter_level[1][0][0];

-            level_cache[x][2] = filter_level[2][0][0];

-            level_cache[x][3] = filter_level[3][0][0];

+            level_cache_ptr[x][0] = filter_level[0][0][0];

+            level_cache_ptr[x][1] = filter_level[1][0][0];

-        level_cache += b4_stride;

+        level_cache_ptr += b4_stride;

     mask_edges_inter(lflvl->filter_y, by4, bx4, bw4, bh4, skip,

@@ -325,6 +330,15 @@

     const int cbh4 = (bh4 + ss_ver) >> ss_ver;

     const int cbx4 = bx4 >> ss_hor;

     const int cby4 = by4 >> ss_ver;

+    level_cache_ptr = level_cache + (by >> ss_ver) * b4_stride + (bx >> ss_hor);

+    for (int y = 0; y < cbh4; y++) {

+        for (int x = 0; x < cbw4; x++) {

+            level_cache_ptr[x][2] = filter_level[2][0][0];

+            level_cache_ptr[x][3] = filter_level[3][0][0];

+        }

+        level_cache_ptr += b4_stride;

+    }

     mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, skip, uvtx, auv, luv);

--- a/src/loopfilter.c

+++ b/src/loopfilter.c

@@ -67,7 +67,6 @@

                 fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;

         if (!fm) continue;

         if (wd >= 16) {

@@ -181,6 +180,42 @@

 #undef lf_4_fn

 #undef lf_4_fns

+static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,

+                                   const uint32_t *const vmask,

+                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,

+                                   const Av1FilterLUT *lut, const int w)

+{ // For every set bit of the combined mask, calls loop_filter() on dst; dst advances 4 pixels and l one entry per bit position. `w` is not read by this C version.

+    const unsigned vm = vmask[0] | vmask[1] | vmask[2]; // bit is set if any of the three mask words flags that position

+    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) { // ends once no bit at or above x is set in vm

+        if (vm & x) {

+            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0]; // this entry's level; if zero, the level b4_stride entries earlier

+            if (!L) continue; // both levels are zero: nothing to filter at this position

+            const int H = L >> 4;

+            const int E = lut->e[L], I = lut->i[L]; // per-level limits read from the lookup table

+            const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x); // 2 if flagged in vmask[2], else 1 if flagged in vmask[1], else 0

+            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx); // last argument evaluates to 4, 8 or 16

+        }

+    }

+}

+static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,

+                                    const uint32_t *const vmask,

+                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,

+                                    const Av1FilterLUT *lut, const int w)

+{ // Two-mask-word variant: for every set bit of vmask[0] | vmask[1], calls loop_filter() on dst; dst advances 4 pixels and l one entry per bit position. `w` is not read by this C version.

+    const unsigned vm = vmask[0] | vmask[1]; // bit is set if either mask word flags that position

+    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) { // ends once no bit at or above x is set in vm

+        if (vm & x) {

+            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0]; // this entry's level; if zero, the level b4_stride entries earlier

+            if (!L) continue; // both levels are zero: nothing to filter at this position

+            const int H = L >> 4;

+            const int E = lut->e[L], I = lut->i[L]; // per-level limits read from the lookup table

+            const int idx = !!(vmask[1] & x); // 1 if flagged in vmask[1], else 0

+            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx); // last argument evaluates to 4 or 6

+        }

+    }

+}

 void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {

     c->loop_filter[0][0] = loop_filter_h_4wd_4px_c;

     c->loop_filter[0][1] = loop_filter_v_4wd_4px_c;

@@ -193,4 +228,7 @@

     c->loop_filter_uv[0][1] = loop_filter_v_4wd_4px_c;

     c->loop_filter_uv[1][0] = loop_filter_h_6wd_4px_c;

     c->loop_filter_uv[1][1] = loop_filter_v_6wd_4px_c;

+    c->loop_filter_sb128y = loop_filter_v_sb128y_c;

+    c->loop_filter_sb128uv = loop_filter_v_sb128uv_c;

--- a/src/loopfilter.h

+++ b/src/loopfilter.h

@@ -34,11 +34,18 @@

 #include "common/bitdepth.h"

 #include "src/levels.h"

+#include "src/lf_mask.h"

 #define decl_loopfilter_fn(name) \

 void (name)(pixel *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)

 typedef decl_loopfilter_fn(*loopfilter_fn);

+#define decl_loopfilter_sb_fn(name) \

+void (name)(pixel *dst, ptrdiff_t stride, const uint32_t *mask, \

+            const uint8_t (*lvl)[4], ptrdiff_t lvl_stride, \

+            const Av1FilterLUT *lut, int w)

+typedef decl_loopfilter_sb_fn(*loopfilter_sb_fn); // function-pointer type with the signature declared by decl_loopfilter_sb_fn (takes an edge mask, a level array with its stride, and a limit LUT)

 typedef struct Dav1dLoopFilterDSPContext {

/*

      * dimension 1: filter taps (0=4, 1=8, 2=16 for luma; 0=4, 1=6 for chroma)

@@ -48,6 +55,8 @@

*/

     loopfilter_fn loop_filter[3][2];

     loopfilter_fn loop_filter_uv[2][2];

+    loopfilter_sb_fn loop_filter_sb128y;

+    loopfilter_sb_fn loop_filter_sb128uv;

 } Dav1dLoopFilterDSPContext;

 void dav1d_loop_filter_dsp_init_8bpc(Dav1dLoopFilterDSPContext *c);

--

⑨