shithub: dav1d

Download patch

ref: 128715b5fdd513f9824bef32119285efa44c1d1b
parent: 32c62b5f38e9c935454b95e3aa1730ebd00f87cf
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Wed Oct 24 15:21:46 EDT 2018

Make access into loopfilter masks safe across sb64 tile threads

Fixes #100.

--- a/src/lf_apply.c
+++ b/src/lf_apply.c
@@ -38,7 +38,7 @@
                                        const int have_left,
                                        const uint8_t (*lvl)[4],
                                        const ptrdiff_t b4_stride,
-                                       const uint32_t (*const mask)[3],
+                                       const uint16_t (*const mask)[3][2],
                                        pixel *dst, const ptrdiff_t ls,
                                        const int w,
                                        const int starty4, const int endy4)
@@ -48,12 +48,23 @@
     // filter edges between columns (e.g. block1 | block2)
     for (int x = 0; x < w; x++) {
         if (!have_left && !x) continue;
-        dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls,
-                                     starty4 ? (const uint32_t[4]) {
-                                         mask[x][0] >> starty4,
-                                         mask[x][1] >> starty4,
-                                         mask[x][2] >> starty4,
-                                     } : mask[x],
+        uint32_t hmask[4];
+        if (!starty4) {
+            hmask[0] = mask[x][0][0];
+            hmask[1] = mask[x][1][0];
+            hmask[2] = mask[x][2][0];
+            if (endy4 > 16) {
+                hmask[0] |= mask[x][0][1] << 16;
+                hmask[1] |= mask[x][1][1] << 16;
+                hmask[2] |= mask[x][2][1] << 16;
+            }
+        } else {
+            hmask[0] = mask[x][0][1];
+            hmask[1] = mask[x][1][1];
+            hmask[2] = mask[x][2][1];
+        }
+        hmask[3] = 0;
+        dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls, hmask,
                                      (const uint8_t(*)[4]) &lvl[x][0], b4_stride,
                                      &f->lf.lim_lut, endy4 - starty4);
     }
@@ -63,7 +74,7 @@
                                        const int have_top,
                                        const uint8_t (*lvl)[4],
                                        const ptrdiff_t b4_stride,
-                                       const uint32_t (*const mask)[3],
+                                       const uint16_t (*const mask)[3][2],
                                        pixel *dst, const ptrdiff_t ls,
                                        const int w,
                                        const int starty4, const int endy4)
@@ -77,7 +88,13 @@
          y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
     {
         if (!have_top && !y) continue;
-        dsp->lf.loop_filter_sb[0][1](dst, ls, mask[y],
+        const uint32_t vmask[4] = {
+            mask[y][0][0] | (mask[y][0][1] << 16),
+            mask[y][1][0] | (mask[y][1][1] << 16),
+            mask[y][2][0] | (mask[y][2][1] << 16),
+            0,
+        };
+        dsp->lf.loop_filter_sb[0][1](dst, ls, vmask,
                                      (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
                                      &f->lf.lim_lut, w);
     }
@@ -87,10 +104,11 @@
                                         const int have_left,
                                         const uint8_t (*lvl)[4],
                                         const ptrdiff_t b4_stride,
-                                        const uint32_t (*const mask)[2],
+                                        const uint16_t (*const mask)[2][2],
                                         pixel *const u, pixel *const v,
                                         const ptrdiff_t ls, const int w,
-                                        const int starty4, const int endy4)
+                                        const int starty4, const int endy4,
+                                        const int ss_ver)
 {
     const Dav1dDSPContext *const dsp = f->dsp;
 
@@ -97,18 +115,23 @@
     // filter edges between columns (e.g. block1 | block2)
     for (int x = 0; x < w; x++) {
         if (!have_left && !x) continue;
-        dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls,
-                                     starty4 ? (const uint32_t[3]) {
-                                         mask[x][0] >> starty4,
-                                         mask[x][1] >> starty4,
-                                     } : mask[x],
+        uint32_t hmask[3];
+        if (!starty4) {
+            hmask[0] = mask[x][0][0];
+            hmask[1] = mask[x][1][0];
+            if (endy4 > (16 >> ss_ver)) {
+                hmask[0] |= mask[x][0][1] << (16 >> ss_ver);
+                hmask[1] |= mask[x][1][1] << (16 >> ss_ver);
+            }
+        } else {
+            hmask[0] = mask[x][0][1];
+            hmask[1] = mask[x][1][1];
+        }
+        hmask[2] = 0;
+        dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls, hmask,
                                      (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
                                      &f->lf.lim_lut, endy4 - starty4);
-        dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls,
-                                     starty4 ? (const uint32_t[3]) {
-                                         mask[x][0] >> starty4,
-                                         mask[x][1] >> starty4,
-                                     } : mask[x],
+        dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls, hmask,
                                      (const uint8_t(*)[4]) &lvl[x][3], b4_stride,
                                      &f->lf.lim_lut, endy4 - starty4);
     }
@@ -118,10 +141,11 @@
                                         const int have_top,
                                         const uint8_t (*lvl)[4],
                                         const ptrdiff_t b4_stride,
-                                        const uint32_t (*const mask)[2],
+                                        const uint16_t (*const mask)[2][2],
                                         pixel *const u, pixel *const v,
                                         const ptrdiff_t ls, const int w,
-                                        const int starty4, const int endy4)
+                                        const int starty4, const int endy4,
+                                        const int ss_hor)
 {
     const Dav1dDSPContext *const dsp = f->dsp;
     ptrdiff_t off_l = 0;
@@ -133,10 +157,15 @@
          y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
     {
         if (!have_top && !y) continue;
-        dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, mask[y],
+        const uint32_t vmask[3] = {
+            mask[y][0][0] | (mask[y][0][1] << (16 >> ss_hor)),
+            mask[y][1][0] | (mask[y][1][1] << (16 >> ss_hor)),
+            0,
+        };
+        dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, vmask,
                                      (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
                                      &f->lf.lim_lut, w);
-        dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, mask[y],
+        dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, vmask,
                                      (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
                                      &f->lf.lim_lut, w);
     }
@@ -157,6 +186,8 @@
     const int halign = (f->bh + 31) & ~31;
     const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
+    const unsigned vmax = 1 << vmask, hmax = 1 << hmask;
     const unsigned endy4 = starty4 + imin(hy4 - sby * sbsz, sbsz);
     const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
 
@@ -169,24 +200,29 @@
         const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
         x >>= is_sb64;
 
-        uint32_t *const y_hmask = lflvl[x].filter_y[0][bx4];
+        uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4];
         for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
-            const int idx = 2 * !!(y_hmask[2] & mask) + !!(y_hmask[1] & mask);
-            y_hmask[2] &= ~mask;
-            y_hmask[1] &= ~mask;
-            y_hmask[0] &= ~mask;
-            y_hmask[imin(idx, lpf_y[y - starty4])] |= mask;
+            const int sidx = mask >= 0x10000;
+            const unsigned smask = mask >> (sidx << 4);
+            const int idx = 2 * !!(y_hmask[2][sidx] & smask) +
+                                !!(y_hmask[1][sidx] & smask);
+            y_hmask[2][sidx] &= ~smask;
+            y_hmask[1][sidx] &= ~smask;
+            y_hmask[0][sidx] &= ~smask;
+            y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask;
         }
 
         if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
-            uint32_t *const uv_hmask = lflvl[x].filter_uv[0][cbx4];
+            uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4];
             for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
                  y++, uv_mask <<= 1)
             {
-                const int idx = !!(uv_hmask[1] & uv_mask);
-                uv_hmask[1] &= ~uv_mask;
-                uv_hmask[0] &= ~uv_mask;
-                uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])] |= uv_mask;
+                const int sidx = uv_mask >= vmax;
+                const unsigned smask = uv_mask >> (sidx << (4 - ss_ver));
+                const int idx = !!(uv_hmask[1][sidx] & smask);
+                uv_hmask[1][sidx] &= ~smask;
+                uv_hmask[0][sidx] &= ~smask;
+                uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask;
             }
         }
         lpf_y  += halign;
@@ -199,22 +235,27 @@
         for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)];
              x < f->sb128w; x++, a++)
         {
-            uint32_t *const y_vmask = lflvl[x].filter_y[1][starty4];
+            uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4];
             for (unsigned mask = 1, i = 0; i < 32; mask <<= 1, i++) {
-                const int idx = 2 * !!(y_vmask[2] & mask) + !!(y_vmask[1] & mask);
-                y_vmask[2] &= ~mask;
-                y_vmask[1] &= ~mask;
-                y_vmask[0] &= ~mask;
-                y_vmask[imin(idx, a->tx_lpf_y[i])] |= mask;
+                const int sidx = mask >= 0x10000;
+                const unsigned smask = mask >> (sidx << 4);
+                const int idx = 2 * !!(y_vmask[2][sidx] & smask) +
+                                    !!(y_vmask[1][sidx] & smask);
+                y_vmask[2][sidx] &= ~smask;
+                y_vmask[1][sidx] &= ~smask;
+                y_vmask[0][sidx] &= ~smask;
+                y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask;
             }
 
             if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
-                uint32_t *const uv_vmask = lflvl[x].filter_uv[1][starty4 >> ss_ver];
-                for (unsigned mask = 1, i = 0; i < (32U >> ss_hor); mask <<= 1, i++) {
-                    const int idx = !!(uv_vmask[1] & mask);
-                    uv_vmask[1] &= ~mask;
-                    uv_vmask[0] &= ~mask;
-                    uv_vmask[imin(idx, a->tx_lpf_uv[i])] |= mask;
+                uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
+                for (unsigned uv_mask = 1, i = 0; i < (32U >> ss_hor); uv_mask <<= 1, i++) {
+                    const int sidx = uv_mask >= hmax;
+                    const unsigned smask = uv_mask >> (sidx << (4 - ss_hor));
+                    const int idx = !!(uv_vmask[1][sidx] & smask);
+                    uv_vmask[1][sidx] &= ~smask;
+                    uv_vmask[0][sidx] &= ~smask;
+                    uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask;
                 }
             }
         }
@@ -249,7 +290,7 @@
                              lflvl[x].filter_uv[0],
                              &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
                              (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
-                             starty4 >> ss_ver, uv_endy4);
+                             starty4 >> ss_ver, uv_endy4, ss_ver);
     }
 
     level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
@@ -260,6 +301,6 @@
                              lflvl[x].filter_uv[1],
                              &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
                              (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
-                             starty4 >> ss_ver, uv_endy4);
+                             starty4 >> ss_ver, uv_endy4, ss_hor);
     }
 }
--- a/src/lf_mask.c
+++ b/src/lf_mask.c
@@ -73,7 +73,7 @@
     }
 }
 
-static inline void mask_edges_inter(uint32_t (*const masks)[32][3],
+static inline void mask_edges_inter(uint16_t (*const masks)[32][3][2],
                                     const int by4, const int bx4,
                                     const int w4, const int h4, const int skip,
                                     const enum RectTxfmSize max_tx,
@@ -91,21 +91,29 @@
 
     // left block edge
     unsigned mask = 1U << by4;
-    for (y = 0; y < h4; y++, mask <<= 1)
-        masks[0][bx4][imin(txa[0][0][y][0], l[y])] |= mask;
+    for (y = 0; y < h4; y++, mask <<= 1) {
+        const int sidx = mask >= 0x10000;
+        const unsigned smask = mask >> (sidx << 4);
+        masks[0][bx4][imin(txa[0][0][y][0], l[y])][sidx] |= smask;
+    }
 
     // top block edge
-    for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1)
-        masks[1][by4][imin(txa[1][0][0][x], a[x])] |= mask;
+    for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
+        const int sidx = mask >= 0x10000;
+        const unsigned smask = mask >> (sidx << 4);
+        masks[1][by4][imin(txa[1][0][0][x], a[x])][sidx] |= smask;
+    }
 
     if (!skip) {
         // inner (tx) left|right edges
         for (y = 0, mask = 1U << by4; y < h4; y++, mask <<= 1) {
+            const int sidx = mask >= 0x10000U;
+            const unsigned smask = mask >> (sidx << 4);
             int ltx = txa[0][0][y][0];
             int step = txa[0][1][y][0];
             for (x = step; x < w4; x += step) {
                 const int rtx = txa[0][0][y][x];
-                masks[0][bx4 + x][imin(rtx, ltx)] |= mask;
+                masks[0][bx4 + x][imin(rtx, ltx)][sidx] |= smask;
                 ltx = rtx;
                 step = txa[0][1][y][x];
             }
@@ -115,11 +123,13 @@
         // inner (tx) --- edges
         //           bottom
         for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
+            const int sidx = mask >= 0x10000U;
+            const unsigned smask = mask >> (sidx << 4);
             int ttx = txa[1][0][0][x];
             int step = txa[1][1][0][x];
             for (y = step; y < h4; y += step) {
                 const int btx = txa[1][0][y][x];
-                masks[1][by4 + y][imin(ttx, btx)] |= mask;
+                masks[1][by4 + y][imin(ttx, btx)][sidx] |= smask;
                 ttx = btx;
                 step = txa[1][1][y][x];
             }
@@ -131,7 +141,7 @@
     memcpy(a, txa[1][0][h4 - 1], w4);
 }
 
-static inline void mask_edges_intra(uint32_t (*const masks)[32][3],
+static inline void mask_edges_intra(uint16_t (*const masks)[32][3][2],
                                     const int by4, const int bx4,
                                     const int w4, const int h4,
                                     const enum RectTxfmSize tx,
@@ -144,19 +154,28 @@
 
     // left block edge
     unsigned mask = 1U << by4;
-    for (y = 0; y < h4; y++, mask <<= 1)
-        masks[0][bx4][imin(twl4c, l[y])] |= mask;
+    for (y = 0; y < h4; y++, mask <<= 1) {
+        const int sidx = mask >= 0x10000;
+        const unsigned smask = mask >> (sidx << 4);
+        masks[0][bx4][imin(twl4c, l[y])][sidx] |= smask;
+    }
 
     // top block edge
-    for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1)
-        masks[1][by4][imin(thl4c, a[x])] |= mask;
+    for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
+        const int sidx = mask >= 0x10000;
+        const unsigned smask = mask >> (sidx << 4);
+        masks[1][by4][imin(thl4c, a[x])][sidx] |= smask;
+    }
 
     // inner (tx) left|right edges
     const int hstep = t_dim->w;
     unsigned t = 1U << by4;
     unsigned inner = (((uint64_t) t) << h4) - t;
-    for (x = hstep; x < w4; x += hstep)
-        masks[0][bx4 + x][twl4c] |= inner;
+    unsigned inner1 = inner & 0xffff, inner2 = inner >> 16;
+    for (x = hstep; x < w4; x += hstep) {
+        if (inner1) masks[0][bx4 + x][twl4c][0] |= inner1;
+        if (inner2) masks[0][bx4 + x][twl4c][1] |= inner2;
+    }
 
     //            top
     // inner (tx) --- edges
@@ -164,41 +183,58 @@
     const int vstep = t_dim->h;
     t = 1U << bx4;
     inner = (((uint64_t) t) << w4) - t;
-    for (y = vstep; y < h4; y += vstep)
-        masks[1][by4 + y][thl4c] |= inner;
+    inner1 = inner & 0xffff;
+    inner2 = inner >> 16;
+    for (y = vstep; y < h4; y += vstep) {
+        if (inner1) masks[1][by4 + y][thl4c][0] |= inner1;
+        if (inner2) masks[1][by4 + y][thl4c][1] |= inner2;
+    }
 
     memset(a, thl4c, w4);
     memset(l, twl4c, h4);
 }
 
-static inline void mask_edges_chroma(uint32_t (*const masks)[32][2],
+static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
                                      const int cby4, const int cbx4,
                                      const int cw4, const int ch4,
                                      const int skip_inter,
                                      const enum RectTxfmSize tx,
-                                     uint8_t *const a, uint8_t *const l)
+                                     uint8_t *const a, uint8_t *const l,
+                                     const int ss_hor, const int ss_ver)
 {
     const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
     const int twl4 = t_dim->lw, thl4 = t_dim->lh;
     const int twl4c = !!twl4, thl4c = !!thl4;
     int y, x;
+    const int vbits = 4 - ss_ver, hbits = 4 - ss_hor;
+    const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
+    const unsigned vmax = 1 << vmask, hmax = 1 << hmask;
 
     // left block edge
     unsigned mask = 1U << cby4;
-    for (y = 0; y < ch4; y++, mask <<= 1)
-        masks[0][cbx4][imin(twl4c, l[y])] |= mask;
+    for (y = 0; y < ch4; y++, mask <<= 1) {
+        const int sidx = mask >= vmax;
+        const unsigned smask = mask >> (sidx << vbits);
+        masks[0][cbx4][imin(twl4c, l[y])][sidx] |= smask;
+    }
 
     // top block edge
-    for (x = 0, mask = 1U << cbx4; x < cw4; x++, mask <<= 1)
-        masks[1][cby4][imin(thl4c, a[x])] |= mask;
+    for (x = 0, mask = 1U << cbx4; x < cw4; x++, mask <<= 1) {
+        const int sidx = mask >= hmax;
+        const unsigned smask = mask >> (sidx << hbits);
+        masks[1][cby4][imin(thl4c, a[x])][sidx] |= smask;
+    }
 
     if (!skip_inter) {
         // inner (tx) left|right edges
         const int hstep = t_dim->w;
-        int t = 1U << cby4;
+        unsigned t = 1U << cby4;
         unsigned inner = (((uint64_t) t) << ch4) - t;
-        for (x = hstep; x < cw4; x += hstep)
-            masks[0][cbx4 + x][twl4c] |= inner;
+        unsigned inner1 = inner & ((1 << vmask) - 1), inner2 = inner >> vmask;
+        for (x = hstep; x < cw4; x += hstep) {
+            if (inner1) masks[0][cbx4 + x][twl4c][0] |= inner1;
+            if (inner2) masks[0][cbx4 + x][twl4c][1] |= inner2;
+        }
 
         //            top
         // inner (tx) --- edges
@@ -206,8 +242,11 @@
         const int vstep = t_dim->h;
         t = 1U << cbx4;
         inner = (((uint64_t) t) << cw4) - t;
-        for (y = vstep; y < ch4; y += vstep)
-            masks[1][cby4 + y][thl4c] |= inner;
+        inner1 = inner & ((1 << hmask) - 1), inner2 = inner >> hmask;
+        for (y = vstep; y < ch4; y += vstep) {
+            if (inner1) masks[1][cby4 + y][thl4c][0] |= inner1;
+            if (inner2) masks[1][cby4 + y][thl4c][1] |= inner2;
+        }
     }
 
     memset(a, thl4c, cw4);
@@ -271,7 +310,8 @@
         level_cache_ptr += b4_stride;
     }
 
-    mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, 0, uvtx, auv, luv);
+    mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, 0, uvtx,
+                      auv, luv, ss_hor, ss_ver);
 }
 
 void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
@@ -332,7 +372,8 @@
         level_cache_ptr += b4_stride;
     }
 
-    mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, skip, uvtx, auv, luv);
+    mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, skip, uvtx,
+                      auv, luv, ss_hor, ss_ver);
 }
 
 void dav1d_calc_eih(Av1FilterLUT *const lim_lut, const int filter_sharpness) {
--- a/src/lf_mask.h
+++ b/src/lf_mask.h
@@ -50,8 +50,8 @@
 // each struct describes one 128x128 area (1 or 4 SBs)
 typedef struct Av1Filter {
     // each bit is 1 col
-    uint32_t filter_y[2 /* 0=col, 1=row */][32][3];
-    uint32_t filter_uv[2 /* 0=col, 1=row */][32][2];
+    uint16_t filter_y[2 /* 0=col, 1=row */][32][3][2];
+    uint16_t filter_uv[2 /* 0=col, 1=row */][32][2][2];
     int8_t cdef_idx[4]; // -1 means "unset"
     uint16_t noskip_mask[32][2];
     Av1RestorationUnit lr[3][4];