ref: bf7a4786bf0d097ca7e36e133d17d825ac4552be
parent: 04b70ea56d3258bedef3002ea877cc90277e5ab2
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Fri Oct 5 12:30:50 EDT 2018
Rewrite horizontal loopfilter. Loop inside SIMD (instead of in the caller) so that we can handle multiple 4px blocks per iteration, allowing for more efficient SIMD. To make this easier, also transpose the masks for the horizontal filter.
--- a/src/lf_apply.c
+++ b/src/lf_apply.c
@@ -34,14 +34,6 @@
#include "src/lf_apply.h"
-static inline int maxifzero(const uint8_t (*const a)[4],
- const uint8_t (*const b)[4], const int diridx)
-{
- const int a_val = (*a)[diridx];
- if (a_val) return a_val;
- return (*b)[diridx];
-}
-
static inline void filter_plane_cols_y(const Dav1dFrameContext *const f,
const int have_left,
const uint8_t (*lvl)[4],
@@ -48,30 +40,22 @@
const ptrdiff_t b4_stride,
const uint32_t (*const mask)[3],
pixel *dst, const ptrdiff_t ls,
+ const int w,
const int starty4, const int endy4)
{
const Dav1dDSPContext *const dsp = f->dsp;
// filter edges between columns (e.g. block1 | block2)
- for (int y = starty4; y < endy4;
- y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
- {
- pixel *ptr = dst;
- const uint8_t (*l)[4] = lvl;
- const uint32_t *const hmask = mask[y];
- const unsigned hm = hmask[0] | hmask[1] | hmask[2];
-
- for (unsigned x = 1; hm & ~(x - 1); l++, x <<= 1, ptr += 4) {
- if ((have_left || x > 1) && (hm & x)) {
- const int L = maxifzero(l, &l[-1], 0);
- if (!L) continue;
- const int H = L >> 4;
- const int E = f->lf.lim_lut.e[L], I = f->lf.lim_lut.i[L];
- const int idx = (hmask[2] & x) ? 2 : !!(hmask[1] & x);
-
- dsp->lf.loop_filter[idx][0](ptr, ls, E, I, H);
- }
- }
+ for (int x = 0; x < w; x++) {
+ if (!have_left && !x) continue;
+ dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls,
+ starty4 ? (const uint32_t[3]) {
+ mask[x][0] >> starty4,
+ mask[x][1] >> starty4,
+ mask[x][2] >> starty4,
+ } : mask[x],
+ (const uint8_t(*)[4]) &lvl[x][0], b4_stride,
+ &f->lf.lim_lut, endy4 - starty4);
}
}
@@ -93,9 +77,9 @@
y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
{
if (!have_top && !y) continue;
- dsp->lf.loop_filter_sb128y(dst, ls, mask[y],
- (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
- &f->lf.lim_lut, w);
+ dsp->lf.loop_filter_sb[0][1](dst, ls, mask[y],
+ (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
+ &f->lf.lim_lut, w);
}
}
@@ -105,45 +89,28 @@
const ptrdiff_t b4_stride,
const uint32_t (*const mask)[2],
pixel *const u, pixel *const v,
- const ptrdiff_t ls,
+ const ptrdiff_t ls, const int w,
const int starty4, const int endy4)
{
const Dav1dDSPContext *const dsp = f->dsp;
- int y;
- ptrdiff_t off_l;
- const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
// filter edges between columns (e.g. block1 | block2)
- for (off_l = 0, y = starty4; y < endy4;
- y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
- {
- ptrdiff_t off = off_l;
- const uint8_t (*l)[4] = lvl;
- const uint32_t *const hmask = mask[y];
- const unsigned hm = hmask[0] | hmask[1];
-
- for (unsigned x = 1; hm & ~(x - 1); l++, x <<= 1, off += 4) {
- if ((have_left || x > 1) && (hm & x)) {
- const int idx = !!(hmask[1] & x);
-
- const int Lu = maxifzero(l, &l[-1], 2);
- if (Lu) {
- const int H = Lu >> 4;
- const int E = f->lf.lim_lut.e[Lu], I = f->lf.lim_lut.i[Lu];
-
- dsp->lf.loop_filter_uv[idx][0](&u[off], ls, E, I, H);
- }
-
- const int Lv = maxifzero(l, &l[-1], 3);
- if (Lv) {
- const int H = Lv >> 4;
- const int E = f->lf.lim_lut.e[Lv], I = f->lf.lim_lut.i[Lv];
-
- dsp->lf.loop_filter_uv[idx][0](&v[off], ls, E, I, H);
- }
- }
- }
+ for (int x = 0; x < w; x++) {
+ if (!have_left && !x) continue;
+ dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls,
+ starty4 ? (const uint32_t[2]) {
+ mask[x][0] >> starty4,
+ mask[x][1] >> starty4,
+ } : mask[x],
+ (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
+ &f->lf.lim_lut, endy4 - starty4);
+ dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls,
+ starty4 ? (const uint32_t[2]) {
+ mask[x][0] >> starty4,
+ mask[x][1] >> starty4,
+ } : mask[x],
+ (const uint8_t(*)[4]) &lvl[x][3], b4_stride,
+ &f->lf.lim_lut, endy4 - starty4);
}
}
@@ -167,12 +134,12 @@
y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
{
if (!have_top && !y) continue;
- dsp->lf.loop_filter_sb128uv(&u[off_l], ls, mask[y],
- (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
- &f->lf.lim_lut, w);
- dsp->lf.loop_filter_sb128uv(&v[off_l], ls, mask[y],
- (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
- &f->lf.lim_lut, w);
+ dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, mask[y],
+ (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
+ &f->lf.lim_lut, w);
+ dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, mask[y],
+ (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
+ &f->lf.lim_lut, w);
}
}
@@ -200,22 +167,23 @@
for (int tile_col = 1;; tile_col++) {
x = f->frame_hdr.tiling.col_start_sb[tile_col];
if ((x << sbl2) >= f->bw) break;
- const int mask = x & is_sb64 ? 1 << 16 : 1;
- const int uv_mask = x & is_sb64 ? 1 << (16 >> ss_hor) : 1;
+ const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
x >>= is_sb64;
- for (int y = starty4; y < endy4; y++) {
- const int idx = 2 * !!(lflvl[x].filter_y[0][y][2] & mask) +
- !!(lflvl[x].filter_y[0][y][1] & mask);
- lflvl[x].filter_y[0][y][2] &= ~mask;
- lflvl[x].filter_y[0][y][1] &= ~mask;
- lflvl[x].filter_y[0][y][0] &= ~mask;
- lflvl[x].filter_y[0][y][imin(idx, lpf_y[y - starty4])] |= mask;
+ for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
+ const int idx = 2 * !!(lflvl[x].filter_y[0][bx4][2] & mask) +
+ !!(lflvl[x].filter_y[0][bx4][1] & mask);
+ lflvl[x].filter_y[0][bx4][2] &= ~mask;
+ lflvl[x].filter_y[0][bx4][1] &= ~mask;
+ lflvl[x].filter_y[0][bx4][0] &= ~mask;
+ lflvl[x].filter_y[0][bx4][imin(idx, lpf_y[y - starty4])] |= mask;
}
- for (int y = starty4 >> ss_ver; y < uv_endy4; y++) {
- const int idx = !!(lflvl[x].filter_uv[0][y][1] & uv_mask);
- lflvl[x].filter_uv[0][y][1] &= ~uv_mask;
- lflvl[x].filter_uv[0][y][0] &= ~uv_mask;
- lflvl[x].filter_uv[0][y][imin(idx, lpf_uv[y - (starty4 >> ss_ver)])] |= uv_mask;
+ for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
+ y++, uv_mask <<= 1)
+ {
+ const int idx = !!(lflvl[x].filter_uv[0][cbx4][1] & uv_mask);
+ lflvl[x].filter_uv[0][cbx4][1] &= ~uv_mask;
+ lflvl[x].filter_uv[0][cbx4][0] &= ~uv_mask;
+ lflvl[x].filter_uv[0][cbx4][imin(idx, lpf_uv[y - (starty4 >> ss_ver)])] |= uv_mask;
}
lpf_y += halign;
lpf_uv += halign >> ss_ver;
@@ -257,8 +225,8 @@
x++, have_left = 1, ptr += 128, level_ptr += 32)
{
filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
- lflvl[x].filter_y[0],
- ptr, f->cur.p.stride[0], starty4, endy4);
+ lflvl[x].filter_y[0], ptr, f->cur.p.stride[0],
+ imin(32, f->bw - x * 32), starty4, endy4);
}
level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
@@ -279,6 +247,7 @@
filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
lflvl[x].filter_uv[0],
&p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
+ (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
starty4 >> ss_ver, uv_endy4);
}
--- a/src/lf_mask.c
+++ b/src/lf_mask.c
@@ -98,30 +98,24 @@
max_tx, 0, y_off, x_off, tx_masks);
// left block edge
- unsigned mask = 1U << bx4;
- for (y = 0; y < h4; y++)
- masks[0][by4 + y][imin(txa[0][0][y][0], l[y])] |= mask;
+ unsigned mask = 1U << by4;
+ for (y = 0; y < h4; y++, mask <<= 1)
+ masks[0][bx4][imin(txa[0][0][y][0], l[y])] |= mask;
// top block edge
- for (x = 0; x < w4; x++, mask <<= 1)
+ for (mask = 1U << bx4, x = 0; x < w4; x++, mask <<= 1)
masks[1][by4][imin(txa[1][0][0][x], a[x])] |= mask;
if (!skip) {
// inner (tx) left|right edges
- for (y = 0; y < h4; y++) {
+ for (y = 0, mask = 1U << by4; y < h4; y++, mask <<= 1) {
int ltx = txa[0][0][y][0];
int step = txa[0][1][y][0];
- if (step < w4) {
- x = step;
- mask = 1U << (bx4 + step);
- do {
- const int rtx = txa[0][0][y][x];
- masks[0][by4 + y][imin(rtx, ltx)] |= mask;
- ltx = rtx;
- step = txa[0][1][y][x];
- x += step;
- mask <<= step;
- } while (x < w4);
+ for (x = step; x < w4; x += step) {
+ const int rtx = txa[0][0][y][x];
+ masks[0][bx4 + x][imin(rtx, ltx)] |= mask;
+ ltx = rtx;
+ step = txa[0][1][y][x];
}
}
@@ -157,29 +151,27 @@
int y, x;
// left block edge
- unsigned mask = 1U << bx4;
- for (y = 0; y < h4; y++)
- masks[0][by4 + y][imin(twl4c, l[y])] |= mask;
+ unsigned mask = 1U << by4;
+ for (y = 0; y < h4; y++, mask <<= 1)
+ masks[0][bx4][imin(twl4c, l[y])] |= mask;
// top block edge
for (mask = 1U << bx4, x = 0; x < w4; x++, mask <<= 1)
masks[1][by4][imin(thl4c, a[x])] |= mask;
- static const uint32_t hstep[] = {
- 0xffffffff, 0x55555555, 0x11111111, 0x01010101, 0x00010001
- };
-
// inner (tx) left|right edges
- const unsigned t = 1U << bx4;
- const unsigned inner = (((uint64_t) t) << w4) - t;
- mask = (inner - t) & hstep[twl4];
- for (y = 0; y < h4; y++)
- masks[0][by4 + y][twl4c] |= mask;
+ const int hstep = t_dim->w;
+ unsigned t = 1U << by4;
+ unsigned inner = (((uint64_t) t) << h4) - t;
+ for (x = hstep; x < w4; x += hstep)
+ masks[0][bx4 + x][twl4c] |= inner;
// top
// inner (tx) --- edges
// bottom
const int vstep = t_dim->h;
+ t = 1U << bx4;
+ inner = (((uint64_t) t) << w4) - t;
for (y = vstep; y < h4; y += vstep)
masks[1][by4 + y][thl4c] |= inner;
@@ -200,9 +192,9 @@
int y, x;
// left block edge
- unsigned mask = 1U << cbx4;
- for (y = 0; y < ch4; y++)
- masks[0][cby4 + y][imin(twl4c, l[y])] |= mask;
+ unsigned mask = 1U << cby4;
+ for (y = 0; y < ch4; y++, mask <<= 1)
+ masks[0][cbx4][imin(twl4c, l[y])] |= mask;
// top block edge
for (mask = 1U << cbx4, x = 0; x < cw4; x++, mask <<= 1)
@@ -209,21 +201,19 @@
masks[1][cby4][imin(thl4c, a[x])] |= mask;
if (!skip_inter) {
- static const uint32_t hstep[] = {
- 0xffffffff, 0x55555555, 0x11111111, 0x01010101
- };
-
// inner (tx) left|right edges
- const int t = 1U << cbx4;
- const unsigned inner = (((uint64_t) t) << cw4) - t;
- mask = (inner - t) & hstep[twl4];
- for (y = 0; y < ch4; y++)
- masks[0][cby4 + y][twl4c] |= mask;
+ const int hstep = t_dim->w;
+ int t = 1U << cby4;
+ unsigned inner = (((uint64_t) t) << ch4) - t;
+ for (x = hstep; x < cw4; x += hstep)
+ masks[0][cbx4 + x][twl4c] |= inner;
// top
// inner (tx) --- edges
// bottom
const int vstep = t_dim->h;
+ t = 1U << cbx4;
+ inner = (((uint64_t) t) << cw4) - t;
for (y = vstep; y < ch4; y += vstep)
masks[1][cby4 + y][thl4c] |= inner;
}
--- a/src/loopfilter.c
+++ b/src/loopfilter.c
@@ -159,27 +159,26 @@
}
}
-#define lf_4_fn(dir, wd, stridea, strideb) \
-static void loop_filter_##dir##_##wd##wd_4px_c(pixel *const dst, \
- const ptrdiff_t stride, \
- const int E, const int I, \
- const int H) \
-{ \
- loop_filter(dst, E, I, H, stridea, strideb, wd); \
+static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,
+ const uint32_t *const vmask,
+ const uint8_t (*l)[4], ptrdiff_t b4_stride,
+ const Av1FilterLUT *lut, const int h)
+{
+ const unsigned vm = (vmask[0] | vmask[1] | vmask[2]) & ((1ULL << h) - 1);
+ for (unsigned y = 1; vm & ~(y - 1);
+ y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
+ {
+ if (vm & y) {
+ const int L = l[0][0] ? l[0][0] : l[-1][0];
+ if (!L) continue;
+ const int H = L >> 4;
+ const int E = lut->e[L], I = lut->i[L];
+ const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y);
+ loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx);
+ }
+ }
}
-#define lf_4_fns(wd) \
-lf_4_fn(h, wd, PXSTRIDE(stride), 1) \
-lf_4_fn(v, wd, 1, PXSTRIDE(stride))
-
-lf_4_fns(4)
-lf_4_fns(6)
-lf_4_fns(8)
-lf_4_fns(16)
-
-#undef lf_4_fn
-#undef lf_4_fns
-
static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
const uint32_t *const vmask,
const uint8_t (*l)[4], ptrdiff_t b4_stride,
@@ -198,6 +197,26 @@
}
}
+static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,
+ const uint32_t *const vmask,
+ const uint8_t (*l)[4], ptrdiff_t b4_stride,
+ const Av1FilterLUT *lut, const int h)
+{
+ const unsigned vm = (vmask[0] | vmask[1]) & ((1ULL << h) - 1);
+ for (unsigned y = 1; vm & ~(y - 1);
+ y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
+ {
+ if (vm & y) {
+ const int L = l[0][0] ? l[0][0] : l[-1][0];
+ if (!L) continue;
+ const int H = L >> 4;
+ const int E = lut->e[L], I = lut->i[L];
+ const int idx = !!(vmask[1] & y);
+ loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx);
+ }
+ }
+}
+
static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
const uint32_t *const vmask,
const uint8_t (*l)[4], ptrdiff_t b4_stride,
@@ -217,20 +236,10 @@
}
void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
- c->loop_filter[0][0] = loop_filter_h_4wd_4px_c;
- c->loop_filter[0][1] = loop_filter_v_4wd_4px_c;
- c->loop_filter[1][0] = loop_filter_h_8wd_4px_c;
- c->loop_filter[1][1] = loop_filter_v_8wd_4px_c;
- c->loop_filter[2][0] = loop_filter_h_16wd_4px_c;
- c->loop_filter[2][1] = loop_filter_v_16wd_4px_c;
-
- c->loop_filter_uv[0][0] = loop_filter_h_4wd_4px_c;
- c->loop_filter_uv[0][1] = loop_filter_v_4wd_4px_c;
- c->loop_filter_uv[1][0] = loop_filter_h_6wd_4px_c;
- c->loop_filter_uv[1][1] = loop_filter_v_6wd_4px_c;
-
- c->loop_filter_sb128y = loop_filter_v_sb128y_c;
- c->loop_filter_sb128uv = loop_filter_v_sb128uv_c;
+ c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
+ c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
+ c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
+ c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
#if HAVE_ASM && ARCH_X86
bitfn(dav1d_loop_filter_dsp_init_x86)(c);
--- a/src/loopfilter.h
+++ b/src/loopfilter.h
@@ -36,10 +36,6 @@
#include "src/levels.h"
#include "src/lf_mask.h"
-#define decl_loopfilter_fn(name) \
-void (name)(pixel *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
-typedef decl_loopfilter_fn(*loopfilter_fn);
-
#define decl_loopfilter_sb_fn(name) \
void (name)(pixel *dst, ptrdiff_t stride, const uint32_t *mask, \
const uint8_t (*lvl)[4], ptrdiff_t lvl_stride, \
@@ -48,15 +44,12 @@
typedef struct Dav1dLoopFilterDSPContext {
/*
- * dimension 1: filter taps (0=4, 1=8, 2=16 for luma; 0=4, 1=6 for chroma)
+ * dimension 1: plane (0=luma, 1=chroma)
* dimension 2: 0=col-edge filter (h), 1=row-edge filter (v)
*
- * dst/stride are aligned by 4
+ * dst/stride are aligned by 32
*/
- loopfilter_fn loop_filter[3][2];
- loopfilter_fn loop_filter_uv[2][2];
- loopfilter_sb_fn loop_filter_sb128y;
- loopfilter_sb_fn loop_filter_sb128uv;
+ loopfilter_sb_fn loop_filter_sb[2][2];
} Dav1dLoopFilterDSPContext;
void dav1d_loop_filter_dsp_init_8bpc(Dav1dLoopFilterDSPContext *c);
--- a/src/x86/loopfilter_init.c
+++ b/src/x86/loopfilter_init.c
@@ -37,7 +37,7 @@
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64
- c->loop_filter_sb128y = dav1d_lpf_v_sb128y_avx2;
- c->loop_filter_sb128uv = dav1d_lpf_v_sb128uv_avx2;
+ c->loop_filter_sb[0][1] = dav1d_lpf_v_sb128y_avx2;
+ c->loop_filter_sb[1][1] = dav1d_lpf_v_sb128uv_avx2;
#endif
}
--- a/tests/checkasm/loopfilter.c
+++ b/tests/checkasm/loopfilter.c
@@ -161,6 +161,6 @@
bitfn(dav1d_loop_filter_dsp_init)(&c);
- check_lpf_sb(c.loop_filter_sb128y, "lpf_v_sb128y", 3, 32, 1);
- check_lpf_sb(c.loop_filter_sb128uv, "lpf_v_sb128uv", 2, 16, 2);
+ check_lpf_sb(c.loop_filter_sb[0][1], "lpf_v_sb128y", 3, 32, 1);
+ check_lpf_sb(c.loop_filter_sb[1][1], "lpf_v_sb128uv", 2, 16, 2);
}