shithub: dav1d

--- a/src/decode.c

+++ b/src/decode.c

@@ -42,6 +42,7 @@

 #include "src/decode.h"

 #include "src/dequant_tables.h"

 #include "src/env.h"

+#include "src/film_grain.h"

 #include "src/log.h"

 #include "src/qm.h"

 #include "src/recon.h"

@@ -3190,6 +3191,7 @@

             dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \

             dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr); \

             dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \

+            dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \

             break

 #if CONFIG_8BPC

         case 8:

--- /dev/null

+++ b/src/fg_apply.h

@@ -1,0 +1,41 @@

+/*

+ * Copyright © 2018, VideoLAN and dav1d authors

+ * Copyright © 2018, Two Orioles, LLC

+ * All rights reserved.

+ *

+ * Redistribution and use in source and binary forms, with or without

+ * modification, are permitted provided that the following conditions are met:

+ *

+ * 1. Redistributions of source code must retain the above copyright notice, this

+ *    list of conditions and the following disclaimer.

+ *

+ * 2. Redistributions in binary form must reproduce the above copyright notice,

+ *    this list of conditions and the following disclaimer in the documentation

+ *    and/or other materials provided with the distribution.

+ *

+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR

+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ */

+#ifndef DAV1D_SRC_FG_APPLY_H

+#define DAV1D_SRC_FG_APPLY_H

+#include "dav1d/picture.h"

+#include "common/bitdepth.h"

+#include "src/film_grain.h"

+bitfn_decls(void dav1d_apply_grain, const Dav1dFilmGrainDSPContext *const dsp, // applies film grain to `in`, storing the result in `out`; dsp: film grain function table

+                                    Dav1dPicture *const out, // destination picture; its frame header supplies the film grain parameters

+                                    const Dav1dPicture *const in); // source picture; only read

+#endif /* DAV1D_SRC_FG_APPLY_H */

--- /dev/null

+++ b/src/fg_apply_tmpl.c

@@ -1,0 +1,176 @@

+/*

+ * Copyright © 2018, Niklas Haas

+ * Copyright © 2018, VideoLAN and dav1d authors

+ * Copyright © 2018, Two Orioles, LLC

+ * All rights reserved.

+ *

+ * Redistribution and use in source and binary forms, with or without

+ * modification, are permitted provided that the following conditions are met:

+ *

+ * 1. Redistributions of source code must retain the above copyright notice, this

+ *    list of conditions and the following disclaimer.

+ *

+ * 2. Redistributions in binary form must reproduce the above copyright notice,

+ *    this list of conditions and the following disclaimer in the documentation

+ *    and/or other materials provided with the distribution.

+ *

+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR

+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ */

+#include "config.h"

+#include <stdint.h>

+#include "dav1d/picture.h"

+#include "common.h"

+#include "common/intops.h"

+#include "common/bitdepth.h"

+#include "fg_apply.h"

+/*

+ * Builds the scaling lookup table `scaling` from `num` control points.

+ *

+ * bitdepth: pixel bitdepth; (1 << bitdepth) table entries are written

+ * points:   control points as { x, y } pairs; x is on an 8-bit scale and is

+ *           shifted up by (bitdepth - 8) to form the table index

+ * num:      number of valid entries in `points`

+ * scaling:  output table

+ *

+ * The table is constant before the first and after the last point and

+ * linearly interpolated in between. With no points the table is zeroed.

+ * Neighbouring points must differ in x because their distance is used as a

+ * divisor (assumed to be guaranteed by whoever filled `points`).

+ */

+static void generate_scaling(const int bitdepth,

+                             const uint8_t points[][2], const int num,

+                             uint8_t scaling[SCALING_SIZE])

+{

+    const int shift_x = bitdepth - 8;

+    const int scaling_size = 1 << bitdepth;

+    const int pad = 1 << shift_x;

+    // Without control points, points[num - 1] below would be read out of

+    // bounds; produce an all-zero table instead

+    if (num <= 0) {

+        memset(scaling, 0, scaling_size);

+        return;

+    }

+    // Fill up the preceding entries with the initial value

+    for (int i = 0; i < points[0][0] << shift_x; i++)

+        scaling[i] = points[0][1];

+    // Linearly interpolate the values in the middle; only every pad-th

+    // entry is written here, the gaps are filled by the second pass

+    for (int i = 0; i < num - 1; i++) {

+        const int bx = points[i][0];

+        const int by = points[i][1];

+        const int ex = points[i+1][0];

+        const int ey = points[i+1][1];

+        const int dx = ex - bx;

+        const int dy = ey - by;

+        // Slope in 16.16 fixed point, with the reciprocal of dx rounded

+        const int delta = dy * ((0x10000 + (dx >> 1)) / dx);

+        for (int x = 0; x < dx; x++) {

+            const int v = by + ((x * delta + 0x8000) >> 16);

+            scaling[(bx + x) << shift_x] = v;

+        }

+    }

+    // Fill up the remaining entries with the final value

+    for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)

+        scaling[i] = points[num - 1][1];

+    // 8-bit: every entry has been written already

+    if (pad <= 1) return;

+    // High bitdepth: interpolate the pad - 1 entries between each pair of

+    // neighbouring coarse samples, rounding to nearest

+    const int rnd = pad >> 1;

+    for (int i = 0; i < num - 1; i++) {

+        const int bx = points[i][0] << shift_x;

+        const int ex = points[i+1][0] << shift_x;

+        const int dx = ex - bx;

+        for (int x = 0; x < dx; x += pad) {

+            const int range = scaling[bx + x + pad] - scaling[bx + x];

+            for (int n = 1; n < pad; n++) {

+                scaling[bx + x + n] = scaling[bx + x] + ((range * n + rnd) >> shift_x);

+            }

+        }

+    }

+}

+#ifndef UNIT_TEST

+/*

+ * Applies film grain to `in` and stores the result in `out`.

+ *

+ * dsp: grain generation / application functions to use

+ * out: destination picture; its frame header carries the film grain

+ *      parameters and its sequence header the matrix coefficients

+ * in:  source picture; its plane strides must equal those of `out`

+ *      (asserted)

+ *

+ * Planes that receive no grain are copied verbatim from `in`.

+ */

+void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,

+                              Dav1dPicture *const out,

+                              const Dav1dPicture *const in)

+{

+    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;

+    entry grain_lut[3][GRAIN_HEIGHT][GRAIN_WIDTH];

+    uint8_t scaling[3][SCALING_SIZE];

+#if BITDEPTH != 8

+    // Not referenced by name below; presumably consumed by the

+    // HIGHBD_TAIL_SUFFIX macro (defined outside this file)

+    const int bitdepth_max = (1 << out->p.bpc) - 1;

+#endif

+    // 1 if chroma is vertically subsampled (4:2:0), 0 otherwise

+    const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;

+    // Generate grain LUTs as needed

+    // NOTE(review): the chroma tables are indexed by layout - 1; this

+    // assumes the chroma conditions are never true for I400 - confirm

+    // against the header parser

+    dsp->generate_grain_y(grain_lut[0], data HIGHBD_TAIL_SUFFIX); // always needed

+    if (data->num_uv_points[0] || data->chroma_scaling_from_luma)

+        dsp->generate_grain_uv[in->p.layout - 1](grain_lut[1], grain_lut[0],

+                                                 data, 0 HIGHBD_TAIL_SUFFIX);

+    if (data->num_uv_points[1] || data->chroma_scaling_from_luma)

+        dsp->generate_grain_uv[in->p.layout - 1](grain_lut[2], grain_lut[0],

+                                                 data, 1 HIGHBD_TAIL_SUFFIX);

+    // Generate scaling LUTs as needed

+    if (data->num_y_points) {

+        generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);

+    } else if (data->chroma_scaling_from_luma) {

+        // The chroma path below reads scaling[0]; with no luma points it

+        // must be all zeros rather than uninitialized stack memory

+        memset(scaling[0], 0, sizeof(scaling[0]));

+    }

+    if (data->num_uv_points[0])

+        generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);

+    if (data->num_uv_points[1])

+        generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);

+    // Copy over the non-modified planes

+    // TODO: eliminate in favor of per-plane refs

+    assert(out->stride[0] == in->stride[0]);

+    if (!data->num_y_points) {

+        memcpy(out->data[0], in->data[0], out->p.h * out->stride[0]);

+    }

+    if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400) {

+        assert(out->stride[1] == in->stride[1]);

+        // Round the chroma height up: with 4:2:0 and an odd luma height a

+        // plain shift would leave the last chroma row uncopied. This is the

+        // same rounding used for the per-strip chroma height below.

+        const int uv_h = (out->p.h + ss_y) >> ss_y;

+        for (int i = 0; i < 2; i++) {

+            if (!data->num_uv_points[i] && !data->chroma_scaling_from_luma) {

+                memcpy(out->data[1+i], in->data[1+i], uv_h * out->stride[1]);

+            }

+        }

+    }

+    // Synthesize grain for the affected planes, one strip of BLOCK_SIZE

+    // luma rows at a time

+    const int rows = (out->p.h + 31) >> 5;

+    const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;

+    for (int row = 0; row < rows; row++) {

+        const pixel *const luma_src =

+            ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);

+        // Luma rows in this strip (the last strip may be shorter)

+        const int bh_y = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);

+        if (data->num_y_points) {

+            dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),

+                             luma_src, out->stride[0], data,

+                             out->p.w, scaling[0], grain_lut[0], bh_y, row HIGHBD_TAIL_SUFFIX);

+        }

+        // Chroma rows in this strip, rounded up when vertically subsampled

+        const int bh = (bh_y + ss_y) >> ss_y;

+        const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;

+        if (data->chroma_scaling_from_luma) {

+            // Both chroma planes use the luma scaling table

+            for (int pl = 0; pl < 2; pl++) {

+                dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,

+                                                    ((const pixel *) in->data[1 + pl]) + uv_off,

+                                                    in->stride[1], luma_src,

+                                                    in->stride[0], out->p.w, bh,

+                                                    data,

+                                                    grain_lut[1 + pl], scaling[0],

+                                                    pl, row, is_id HIGHBD_TAIL_SUFFIX);

+            }

+        } else {

+            // Each chroma plane uses its own table, if it has any points

+            for (int pl = 0; pl < 2; pl++) {

+                if (!data->num_uv_points[pl]) continue;

+                dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,

+                                                    ((const pixel *) in->data[1 + pl]) + uv_off,

+                                                    in->stride[1], luma_src,

+                                                    in->stride[0], out->p.w, bh,

+                                                    data,

+                                                    grain_lut[1 + pl],

+                                                    scaling[1 + pl], pl, row, is_id

+                                                    HIGHBD_TAIL_SUFFIX);

+            }

+        }

+    }

+}

+#endif

--- a/src/film_grain.h

+++ b/src/film_grain.h

@@ -28,9 +28,57 @@

 #ifndef DAV1D_SRC_FILM_GRAIN_H

 #define DAV1D_SRC_FILM_GRAIN_H

-#include "dav1d/dav1d.h"

+#include "common/bitdepth.h"

-bitfn_decls(void dav1d_apply_grain, Dav1dPicture *const out,

-                                    const Dav1dPicture *const in);

+#include "src/levels.h"

+#define GRAIN_WIDTH 82 // columns of a grain LUT (entry buf[GRAIN_HEIGHT][GRAIN_WIDTH])

+#define GRAIN_HEIGHT 73 // rows of a grain LUT

+#define BLOCK_SIZE 32 // luma rows per strip and luma columns per block when grain is applied

+#if !defined(BITDEPTH) || BITDEPTH == 8 // also taken when BITDEPTH is not defined at all

+#define SCALING_SIZE 256 // one scaling entry per 8-bit pixel value

+typedef int8_t entry; // grain sample type in this configuration

+#else

+#define SCALING_SIZE 4096 // 1 << 12 scaling entries

+typedef int16_t entry; // grain sample type in this configuration

+#endif

+#define decl_generate_grain_y_fn(name) /* signature of a luma grain LUT generator writing into buf */ \

+void (name)(entry buf[GRAIN_HEIGHT][GRAIN_WIDTH], \

+            const Dav1dFilmGrainData *const data HIGHBD_DECL_SUFFIX)

+typedef decl_generate_grain_y_fn(*generate_grain_y_fn); // pointer type for the signature above

+#define decl_generate_grain_uv_fn(name) /* signature of a chroma grain LUT generator; buf_y is the luma LUT, uv the chroma plane (0 or 1) */ \

+void (name)(entry buf[GRAIN_HEIGHT][GRAIN_WIDTH], \

+            const entry buf_y[GRAIN_HEIGHT][GRAIN_WIDTH], \

+            const Dav1dFilmGrainData *const data, const int uv HIGHBD_DECL_SUFFIX)

+typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn); // pointer type for the signature above

+#define decl_fgy_32x32xn_fn(name) /* signature for applying luma grain to one strip: bh rows of pw pixels, row_num is the strip index */ \

+void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \

+            const Dav1dFilmGrainData *data, \

+            int pw, const uint8_t scaling[SCALING_SIZE], \

+            const entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH], \

+            int bh, int row_num HIGHBD_DECL_SUFFIX)

+typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn); // pointer type for the signature above

+#define decl_fguv_32x32xn_fn(name) /* signature for applying chroma grain to one strip; pw is the luma width, bh the chroma row count */ \

+void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \

+            const pixel *luma_row, ptrdiff_t luma_stride, int pw, int bh, \

+            const Dav1dFilmGrainData *data, \

+            const entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH], \

+            const uint8_t scaling[SCALING_SIZE], \

+            int uv_pl, int row_num, int is_id HIGHBD_DECL_SUFFIX)

+typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn); // pointer type for the signature above

+typedef struct Dav1dFilmGrainDSPContext { // film grain function table, filled in by dav1d_film_grain_dsp_init()

+    generate_grain_y_fn generate_grain_y; // luma grain LUT generator

+    generate_grain_uv_fn generate_grain_uv[3]; // chroma grain LUT generators, indexed by DAV1D_PIXEL_LAYOUT_* - 1

+    fgy_32x32xn_fn fgy_32x32xn; // applies luma grain to one strip of rows

+    fguv_32x32xn_fn fguv_32x32xn[3]; // applies chroma grain to one strip, indexed by DAV1D_PIXEL_LAYOUT_* - 1

+} Dav1dFilmGrainDSPContext;

+bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c);

 #endif /* DAV1D_SRC_FILM_GRAIN_H */

--- a/src/film_grain_tmpl.c

+++ b/src/film_grain_tmpl.c

@@ -26,38 +26,16 @@

  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

-#include "config.h"

-#include <stdint.h>

-#include "common.h"

+#include "common/attributes.h"

 #include "common/intops.h"

-#include "common/bitdepth.h"

-#include "tables.h"

 #include "film_grain.h"

+#include "tables.h"

-#if BITDEPTH == 8

-typedef int8_t entry;

-#else

-typedef int16_t entry;

-#endif

+#define SUB_GRAIN_WIDTH 44

+#define SUB_GRAIN_HEIGHT 38

-enum {

-    GRAIN_WIDTH  = 82,

-    GRAIN_HEIGHT = 73,

-    SUB_GRAIN_WIDTH = 44,

-    SUB_GRAIN_HEIGHT = 38,

-    SUB_GRAIN_OFFSET = 6,

-    BLOCK_SIZE = 32,

-#if BITDEPTH == 8

-    SCALING_SIZE = 256

-#else

-    SCALING_SIZE = 4096

-#endif

-};

-static inline int get_random_number(const int bits, unsigned *state) {

+static inline int get_random_number(const int bits, unsigned *const state) {

     const int r = *state;

     unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;

     *state = (r >> 1) | (bit << 15);

@@ -69,13 +47,14 @@

     return (x + ((1 << shift) >> 1)) >> shift;

-static void generate_grain_y(const Dav1dPicture *const in,

-                             entry buf[GRAIN_HEIGHT][GRAIN_WIDTH])

+static void generate_grain_y_c(entry buf[GRAIN_HEIGHT][GRAIN_WIDTH],

+                               const Dav1dFilmGrainData *const data

+                               HIGHBD_DECL_SUFFIX)

-    const Dav1dFilmGrainData *data = &in->frame_hdr->film_grain.data;

+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;

     unsigned seed = data->seed;

-    const int shift = 12 - in->p.bpc + data->grain_scale_shift;

-    const int grain_ctr = 128 << (in->p.bpc - 8);

+    const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;

+    const int grain_ctr = 128 << bitdepth_min_8;

     const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;

     for (int y = 0; y < GRAIN_HEIGHT; y++) {

@@ -100,25 +79,24 @@

-            int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);

+            const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);

             buf[y][x] = iclip(grain, grain_min, grain_max);

-static void generate_grain_uv(const Dav1dPicture *const in, int uv,

-                              entry buf[GRAIN_HEIGHT][GRAIN_WIDTH],

-                              entry buf_y[GRAIN_HEIGHT][GRAIN_WIDTH])

+static NOINLINE void

+generate_grain_uv_c(entry buf[GRAIN_HEIGHT][GRAIN_WIDTH],

+                    const entry buf_y[GRAIN_HEIGHT][GRAIN_WIDTH],

+                    const Dav1dFilmGrainData *const data, const int uv,

+                    const int subx, const int suby HIGHBD_DECL_SUFFIX)

-    const Dav1dFilmGrainData *data = &in->frame_hdr->film_grain.data;

+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;

     unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524);

-    const int shift = 12 - in->p.bpc + data->grain_scale_shift;

-    const int grain_ctr = 128 << (in->p.bpc - 8);

+    const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;

+    const int grain_ctr = 128 << bitdepth_min_8;

     const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;

-    const int subx = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;

-    const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;

     const int chromaW = subx ? SUB_GRAIN_WIDTH  : GRAIN_WIDTH;

     const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;

@@ -166,56 +144,18 @@

-static void generate_scaling(const int bitdepth,

-                             const uint8_t points[][2], int num,

-                             uint8_t scaling[SCALING_SIZE])

-{

-    const int shift_x = bitdepth - 8;

-    const int scaling_size = 1 << bitdepth;

-    const int pad = 1 << shift_x;

-    // Fill up the preceding entries with the initial value

-    for (int i = 0; i < points[0][0] << shift_x; i++)

-        scaling[i] = points[0][1];

-    // Linearly interpolate the values in the middle

-    for (int i = 0; i < num - 1; i++) {

-        const int bx = points[i][0];

-        const int by = points[i][1];

-        const int ex = points[i+1][0];

-        const int ey = points[i+1][1];

-        const int dx = ex - bx;

-        const int dy = ey - by;

-        const int delta = dy * ((0x10000 + (dx >> 1)) / dx);

-        for (int x = 0; x < dx; x++) {

-            const int v = by + ((x * delta + 0x8000) >> 16);

-            scaling[(bx + x) << shift_x] = v;

-        }

-    }

-    // Fill up the remaining entries with the final value

-    for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)

-        scaling[i] = points[num - 1][1];

-    if (pad > 1) {

-        const int rnd = pad >> 1;

-        for (int i = 0; i < num - 1; i++) {

-            const int bx = points[i][0] << shift_x;

-            const int ex = points[i+1][0] << shift_x;

-            const int dx = ex - bx;

-            for (int x = 0; x < dx; x += pad) {

-                const int range = scaling[bx + x + pad] - scaling[bx + x];

-                for (int n = 1; n < pad; n++) {

-                    scaling[bx + x + n] = scaling[bx + x] + ((range * n + rnd) >> shift_x);

-                }

-            }

-        }

-    }

-}

+#define gnuv_ss_fn(nm, ss_x, ss_y) /* defines generate_grain_uv_<nm>_c, forwarding to generate_grain_uv_c with fixed subsampling flags */ \

+static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \

+    generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \

+}

+gnuv_ss_fn(420, 1, 1); // chroma subsampled horizontally and vertically

+gnuv_ss_fn(422, 1, 0); // chroma subsampled horizontally only

+gnuv_ss_fn(444, 0, 0); // no chroma subsampling

 // samples from the correct block of a grain LUT, while taking into account the

 // offsets provided by the offsets cache

-static inline entry sample_lut(entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],

+static inline entry sample_lut(const entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],

                                int offsets[2][2], int subx, int suby,

                                int bx, int by, int x, int y)

@@ -226,13 +166,15 @@

                     [offx + x + (BLOCK_SIZE >> subx) * bx];

-static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in,

-                           entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],

-                           uint8_t scaling[SCALING_SIZE], int row_num)

+static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,

+                          const ptrdiff_t stride,

+                          const Dav1dFilmGrainData *const data, const int pw,

+                          const uint8_t scaling[SCALING_SIZE],

+                          const entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],

+                          const int bh, const int row_num HIGHBD_DECL_SUFFIX)

-    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;

     const int rows = 1 + (data->overlap_flag && row_num > 0);

-    const int bitdepth_min_8 = in->p.bpc - 8;

+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;

     const int grain_ctr = 128 << bitdepth_min_8;

     const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;

@@ -242,7 +184,11 @@

         max_value = 235 << bitdepth_min_8;

     } else {

         min_value = 0;

-        max_value = (1U << in->p.bpc) - 1;

+#if BITDEPTH == 8

+        max_value = 0xff;

+#else

+        max_value = bitdepth_max;

+#endif

     // seed[0] contains the current row, seed[1] contains the previous

@@ -253,18 +199,13 @@

         seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);

-    const ptrdiff_t stride = out->stride[0];

     assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);

-    assert(stride == in->stride[0]);

-    pixel *const src_row = (pixel *)  in->data[0] + PXSTRIDE(stride) * row_num * BLOCK_SIZE;

-    pixel *const dst_row = (pixel *) out->data[0] + PXSTRIDE(stride) * row_num * BLOCK_SIZE;

     int offsets[2 /* col offset */][2 /* row offset */];

     // process this row in BLOCK_SIZE^2 blocks

-    const int bh = imin(out->p.h - row_num * BLOCK_SIZE, BLOCK_SIZE);

-    for (int bx = 0; bx < out->p.w; bx += BLOCK_SIZE) {

-        const int bw = imin(BLOCK_SIZE, out->p.w - bx);

+    for (int bx = 0; bx < pw; bx += BLOCK_SIZE) {

+        const int bw = imin(BLOCK_SIZE, pw - bx);

         if (data->overlap_flag && bx) {

             // shift previous offsets left

@@ -282,11 +223,11 @@

         static const int w[2][2] = { { 27, 17 }, { 17, 27 } };

-#define add_noise_y(x, y, grain)                                                \

-            pixel *src = src_row + (y) * PXSTRIDE(stride) + (bx + (x));         \

-            pixel *dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x));         \

-            int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \

-            *dst = iclip(*src + noise, min_value, max_value);

+#define add_noise_y(x, y, grain)                                                  \

+        const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx;     \

+        pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx;           \

+        const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \

+        *dst = iclip(*src + noise, min_value, max_value);

         for (int y = ystart; y < bh; y++) {

             // Non-overlapped image region (straightforward)

@@ -338,14 +279,18 @@

-static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const in,

-                            entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],

-                            uint8_t scaling[SCALING_SIZE], int uv, int row_num)

+static NOINLINE void

+fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,

+               const ptrdiff_t stride, const pixel *const luma_row,

+               const ptrdiff_t luma_stride, const int pw, const int bh,

+               const Dav1dFilmGrainData *const data,

+               const entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],

+               const uint8_t scaling[SCALING_SIZE],

+               const int uv, const int row_num, const int is_id,

+               const int sx, const int sy HIGHBD_DECL_SUFFIX)

-    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;

     const int rows = 1 + (data->overlap_flag && row_num > 0);

-    const int bitdepth_max = (1 << in->p.bpc) - 1;

-    const int bitdepth_min_8 = in->p.bpc - 8;

+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;

     const int grain_ctr = 128 << bitdepth_min_8;

     const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;

@@ -352,19 +297,16 @@

     int min_value, max_value;

     if (data->clip_to_restricted_range) {

         min_value = 16 << bitdepth_min_8;

-        if (out->seq_hdr->mtrx == DAV1D_MC_IDENTITY) {

-            max_value = 235 << bitdepth_min_8;

-        } else {

-            max_value = 240 << bitdepth_min_8;

-        }

+        max_value = (is_id ? 235 : 240) << bitdepth_min_8;

     } else {

         min_value = 0;

+#if BITDEPTH == 8

+        max_value = 0xff;

+#else

         max_value = bitdepth_max;

+#endif

-    const int sx = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;

-    const int sy = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;

     // seed[0] contains the current row, seed[1] contains the previous

     unsigned seed[2];

     for (int i = 0; i < rows; i++) {

@@ -373,21 +315,13 @@

         seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);

-    const ptrdiff_t stride = out->stride[1];

     assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);

-    assert(stride == in->stride[1]);

-    const int by = row_num * (BLOCK_SIZE >> sy);

-    pixel *const dst_row = (pixel *) out->data[1 + uv] + PXSTRIDE(stride) * by;

-    pixel *const src_row = (pixel *)  in->data[1 + uv] + PXSTRIDE(stride) * by;

-    pixel *const luma_row = (pixel *) in->data[0] + PXSTRIDE(in->stride[0]) * row_num * BLOCK_SIZE;

     int offsets[2 /* col offset */][2 /* row offset */];

     // process this row in BLOCK_SIZE^2 blocks (subsampled)

-    const int bh = (imin(out->p.h - row_num * BLOCK_SIZE, BLOCK_SIZE) + sy) >> sy;

-    for (int bx = 0; bx < (out->p.w + sx) >> sx; bx += BLOCK_SIZE >> sx) {

-        const int bw = (imin(BLOCK_SIZE, out->p.w - (bx << sx)) + sx) >> sx;

+    for (int bx = 0; bx < (pw + sx) >> sx; bx += BLOCK_SIZE >> sx) {

+        const int bw = (imin(BLOCK_SIZE, pw - (bx << sx)) + sx) >> sx;

         if (data->overlap_flag && bx) {

             // shift previous offsets left

             for (int i = 0; i < rows; i++)

@@ -407,25 +341,23 @@

             { { 23, 22 } },

};

-#define add_noise_uv(x, y, grain)                                               \

-            const int lx = (bx + x) << sx;                                      \

-            const int ly = y << sy;                                             \

-            pixel *luma = luma_row + ly * PXSTRIDE(in->stride[0]) + lx;         \

-            pixel avg = luma[0];                                                \

-            if (sx && lx + 1 < out->p.w)                                        \

-                avg = (avg + luma[1] + 1) >> 1;                                 \

-                                                                                \

-            pixel *src = src_row + (y) * PXSTRIDE(stride) + (bx + (x));         \

-            pixel *dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x));         \

-            int val = avg;                                                      \

-            if (!data->chroma_scaling_from_luma) {                              \

-                int combined = avg * data->uv_luma_mult[uv] +                   \

-                               *src * data->uv_mult[uv];                        \

-                val = iclip_pixel( (combined >> 6) +                            \

-                                   (data->uv_offset[uv] * (1 << bitdepth_min_8)) );   \

-            }                                                                   \

-                                                                                \

-            int noise = round2(scaling[ val ] * (grain), data->scaling_shift);  \

+#define add_noise_uv(x, y, grain)                                                    \

+            const int lx = (bx + x) << sx;                                           \

+            const int ly = y << sy;                                                  \

+            const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx;    \

+            pixel avg = luma[0];                                                     \

+            if (sx && lx + 1 < pw)                                                   \

+                avg = (avg + luma[1] + 1) >> 1;                                      \

+            const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x));  \

+            pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x));        \

+            int val = avg;                                                           \

+            if (!data->chroma_scaling_from_luma) {                                   \

+                const int combined = avg * data->uv_luma_mult[uv] +                  \

+                               *src * data->uv_mult[uv];                             \

+                val = iclip_pixel( (combined >> 6) +                                 \

+                                   (data->uv_offset[uv] * (1 << bitdepth_min_8)) );  \

+            }                                                                        \

+            const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \

             *dst = iclip(*src + noise, min_value, max_value);

         for (int y = ystart; y < bh; y++) {

@@ -478,61 +410,25 @@

-void bitfn(dav1d_apply_grain)(Dav1dPicture *const out,

-                              const Dav1dPicture *const in)

-{

-    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;

+#define fguv_ss_fn(nm, ss_x, ss_y) \

+static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \

+    fguv_32x32xn_c(dst_row, src_row, stride, luma_row, luma_stride, pw, bh, \

+                   data, grain_lut, scaling, uv_pl, row_num, is_id, ss_x, ss_y \

+                   HIGHBD_TAIL_SUFFIX); \

+}

-    entry grain_lut[3][GRAIN_HEIGHT][GRAIN_WIDTH];

-    uint8_t scaling[3][SCALING_SIZE];

+fguv_ss_fn(420, 1, 1);

+fguv_ss_fn(422, 1, 0);

+fguv_ss_fn(444, 0, 0);

-    // Generate grain LUTs as needed

-    generate_grain_y(out, grain_lut[0]); // always needed

-    if (data->num_uv_points[0] || data->chroma_scaling_from_luma)

-        generate_grain_uv(out, 0, grain_lut[1], grain_lut[0]);

-    if (data->num_uv_points[1] || data->chroma_scaling_from_luma)

-        generate_grain_uv(out, 1, grain_lut[2], grain_lut[0]);

+COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {

+    c->generate_grain_y = generate_grain_y_c;

+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c;

+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c;

+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c;

-    // Generate scaling LUTs as needed

-    if (data->num_y_points)

-        generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);

-    if (data->num_uv_points[0])

-        generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);

-    if (data->num_uv_points[1])

-        generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);

-    // Copy over the non-modified planes

-    // TODO: eliminate in favor of per-plane refs

-    if (!data->num_y_points) {

-        assert(out->stride[0] == in->stride[0]);

-        memcpy(out->data[0], in->data[0], out->p.h * out->stride[0]);

-    }

-    if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400) {

-        for (int i = 0; i < 2; i++) {

-            if (!data->num_uv_points[i] && !data->chroma_scaling_from_luma) {

-                const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;

-                assert(out->stride[1] == in->stride[1]);

-                memcpy(out->data[1+i], in->data[1+i],

-                       (out->p.h >> suby) * out->stride[1]);

-            }

-        }

-    }

-    // Synthesize grain for the affected planes

-    int rows = (out->p.h + 31) >> 5;

-    for (int row = 0; row < rows; row++) {

-        if (data->num_y_points)

-            apply_to_row_y(out, in, grain_lut[0], scaling[0], row);

-        if (data->chroma_scaling_from_luma) {

-            apply_to_row_uv(out, in, grain_lut[1], scaling[0], 0, row);

-            apply_to_row_uv(out, in, grain_lut[2], scaling[0], 1, row);

-        } else {

-            if (data->num_uv_points[0])

-                apply_to_row_uv(out, in, grain_lut[1], scaling[1], 0, row);

-            if (data->num_uv_points[1])

-                apply_to_row_uv(out, in, grain_lut[2], scaling[2], 1, row);

-        }

-    }

+    c->fgy_32x32xn = fgy_32x32xn_c;

+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c;

+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c;

+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c;

--- a/src/internal.h

+++ b/src/internal.h

@@ -42,6 +42,7 @@

 #include "src/cdf.h"

 #include "src/data.h"

 #include "src/env.h"

+#include "src/film_grain.h"

 #include "src/intra_edge.h"

 #include "src/ipred.h"

 #include "src/itx.h"

@@ -57,6 +58,7 @@

 #include "src/thread.h"

 typedef struct Dav1dDSPContext {

+    Dav1dFilmGrainDSPContext fg;

     Dav1dIntraPredDSPContext ipred;

     Dav1dMCDSPContext mc;

     Dav1dInvTxfmDSPContext itx;

--- a/src/lib.c

+++ b/src/lib.c

@@ -37,6 +37,7 @@

 #include "common/mem.h"

 #include "common/validate.h"

+#include "src/fg_apply.h"

 #include "src/internal.h"

 #include "src/log.h"

 #include "src/obu.h"

@@ -44,7 +45,6 @@

 #include "src/ref.h"

 #include "src/thread_task.h"

 #include "src/wedge.h"

-#include "src/film_grain.h"

 static COLD void init_internal(void) {

     dav1d_init_wedge_masks();

@@ -290,13 +290,13 @@

     switch (out->p.bpc) {

 #if CONFIG_8BPC

     case 8:

-        dav1d_apply_grain_8bpc(out, in);

+        dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in);

         break;

 #endif

 #if CONFIG_16BPC

     case 10:

     case 12:

-        dav1d_apply_grain_16bpc(out, in);

+        dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in);

         break;

 #endif

     default:

--- a/src/meson.build

+++ b/src/meson.build

@@ -55,6 +55,7 @@

 libdav1d_tmpl_sources = files(

     'cdef_apply_tmpl.c',

     'cdef_tmpl.c',

+    'fg_apply_tmpl.c',

     'film_grain_tmpl.c',

     'ipred_prepare_tmpl.c',

     'ipred_tmpl.c',

--

⑨