shithub: dav1d

ref: 46e2a2d0cc451e1d6bb929f80088f8a7b8940dd0
parent: 367d785a4e70b3e43eee234b3c745b047e3fbd40
author: Marvin Scholz <epirat07@gmail.com>
date: Thu Oct 25 12:45:12 EDT 2018

Build: Add suffix to templated BITDEPTH files

Fix #96
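
The files below are the per-BITDEPTH templated sources being renamed with a _tmpl suffix (src/cdef.c -> src/cdef_tmpl.c, src/cdef_apply.c -> src/cdef_apply_tmpl.c, and so on); the diff shows each file as a delete plus an add with identical contents. As a rough sketch of the templating these files rely on: each *_tmpl.c file is built once per bitdepth, and the bitfn()/bytefn() macros used in the code below append a per-bitdepth suffix to symbol names so both object files can be linked together. The exact macro definitions live in dav1d's bitdepth headers and are not part of this patch; the snippet is an illustrative approximation only.

    /* Illustrative approximation (not verbatim dav1d code): the build compiles
     * each *_tmpl.c twice, once with -DBITDEPTH=8 and once with -DBITDEPTH=16,
     * and bitfn()/bytefn() keep the resulting symbols distinct. */
    #if BITDEPTH == 8
    #define bitfn(name)  name##_8bpc
    #define bytefn(name) name##_8bpc
    #else
    #define bitfn(name)  name##_16bpc
    #define bytefn(name) name##_16bpc
    #endif

    /* e.g. bitfn(dav1d_cdef_dsp_init) expands to dav1d_cdef_dsp_init_8bpc in
     * the 8-bit build pass and dav1d_cdef_dsp_init_16bpc in the 16-bit pass. */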

--- a/src/cdef.c
+++ /dev/null
@@ -1,298 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <stdlib.h>
-
-#include "common/intops.h"
-
-#include "src/cdef.h"
-
-static const int8_t cdef_directions4[8 /* dir */][2 /* pass */] = {
-    { -1 * 8 + 1, -2 * 8 + 2 },
-    {  0 * 8 + 1, -1 * 8 + 2 },
-    {  0 * 8 + 1,  0 * 8 + 2 },
-    {  0 * 8 + 1,  1 * 8 + 2 },
-    {  1 * 8 + 1,  2 * 8 + 2 },
-    {  1 * 8 + 0,  2 * 8 + 1 },
-    {  1 * 8 + 0,  2 * 8 + 0 },
-    {  1 * 8 + 0,  2 * 8 - 1 }
-};
-
-static const int8_t cdef_directions8[8 /* dir */][2 /* pass */] = {
-    { -1 * 16 + 1, -2 * 16 + 2 },
-    {  0 * 16 + 1, -1 * 16 + 2 },
-    {  0 * 16 + 1,  0 * 16 + 2 },
-    {  0 * 16 + 1,  1 * 16 + 2 },
-    {  1 * 16 + 1,  2 * 16 + 2 },
-    {  1 * 16 + 0,  2 * 16 + 1 },
-    {  1 * 16 + 0,  2 * 16 + 0 },
-    {  1 * 16 + 0,  2 * 16 - 1 }
-};
-static const uint8_t cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
-static const uint8_t cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
-
-static inline int constrain(const int diff, const int threshold,
-                            const int damping)
-{
-    if (!threshold) return 0;
-    const int shift = imax(0, damping - ulog2(threshold));
-    return apply_sign(imin(abs(diff), imax(0, threshold - (abs(diff) >> shift))),
-                      diff);
-}
-
-/*
- * <code partially copied from libaom>
- */
-
-#define CDEF_VERY_LARGE (30000)
-
-static void fill(uint16_t *tmp, const ptrdiff_t stride,
-                 const int w, const int h)
-{
-    for (int y = 0; y < h; y++) {
-        for (int x = 0; x < w; x++)
-            tmp[x] = CDEF_VERY_LARGE;
-        tmp += stride;
-    }
-}
-
-/* Smooth in the direction detected. */
-static void cdef_filter_block_c(pixel *const dst, const ptrdiff_t dst_stride,
-                                /*const*/ pixel *const top[2],
-                                const int w, const int h, const int pri_strength,
-                                const int sec_strength, const int dir,
-                                const int damping, const enum CdefEdgeFlags edges)
-{
-    const ptrdiff_t tmp_stride = 16 >> (w == 4);
-    assert((w == 4 || w == 8) && (h == 4 || h == 8));
-    uint16_t tmp[192];  // 16*12 is the maximum value of tmp_stride * (h + 4)
-    uint16_t *tmp2 = tmp + 2 * tmp_stride + 2;
-    const uint8_t *const pri_taps = cdef_pri_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
-    const uint8_t *const sec_taps = cdef_sec_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
-    const int8_t (*cdef_directions)[2];
-
-    assert(w == 4 || w == 8);
-    cdef_directions = w == 4 ? cdef_directions4 : cdef_directions8;
-
-    // fill extended input buffer
-    int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2;
-    if (!(edges & HAVE_TOP)) {
-        fill(tmp, tmp_stride, w + 4, 2);
-        y_start = 0;
-    }
-    if (!(edges & HAVE_BOTTOM)) {
-        fill(tmp + (h + 2) * tmp_stride, tmp_stride, w + 4, 2);
-        y_end -= 2;
-    }
-    if (!(edges & HAVE_LEFT)) {
-        fill(tmp + (2 + y_start) * tmp_stride, tmp_stride, 2, y_end - y_start);
-        x_start = 0;
-    }
-    if (!(edges & HAVE_RIGHT)) {
-        fill(tmp + (2 + y_start) * tmp_stride + w + 2, tmp_stride,
-             2, y_end - y_start);
-        x_end -= 2;
-    }
-    for (int y = y_start; y < 0; y++)
-        for (int x = x_start; x < x_end; x++)
-            tmp2[y * tmp_stride + x] = top[y & 1][x];
-    for (int y = 0; y < y_end; y++)
-        for (int x = x_start; x < x_end; x++)
-            tmp2[y * tmp_stride + x] = dst[y * PXSTRIDE(dst_stride) + x];
-
-    // run actual filter
-    for (int y = 0; y < h; y++) {
-        for (int x = 0; x < w; x++) {
-            int sum = 0;
-            const int px = dst[y * PXSTRIDE(dst_stride) + x];
-            int max = px, min = px;
-            for (int k = 0; k < 2; k++) {
-                const int8_t off1 = cdef_directions[dir][k];
-                const int p0 = tmp2[y * tmp_stride + x + off1];
-                const int p1 = tmp2[y * tmp_stride + x - off1];
-                sum += pri_taps[k] * constrain(p0 - px, pri_strength, damping);
-                sum += pri_taps[k] * constrain(p1 - px, pri_strength, damping);
-                if (p0 != CDEF_VERY_LARGE) max = imax(p0, max);
-                if (p1 != CDEF_VERY_LARGE) max = imax(p1, max);
-                min = imin(p0, min);
-                min = imin(p1, min);
-                const int8_t off2 = cdef_directions[(dir + 2) & 7][k];
-                const int s0 = tmp2[y * tmp_stride + x + off2];
-                const int s1 = tmp2[y * tmp_stride + x - off2];
-                const int8_t off3 = cdef_directions[(dir + 6) & 7][k];
-                const int s2 = tmp2[y * tmp_stride + x + off3];
-                const int s3 = tmp2[y * tmp_stride + x - off3];
-                if (s0 != CDEF_VERY_LARGE) max = imax(s0, max);
-                if (s1 != CDEF_VERY_LARGE) max = imax(s1, max);
-                if (s2 != CDEF_VERY_LARGE) max = imax(s2, max);
-                if (s3 != CDEF_VERY_LARGE) max = imax(s3, max);
-                min = imin(s0, min);
-                min = imin(s1, min);
-                min = imin(s2, min);
-                min = imin(s3, min);
-                sum += sec_taps[k] * constrain(s0 - px, sec_strength, damping);
-                sum += sec_taps[k] * constrain(s1 - px, sec_strength, damping);
-                sum += sec_taps[k] * constrain(s2 - px, sec_strength, damping);
-                sum += sec_taps[k] * constrain(s3 - px, sec_strength, damping);
-            }
-            dst[y * PXSTRIDE(dst_stride) + x] =
-                iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);
-        }
-    }
-}
-
-/*
- * </code partially copied from libaom>
- */
-
-#define cdef_fn(w, h) \
-static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
-                                            const ptrdiff_t stride, \
-                                            /*const*/ pixel *const top[2], \
-                                            const int pri_strength, \
-                                            const int sec_strength, \
-                                            const int dir, \
-                                            const int damping, \
-                                            const enum CdefEdgeFlags edges) \
-{ \
-    cdef_filter_block_c(dst, stride, top, w, h, pri_strength, sec_strength, \
-                        dir, damping, edges); \
-}
-
-cdef_fn(4, 4);
-cdef_fn(4, 8);
-cdef_fn(8, 8);
-
-/*
- * <code copied from libaom>
- */
-
-/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
-   The search minimizes the weighted variance along all the lines in a
-   particular direction, i.e. the squared error between the input and a
-   "predicted" block where each pixel is replaced by the average along a line
-   in a particular direction. Since each direction have the same sum(x^2) term,
-   that term is never computed. See Section 2, step 2, of:
-   http://jmvalin.ca/notes/intra_paint.pdf */
-static const uint16_t div_table[] = {
-    0, 840, 420, 280, 210, 168, 140, 120, 105
-};
-static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,
-                           unsigned *const var)
-{
-    int i;
-    int32_t cost[8] = { 0 };
-    int partial[8][15] = { { 0 } };
-    int32_t best_cost = 0;
-    int best_dir = 0;
-    /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
-     The output is then 840 times larger, but we don't care for finding
-     the max. */
-    for (i = 0; i < 8; i++) {
-        int j;
-        for (j = 0; j < 8; j++) {
-            int x;
-            /* We subtract 128 here to reduce the maximum range of the squared
-             partial sums. */
-            x = (img[i * PXSTRIDE(stride) + j] >> (BITDEPTH - 8)) - 128;
-            partial[0][i + j] += x;
-            partial[1][i + j / 2] += x;
-            partial[2][i] += x;
-            partial[3][3 + i - j / 2] += x;
-            partial[4][7 + i - j] += x;
-            partial[5][3 - i / 2 + j] += x;
-            partial[6][j] += x;
-            partial[7][i / 2 + j] += x;
-        }
-    }
-    for (i = 0; i < 8; i++) {
-        cost[2] += partial[2][i] * partial[2][i];
-        cost[6] += partial[6][i] * partial[6][i];
-    }
-    cost[2] *= div_table[8];
-    cost[6] *= div_table[8];
-    for (i = 0; i < 7; i++) {
-        cost[0] += (partial[0][i] * partial[0][i] +
-                    partial[0][14 - i] * partial[0][14 - i]) *
-                   div_table[i + 1];
-        cost[4] += (partial[4][i] * partial[4][i] +
-                    partial[4][14 - i] * partial[4][14 - i]) *
-                   div_table[i + 1];
-    }
-    cost[0] += partial[0][7] * partial[0][7] * div_table[8];
-    cost[4] += partial[4][7] * partial[4][7] * div_table[8];
-    for (i = 1; i < 8; i += 2) {
-        int j;
-        for (j = 0; j < 4 + 1; j++) {
-            cost[i] += partial[i][3 + j] * partial[i][3 + j];
-        }
-        cost[i] *= div_table[8];
-        for (j = 0; j < 4 - 1; j++) {
-            cost[i] += (partial[i][j] * partial[i][j] +
-                        partial[i][10 - j] * partial[i][10 - j]) *
-                       div_table[2 * j + 2];
-        }
-    }
-    for (i = 0; i < 8; i++) {
-        if (cost[i] > best_cost) {
-            best_cost = cost[i];
-            best_dir = i;
-        }
-    }
-    /* Difference between the optimal variance and the variance along the
-     orthogonal direction. Again, the sum(x^2) terms cancel out. */
-    *var = best_cost - cost[(best_dir + 4) & 7];
-    /* We'd normally divide by 840, but dividing by 1024 is close enough
-     for what we're going to do with this. */
-    *var >>= 10;
-    return best_dir;
-}
-
-/*
- * </code copied from libaom>
- */
-
-void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
-    c->dir = cdef_find_dir_c;
-    c->fb[0] = cdef_filter_block_8x8_c;
-    c->fb[1] = cdef_filter_block_4x8_c;
-    c->fb[2] = cdef_filter_block_4x4_c;
-}
--- a/src/cdef_apply.c
+++ /dev/null
@@ -1,237 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <string.h>
-
-#include "common/intops.h"
-
-#include "src/cdef_apply.h"
-
-static void backup2lines(pixel *const dst[3][2],
-                         /*const*/ pixel *const src[3],
-                         const ptrdiff_t src_stride[2], int y_off, int w,
-                         const enum Dav1dPixelLayout layout)
-{
-    pixel_copy(dst[0][0], src[0] + (y_off - 2) * PXSTRIDE(src_stride[0]), w);
-    pixel_copy(dst[0][1], src[0] + (y_off - 1) * PXSTRIDE(src_stride[0]), w);
-
-    if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
-    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
-
-    w >>= ss_hor;
-    y_off >>= ss_ver;
-    pixel_copy(dst[1][0], src[1] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
-    pixel_copy(dst[1][1], src[1] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
-    pixel_copy(dst[2][0], src[2] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
-    pixel_copy(dst[2][1], src[2] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
-}
-
-static void backup2x8(pixel dst[3][8][2],
-                      /*const*/ pixel *const src[3],
-                      const ptrdiff_t src_stride[2], int x_off,
-                      const enum Dav1dPixelLayout layout)
-{
-    for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
-        pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
-
-    if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
-    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
-
-    x_off >>= ss_hor;
-    for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) {
-        pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2);
-        pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2);
-    }
-}
-
-static void restore2x8(pixel *const dst[3],
-                       const ptrdiff_t dst_stride[2],
-                       const pixel src[3][8][2], const enum Dav1dPixelLayout layout)
-{
-    for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(dst_stride[0]))
-        pixel_copy(&dst[0][y_off - 2], src[0][y], 2);
-
-    if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
-    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
-
-    for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(dst_stride[1])) {
-        pixel_copy(&dst[1][y_off - 2], src[1][y], 2);
-        pixel_copy(&dst[2][y_off - 2], src[2][y], 2);
-    }
-}
-
-static int adjust_strength(const int strength, const unsigned var) {
-    if (!var) return 0;
-    const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0;
-    return (strength * (4 + i) + 8) >> 4;
-}
-
-void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
-                             pixel *const p[3],
-                             const Av1Filter *const lflvl,
-                             const int by_start, const int by_end)
-{
-    const Dav1dDSPContext *const dsp = f->dsp;
-    enum CdefEdgeFlags edges = HAVE_BOTTOM | (by_start > 0 ? HAVE_TOP : 0);
-    pixel *ptrs[3] = { p[0], p[1], p[2] };
-    const int sbsz = 16;
-    const int sb64w = f->sb128w << 1;
-    const int damping = f->frame_hdr.cdef.damping + BITDEPTH - 8;
-    const enum Dav1dPixelLayout layout = f->cur.p.p.layout;
-    const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
-    const int has_chroma = layout != DAV1D_PIXEL_LAYOUT_I400;
-    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
-
-    // FIXME a design improvement that could be made here is to keep a set of
-    // flags for each block position on whether the block was filtered; if not,
-    // the backup of pre-filter data is empty, and the restore is therefore
-    // unnecessary as well.
-
-    for (int by = by_start; by < by_end; by += 2, edges |= HAVE_TOP) {
-        const int tf = f->lf.top_pre_cdef_toggle;
-        if (by + 2 >= f->bh) edges &= ~HAVE_BOTTOM;
-
-        if (edges & HAVE_BOTTOM) {
-            // backup pre-filter data for next iteration
-            backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.p.stride,
-                         8, f->bw * 4, layout);
-        }
-
-        pixel lr_bak[2 /* idx */][3 /* plane */][8 /* y */][2 /* x */];
-        pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
-        edges &= ~HAVE_LEFT;
-        edges |= HAVE_RIGHT;
-        for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= HAVE_LEFT) {
-            const int sb128x = sbx >>1;
-            const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
-            const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
-            if (cdef_idx == -1 ||
-                (!f->frame_hdr.cdef.y_strength[cdef_idx] &&
-                 !f->frame_hdr.cdef.uv_strength[cdef_idx]))
-            {
-                last_skip = 1;
-                goto next_sb;
-            }
-
-            const int y_lvl = f->frame_hdr.cdef.y_strength[cdef_idx];
-            const int uv_lvl = f->frame_hdr.cdef.uv_strength[cdef_idx];
-            pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
-            for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
-                 bx += 2, edges |= HAVE_LEFT)
-            {
-                if (bx + 2 >= f->bw) edges &= ~HAVE_RIGHT;
-
-                // check if this 8x8 block had any coded coefficients; if not,
-                // go to the next block
-                const unsigned bx_mask = 3U << (bx & 14);
-                const int by_idx = by & 30, bx_idx = (bx & 16) >> 4;
-                if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
-                       lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
-                {
-                    last_skip = 1;
-                    goto next_b;
-                }
-
-                if (!last_skip) {
-                    // backup post-filter data (will be restored at the end)
-                    backup2x8(lr_bak[1], bptrs, f->cur.p.stride, 0, layout);
-
-                    // restore pre-filter data from last iteration
-                    restore2x8(bptrs, f->cur.p.stride, lr_bak[0], layout);
-                }
-                if (edges & HAVE_RIGHT) {
-                    // backup pre-filter data for next iteration
-                    backup2x8(lr_bak[0], bptrs, f->cur.p.stride, 8, layout);
-                }
-
-                // the actual filter
-                const int y_pri_lvl = (y_lvl >> 2) << (BITDEPTH - 8);
-                int y_sec_lvl = y_lvl & 3;
-                y_sec_lvl += y_sec_lvl == 3;
-                y_sec_lvl <<= BITDEPTH - 8;
-                const int uv_pri_lvl = (uv_lvl >> 2) << (BITDEPTH - 8);
-                int uv_sec_lvl = uv_lvl & 3;
-                uv_sec_lvl += uv_sec_lvl == 3;
-                uv_sec_lvl <<= BITDEPTH - 8;
-                unsigned variance;
-                const int dir = dsp->cdef.dir(bptrs[0], f->cur.p.stride[0],
-                                              &variance);
-                if (y_lvl) {
-                    dsp->cdef.fb[0](bptrs[0], f->cur.p.stride[0],
-                                    (pixel *const [2]) {
-                                        &f->lf.cdef_line_ptr[tf][0][0][bx * 4],
-                                        &f->lf.cdef_line_ptr[tf][0][1][bx * 4],
-                                    },
-                                    adjust_strength(y_pri_lvl, variance),
-                                    y_sec_lvl, y_pri_lvl ? dir : 0,
-                                    damping, edges);
-                }
-                if (uv_lvl && has_chroma) {
-                    const int uvdir =
-                        f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I422 ? dir :
-                        ((uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir];
-                    for (int pl = 1; pl <= 2; pl++) {
-                        dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.p.stride[1],
-                                             (pixel *const [2]) {
-                                                 &f->lf.cdef_line_ptr[tf][pl][0][bx * 4 >> ss_hor],
-                                                 &f->lf.cdef_line_ptr[tf][pl][1][bx * 4 >> ss_hor],
-                                             },
-                                             uv_pri_lvl, uv_sec_lvl,
-                                             uv_pri_lvl ? uvdir : 0,
-                                             damping - 1, edges);
-                    }
-                }
-
-                if (!last_skip) {
-                    // restore post-filter data from the beginning of this loop
-                    restore2x8(bptrs, f->cur.p.stride, lr_bak[1], layout);
-                }
-                last_skip = 0;
-
-            next_b:
-                bptrs[0] += 8;
-                bptrs[1] += 8 >> ss_hor;
-                bptrs[2] += 8 >> ss_hor;
-            }
-
-        next_sb:
-            iptrs[0] += sbsz * 4;
-            iptrs[1] += sbsz * 4 >> ss_hor;
-            iptrs[2] += sbsz * 4 >> ss_hor;
-        }
-
-        ptrs[0] += 8 * PXSTRIDE(f->cur.p.stride[0]);
-        ptrs[1] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
-        ptrs[2] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
-        f->lf.top_pre_cdef_toggle ^= 1;
-    }
-}
--- /dev/null
+++ b/src/cdef_apply_tmpl.c
@@ -1,0 +1,237 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/cdef_apply.h"
+
+static void backup2lines(pixel *const dst[3][2],
+                         /*const*/ pixel *const src[3],
+                         const ptrdiff_t src_stride[2], int y_off, int w,
+                         const enum Dav1dPixelLayout layout)
+{
+    pixel_copy(dst[0][0], src[0] + (y_off - 2) * PXSTRIDE(src_stride[0]), w);
+    pixel_copy(dst[0][1], src[0] + (y_off - 1) * PXSTRIDE(src_stride[0]), w);
+
+    if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
+    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+
+    w >>= ss_hor;
+    y_off >>= ss_ver;
+    pixel_copy(dst[1][0], src[1] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
+    pixel_copy(dst[1][1], src[1] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
+    pixel_copy(dst[2][0], src[2] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
+    pixel_copy(dst[2][1], src[2] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
+}
+
+static void backup2x8(pixel dst[3][8][2],
+                      /*const*/ pixel *const src[3],
+                      const ptrdiff_t src_stride[2], int x_off,
+                      const enum Dav1dPixelLayout layout)
+{
+    for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
+        pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
+
+    if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
+    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+
+    x_off >>= ss_hor;
+    for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) {
+        pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2);
+        pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2);
+    }
+}
+
+static void restore2x8(pixel *const dst[3],
+                       const ptrdiff_t dst_stride[2],
+                       const pixel src[3][8][2], const enum Dav1dPixelLayout layout)
+{
+    for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(dst_stride[0]))
+        pixel_copy(&dst[0][y_off - 2], src[0][y], 2);
+
+    if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
+    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+
+    for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(dst_stride[1])) {
+        pixel_copy(&dst[1][y_off - 2], src[1][y], 2);
+        pixel_copy(&dst[2][y_off - 2], src[2][y], 2);
+    }
+}
+
+static int adjust_strength(const int strength, const unsigned var) {
+    if (!var) return 0;
+    const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0;
+    return (strength * (4 + i) + 8) >> 4;
+}
+
+void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
+                             pixel *const p[3],
+                             const Av1Filter *const lflvl,
+                             const int by_start, const int by_end)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    enum CdefEdgeFlags edges = HAVE_BOTTOM | (by_start > 0 ? HAVE_TOP : 0);
+    pixel *ptrs[3] = { p[0], p[1], p[2] };
+    const int sbsz = 16;
+    const int sb64w = f->sb128w << 1;
+    const int damping = f->frame_hdr.cdef.damping + BITDEPTH - 8;
+    const enum Dav1dPixelLayout layout = f->cur.p.p.layout;
+    const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
+    const int has_chroma = layout != DAV1D_PIXEL_LAYOUT_I400;
+    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+
+    // FIXME a design improvement that could be made here is to keep a set of
+    // flags for each block position on whether the block was filtered; if not,
+    // the backup of pre-filter data is empty, and the restore is therefore
+    // unnecessary as well.
+
+    for (int by = by_start; by < by_end; by += 2, edges |= HAVE_TOP) {
+        const int tf = f->lf.top_pre_cdef_toggle;
+        if (by + 2 >= f->bh) edges &= ~HAVE_BOTTOM;
+
+        if (edges & HAVE_BOTTOM) {
+            // backup pre-filter data for next iteration
+            backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.p.stride,
+                         8, f->bw * 4, layout);
+        }
+
+        pixel lr_bak[2 /* idx */][3 /* plane */][8 /* y */][2 /* x */];
+        pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
+        edges &= ~HAVE_LEFT;
+        edges |= HAVE_RIGHT;
+        for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= HAVE_LEFT) {
+            const int sb128x = sbx >>1;
+            const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
+            const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
+            if (cdef_idx == -1 ||
+                (!f->frame_hdr.cdef.y_strength[cdef_idx] &&
+                 !f->frame_hdr.cdef.uv_strength[cdef_idx]))
+            {
+                last_skip = 1;
+                goto next_sb;
+            }
+
+            const int y_lvl = f->frame_hdr.cdef.y_strength[cdef_idx];
+            const int uv_lvl = f->frame_hdr.cdef.uv_strength[cdef_idx];
+            pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
+            for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
+                 bx += 2, edges |= HAVE_LEFT)
+            {
+                if (bx + 2 >= f->bw) edges &= ~HAVE_RIGHT;
+
+                // check if this 8x8 block had any coded coefficients; if not,
+                // go to the next block
+                const unsigned bx_mask = 3U << (bx & 14);
+                const int by_idx = by & 30, bx_idx = (bx & 16) >> 4;
+                if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
+                       lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
+                {
+                    last_skip = 1;
+                    goto next_b;
+                }
+
+                if (!last_skip) {
+                    // backup post-filter data (will be restored at the end)
+                    backup2x8(lr_bak[1], bptrs, f->cur.p.stride, 0, layout);
+
+                    // restore pre-filter data from last iteration
+                    restore2x8(bptrs, f->cur.p.stride, lr_bak[0], layout);
+                }
+                if (edges & HAVE_RIGHT) {
+                    // backup pre-filter data for next iteration
+                    backup2x8(lr_bak[0], bptrs, f->cur.p.stride, 8, layout);
+                }
+
+                // the actual filter
+                const int y_pri_lvl = (y_lvl >> 2) << (BITDEPTH - 8);
+                int y_sec_lvl = y_lvl & 3;
+                y_sec_lvl += y_sec_lvl == 3;
+                y_sec_lvl <<= BITDEPTH - 8;
+                const int uv_pri_lvl = (uv_lvl >> 2) << (BITDEPTH - 8);
+                int uv_sec_lvl = uv_lvl & 3;
+                uv_sec_lvl += uv_sec_lvl == 3;
+                uv_sec_lvl <<= BITDEPTH - 8;
+                unsigned variance;
+                const int dir = dsp->cdef.dir(bptrs[0], f->cur.p.stride[0],
+                                              &variance);
+                if (y_lvl) {
+                    dsp->cdef.fb[0](bptrs[0], f->cur.p.stride[0],
+                                    (pixel *const [2]) {
+                                        &f->lf.cdef_line_ptr[tf][0][0][bx * 4],
+                                        &f->lf.cdef_line_ptr[tf][0][1][bx * 4],
+                                    },
+                                    adjust_strength(y_pri_lvl, variance),
+                                    y_sec_lvl, y_pri_lvl ? dir : 0,
+                                    damping, edges);
+                }
+                if (uv_lvl && has_chroma) {
+                    const int uvdir =
+                        f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I422 ? dir :
+                        ((uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir];
+                    for (int pl = 1; pl <= 2; pl++) {
+                        dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.p.stride[1],
+                                             (pixel *const [2]) {
+                                                 &f->lf.cdef_line_ptr[tf][pl][0][bx * 4 >> ss_hor],
+                                                 &f->lf.cdef_line_ptr[tf][pl][1][bx * 4 >> ss_hor],
+                                             },
+                                             uv_pri_lvl, uv_sec_lvl,
+                                             uv_pri_lvl ? uvdir : 0,
+                                             damping - 1, edges);
+                    }
+                }
+
+                if (!last_skip) {
+                    // restore post-filter data from the beginning of this loop
+                    restore2x8(bptrs, f->cur.p.stride, lr_bak[1], layout);
+                }
+                last_skip = 0;
+
+            next_b:
+                bptrs[0] += 8;
+                bptrs[1] += 8 >> ss_hor;
+                bptrs[2] += 8 >> ss_hor;
+            }
+
+        next_sb:
+            iptrs[0] += sbsz * 4;
+            iptrs[1] += sbsz * 4 >> ss_hor;
+            iptrs[2] += sbsz * 4 >> ss_hor;
+        }
+
+        ptrs[0] += 8 * PXSTRIDE(f->cur.p.stride[0]);
+        ptrs[1] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
+        ptrs[2] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
+        f->lf.top_pre_cdef_toggle ^= 1;
+    }
+}
--- /dev/null
+++ b/src/cdef_tmpl.c
@@ -1,0 +1,298 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/cdef.h"
+
+static const int8_t cdef_directions4[8 /* dir */][2 /* pass */] = {
+    { -1 * 8 + 1, -2 * 8 + 2 },
+    {  0 * 8 + 1, -1 * 8 + 2 },
+    {  0 * 8 + 1,  0 * 8 + 2 },
+    {  0 * 8 + 1,  1 * 8 + 2 },
+    {  1 * 8 + 1,  2 * 8 + 2 },
+    {  1 * 8 + 0,  2 * 8 + 1 },
+    {  1 * 8 + 0,  2 * 8 + 0 },
+    {  1 * 8 + 0,  2 * 8 - 1 }
+};
+
+static const int8_t cdef_directions8[8 /* dir */][2 /* pass */] = {
+    { -1 * 16 + 1, -2 * 16 + 2 },
+    {  0 * 16 + 1, -1 * 16 + 2 },
+    {  0 * 16 + 1,  0 * 16 + 2 },
+    {  0 * 16 + 1,  1 * 16 + 2 },
+    {  1 * 16 + 1,  2 * 16 + 2 },
+    {  1 * 16 + 0,  2 * 16 + 1 },
+    {  1 * 16 + 0,  2 * 16 + 0 },
+    {  1 * 16 + 0,  2 * 16 - 1 }
+};
+static const uint8_t cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
+static const uint8_t cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
+
+static inline int constrain(const int diff, const int threshold,
+                            const int damping)
+{
+    if (!threshold) return 0;
+    const int shift = imax(0, damping - ulog2(threshold));
+    return apply_sign(imin(abs(diff), imax(0, threshold - (abs(diff) >> shift))),
+                      diff);
+}
+
+/*
+ * <code partially copied from libaom>
+ */
+
+#define CDEF_VERY_LARGE (30000)
+
+static void fill(uint16_t *tmp, const ptrdiff_t stride,
+                 const int w, const int h)
+{
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++)
+            tmp[x] = CDEF_VERY_LARGE;
+        tmp += stride;
+    }
+}
+
+/* Smooth in the direction detected. */
+static void cdef_filter_block_c(pixel *const dst, const ptrdiff_t dst_stride,
+                                /*const*/ pixel *const top[2],
+                                const int w, const int h, const int pri_strength,
+                                const int sec_strength, const int dir,
+                                const int damping, const enum CdefEdgeFlags edges)
+{
+    const ptrdiff_t tmp_stride = 16 >> (w == 4);
+    assert((w == 4 || w == 8) && (h == 4 || h == 8));
+    uint16_t tmp[192];  // 16*12 is the maximum value of tmp_stride * (h + 4)
+    uint16_t *tmp2 = tmp + 2 * tmp_stride + 2;
+    const uint8_t *const pri_taps = cdef_pri_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
+    const uint8_t *const sec_taps = cdef_sec_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
+    const int8_t (*cdef_directions)[2];
+
+    assert(w == 4 || w == 8);
+    cdef_directions = w == 4 ? cdef_directions4 : cdef_directions8;
+
+    // fill extended input buffer
+    int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2;
+    if (!(edges & HAVE_TOP)) {
+        fill(tmp, tmp_stride, w + 4, 2);
+        y_start = 0;
+    }
+    if (!(edges & HAVE_BOTTOM)) {
+        fill(tmp + (h + 2) * tmp_stride, tmp_stride, w + 4, 2);
+        y_end -= 2;
+    }
+    if (!(edges & HAVE_LEFT)) {
+        fill(tmp + (2 + y_start) * tmp_stride, tmp_stride, 2, y_end - y_start);
+        x_start = 0;
+    }
+    if (!(edges & HAVE_RIGHT)) {
+        fill(tmp + (2 + y_start) * tmp_stride + w + 2, tmp_stride,
+             2, y_end - y_start);
+        x_end -= 2;
+    }
+    for (int y = y_start; y < 0; y++)
+        for (int x = x_start; x < x_end; x++)
+            tmp2[y * tmp_stride + x] = top[y & 1][x];
+    for (int y = 0; y < y_end; y++)
+        for (int x = x_start; x < x_end; x++)
+            tmp2[y * tmp_stride + x] = dst[y * PXSTRIDE(dst_stride) + x];
+
+    // run actual filter
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+            int sum = 0;
+            const int px = dst[y * PXSTRIDE(dst_stride) + x];
+            int max = px, min = px;
+            for (int k = 0; k < 2; k++) {
+                const int8_t off1 = cdef_directions[dir][k];
+                const int p0 = tmp2[y * tmp_stride + x + off1];
+                const int p1 = tmp2[y * tmp_stride + x - off1];
+                sum += pri_taps[k] * constrain(p0 - px, pri_strength, damping);
+                sum += pri_taps[k] * constrain(p1 - px, pri_strength, damping);
+                if (p0 != CDEF_VERY_LARGE) max = imax(p0, max);
+                if (p1 != CDEF_VERY_LARGE) max = imax(p1, max);
+                min = imin(p0, min);
+                min = imin(p1, min);
+                const int8_t off2 = cdef_directions[(dir + 2) & 7][k];
+                const int s0 = tmp2[y * tmp_stride + x + off2];
+                const int s1 = tmp2[y * tmp_stride + x - off2];
+                const int8_t off3 = cdef_directions[(dir + 6) & 7][k];
+                const int s2 = tmp2[y * tmp_stride + x + off3];
+                const int s3 = tmp2[y * tmp_stride + x - off3];
+                if (s0 != CDEF_VERY_LARGE) max = imax(s0, max);
+                if (s1 != CDEF_VERY_LARGE) max = imax(s1, max);
+                if (s2 != CDEF_VERY_LARGE) max = imax(s2, max);
+                if (s3 != CDEF_VERY_LARGE) max = imax(s3, max);
+                min = imin(s0, min);
+                min = imin(s1, min);
+                min = imin(s2, min);
+                min = imin(s3, min);
+                sum += sec_taps[k] * constrain(s0 - px, sec_strength, damping);
+                sum += sec_taps[k] * constrain(s1 - px, sec_strength, damping);
+                sum += sec_taps[k] * constrain(s2 - px, sec_strength, damping);
+                sum += sec_taps[k] * constrain(s3 - px, sec_strength, damping);
+            }
+            dst[y * PXSTRIDE(dst_stride) + x] =
+                iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);
+        }
+    }
+}
+
+/*
+ * </code partially copied from libaom>
+ */
+
+#define cdef_fn(w, h) \
+static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
+                                            const ptrdiff_t stride, \
+                                            /*const*/ pixel *const top[2], \
+                                            const int pri_strength, \
+                                            const int sec_strength, \
+                                            const int dir, \
+                                            const int damping, \
+                                            const enum CdefEdgeFlags edges) \
+{ \
+    cdef_filter_block_c(dst, stride, top, w, h, pri_strength, sec_strength, \
+                        dir, damping, edges); \
+}
+
+cdef_fn(4, 4);
+cdef_fn(4, 8);
+cdef_fn(8, 8);
+
+/*
+ * <code copied from libaom>
+ */
+
+/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
+   The search minimizes the weighted variance along all the lines in a
+   particular direction, i.e. the squared error between the input and a
+   "predicted" block where each pixel is replaced by the average along a line
+   in a particular direction. Since each direction have the same sum(x^2) term,
+   that term is never computed. See Section 2, step 2, of:
+   http://jmvalin.ca/notes/intra_paint.pdf */
+static const uint16_t div_table[] = {
+    0, 840, 420, 280, 210, 168, 140, 120, 105
+};
+static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,
+                           unsigned *const var)
+{
+    int i;
+    int32_t cost[8] = { 0 };
+    int partial[8][15] = { { 0 } };
+    int32_t best_cost = 0;
+    int best_dir = 0;
+    /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
+     The output is then 840 times larger, but we don't care for finding
+     the max. */
+    for (i = 0; i < 8; i++) {
+        int j;
+        for (j = 0; j < 8; j++) {
+            int x;
+            /* We subtract 128 here to reduce the maximum range of the squared
+             partial sums. */
+            x = (img[i * PXSTRIDE(stride) + j] >> (BITDEPTH - 8)) - 128;
+            partial[0][i + j] += x;
+            partial[1][i + j / 2] += x;
+            partial[2][i] += x;
+            partial[3][3 + i - j / 2] += x;
+            partial[4][7 + i - j] += x;
+            partial[5][3 - i / 2 + j] += x;
+            partial[6][j] += x;
+            partial[7][i / 2 + j] += x;
+        }
+    }
+    for (i = 0; i < 8; i++) {
+        cost[2] += partial[2][i] * partial[2][i];
+        cost[6] += partial[6][i] * partial[6][i];
+    }
+    cost[2] *= div_table[8];
+    cost[6] *= div_table[8];
+    for (i = 0; i < 7; i++) {
+        cost[0] += (partial[0][i] * partial[0][i] +
+                    partial[0][14 - i] * partial[0][14 - i]) *
+                   div_table[i + 1];
+        cost[4] += (partial[4][i] * partial[4][i] +
+                    partial[4][14 - i] * partial[4][14 - i]) *
+                   div_table[i + 1];
+    }
+    cost[0] += partial[0][7] * partial[0][7] * div_table[8];
+    cost[4] += partial[4][7] * partial[4][7] * div_table[8];
+    for (i = 1; i < 8; i += 2) {
+        int j;
+        for (j = 0; j < 4 + 1; j++) {
+            cost[i] += partial[i][3 + j] * partial[i][3 + j];
+        }
+        cost[i] *= div_table[8];
+        for (j = 0; j < 4 - 1; j++) {
+            cost[i] += (partial[i][j] * partial[i][j] +
+                        partial[i][10 - j] * partial[i][10 - j]) *
+                       div_table[2 * j + 2];
+        }
+    }
+    for (i = 0; i < 8; i++) {
+        if (cost[i] > best_cost) {
+            best_cost = cost[i];
+            best_dir = i;
+        }
+    }
+    /* Difference between the optimal variance and the variance along the
+     orthogonal direction. Again, the sum(x^2) terms cancel out. */
+    *var = best_cost - cost[(best_dir + 4) & 7];
+    /* We'd normally divide by 840, but dividing by 1024 is close enough
+     for what we're going to do with this. */
+    *var >>= 10;
+    return best_dir;
+}
+
+/*
+ * </code copied from libaom>
+ */
+
+void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
+    c->dir = cdef_find_dir_c;
+    c->fb[0] = cdef_filter_block_8x8_c;
+    c->fb[1] = cdef_filter_block_4x8_c;
+    c->fb[2] = cdef_filter_block_4x4_c;
+}
--- a/src/ipred.c
+++ /dev/null
@@ -1,757 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "common/attributes.h"
-#include "common/intops.h"
-
-#include "src/ipred.h"
-#include "src/tables.h"
-
-static NOINLINE void
-splat_dc(pixel *dst, const ptrdiff_t stride,
-         const int width, const int height, const unsigned dc)
-{
-    assert(dc <= (1 << BITDEPTH) - 1);
-#if BITDEPTH == 8
-    if (width > 4) {
-        const uint64_t dcN = dc * 0x0101010101010101ULL;
-        for (int y = 0; y < height; y++) {
-            for (int x = 0; x < width; x += sizeof(dcN))
-                *((uint64_t *) &dst[x]) = dcN;
-            dst += PXSTRIDE(stride);
-        }
-    } else {
-        const unsigned dcN = dc * 0x01010101U;
-        for (int y = 0; y < height; y++) {
-            for (int x = 0; x < width; x += sizeof(dcN))
-                *((unsigned *) &dst[x]) = dcN;
-            dst += PXSTRIDE(stride);
-        }
-    }
-#else
-    const uint64_t dcN = dc * 0x0001000100010001ULL;
-    for (int y = 0; y < height; y++) {
-        for (int x = 0; x < width; x += sizeof(dcN) >> 1)
-            *((uint64_t *) &dst[x]) = dcN;
-        dst += PXSTRIDE(stride);
-    }
-#endif
-}
-
-static NOINLINE void
-cfl_pred(pixel *dst, const ptrdiff_t stride,
-         const int width, const int height, const unsigned dc,
-         const int16_t *ac, const int alpha)
-{
-    for (int y = 0; y < height; y++) {
-        for (int x = 0; x < width; x++) {
-            const int diff = alpha * ac[x];
-            dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff));
-        }
-        ac += width;
-        dst += PXSTRIDE(stride);
-    }
-}
-
-static unsigned dc_gen_top(const pixel *const topleft, const int width)
-{
-    unsigned dc = width >> 1;
-    for (int i = 0; i < width; i++)
-       dc += topleft[1 + i];
-    return dc >> ctz(width);
-}
-
-static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
-                           const pixel *const topleft,
-                           const int width, const int height, const int a)
-{
-    splat_dc(dst, stride, width, height, dc_gen_top(topleft, width));
-}
-
-static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
-                            const pixel *const topleft,
-                            const int width, const int height,
-                            const int16_t *ac, const int alpha)
-{
-    cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha);
-}
-
-static unsigned dc_gen_left(const pixel *const topleft, const int height)
-{
-    unsigned dc = height >> 1;
-    for (int i = 0; i < height; i++)
-       dc += topleft[-(1 + i)];
-    return dc >> ctz(height);
-}
-
-static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
-                            const pixel *const topleft,
-                            const int width, const int height, const int a)
-{
-    splat_dc(dst, stride, width, height, dc_gen_left(topleft, height));
-}
-
-static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
-                             const pixel *const topleft,
-                             const int width, const int height,
-                             const int16_t *ac, const int alpha)
-{
-    unsigned dc = dc_gen_left(topleft, height);
-    cfl_pred(dst, stride, width, height, dc, ac, alpha);
-}
-
-#if BITDEPTH == 8
-#define MULTIPLIER_1x2 0x5556
-#define MULTIPLIER_1x4 0x3334
-#define BASE_SHIFT 16
-#else
-#define MULTIPLIER_1x2 0xAAAB
-#define MULTIPLIER_1x4 0x6667
-#define BASE_SHIFT 17
-#endif
-
-static unsigned
-dc_gen(const pixel *const topleft, const int width, const int height)
-{
-    unsigned dc = (width + height) >> 1;
-    for (int i = 0; i < width; i++)
-       dc += topleft[i + 1];
-    for (int i = 0; i < height; i++)
-       dc += topleft[-(i + 1)];
-    dc >>= ctz(width + height);
-
-    if (width != height) {
-        dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 :
-                                                           MULTIPLIER_1x2;
-        dc >>= BASE_SHIFT;
-    }
-    return dc;
-}
-
-static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
-                       const pixel *const topleft,
-                       const int width, const int height, const int a)
-{
-    splat_dc(dst, stride, width, height, dc_gen(topleft, width, height));
-}
-
-static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
-                        const pixel *const topleft,
-                        const int width, const int height,
-                        const int16_t *ac, const int alpha)
-{
-    unsigned dc = dc_gen(topleft, width, height);
-    cfl_pred(dst, stride, width, height, dc, ac, alpha);
-}
-
-#undef MULTIPLIER_1x2
-#undef MULTIPLIER_1x4
-#undef BASE_SHIFT
-
-static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
-                           const pixel *const topleft,
-                           const int width, const int height, const int a)
-{
-    splat_dc(dst, stride, width, height, 1 << (BITDEPTH - 1));
-}
-
-static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
-                            const pixel *const topleft,
-                            const int width, const int height,
-                            const int16_t *ac, const int alpha)
-{
-    cfl_pred(dst, stride, width, height, 1 << (BITDEPTH - 1), ac, alpha);
-}
-
-static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
-                      const pixel *const topleft,
-                      const int width, const int height, const int a)
-{
-    for (int y = 0; y < height; y++) {
-        pixel_copy(dst, topleft + 1, width);
-        dst += PXSTRIDE(stride);
-    }
-}
-
-static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
-                      const pixel *const topleft,
-                      const int width, const int height, const int a)
-{
-    for (int y = 0; y < height; y++) {
-        pixel_set(dst, topleft[-(1 + y)], width);
-        dst += PXSTRIDE(stride);
-    }
-}
-
-static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
-                          const pixel *const tl_ptr,
-                          const int width, const int height, const int a)
-{
-    const int topleft = tl_ptr[0];
-    for (int y = 0; y < height; y++) {
-        const int left = tl_ptr[-(y + 1)];
-        for (int x = 0; x < width; x++) {
-            const int top = tl_ptr[1 + x];
-            const int base = left + top - topleft;
-            const int ldiff = abs(left - base);
-            const int tdiff = abs(top - base);
-            const int tldiff = abs(topleft - base);
-
-            dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left :
-                     tdiff <= tldiff ? top : topleft;
-        }
-        dst += PXSTRIDE(stride);
-    }
-}
-
-static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
-                           const pixel *const topleft,
-                           const int width, const int height, const int a)
-{
-    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
-    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
-    const int right = topleft[width], bottom = topleft[-height];
-
-    for (int y = 0; y < height; y++) {
-        for (int x = 0; x < width; x++) {
-            const int pred = weights_ver[y]  * topleft[1 + x] +
-                      (256 - weights_ver[y]) * bottom +
-                             weights_hor[x]  * topleft[-(1 + y)] +
-                      (256 - weights_hor[x]) * right;
-            dst[x] = (pred + 256) >> 9;
-        }
-        dst += PXSTRIDE(stride);
-    }
-}
-
-static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
-                             const pixel *const topleft,
-                             const int width, const int height, const int a)
-{
-    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
-    const int bottom = topleft[-height];
-
-    for (int y = 0; y < height; y++) {
-        for (int x = 0; x < width; x++) {
-            const int pred = weights_ver[y]  * topleft[1 + x] +
-                      (256 - weights_ver[y]) * bottom;
-            dst[x] = (pred + 128) >> 8;
-        }
-        dst += PXSTRIDE(stride);
-    }
-}
-
-static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
-                             const pixel *const topleft,
-                             const int width, const int height, const int a)
-{
-    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
-    const int right = topleft[width];
-
-    for (int y = 0; y < height; y++) {
-        for (int x = 0; x < width; x++) {
-            const int pred = weights_hor[x]  * topleft[-(y + 1)] +
-                      (256 - weights_hor[x]) * right;
-            dst[x] = (pred + 128) >> 8;
-        }
-        dst += PXSTRIDE(stride);
-    }
-}
-
-static int get_filter_strength(const unsigned blk_wh, const unsigned d,
-                               const int type)
-{
-    int strength = 0;
-
-    if (type == 0) {
-        if (blk_wh <= 8) {
-            if (d >= 56) strength = 1;
-        } else if (blk_wh <= 12) {
-            if (d >= 40) strength = 1;
-        } else if (blk_wh <= 16) {
-            if (d >= 40) strength = 1;
-        } else if (blk_wh <= 24) {
-            if (d >= 8) strength = 1;
-            if (d >= 16) strength = 2;
-            if (d >= 32) strength = 3;
-        } else if (blk_wh <= 32) {
-            if (d >= 1) strength = 1;
-            if (d >= 4) strength = 2;
-            if (d >= 32) strength = 3;
-        } else {
-            if (d >= 1) strength = 3;
-        }
-    } else {
-        if (blk_wh <= 8) {
-            if (d >= 40) strength = 1;
-            if (d >= 64) strength = 2;
-        } else if (blk_wh <= 16) {
-            if (d >= 20) strength = 1;
-            if (d >= 48) strength = 2;
-        } else if (blk_wh <= 24) {
-            if (d >= 4) strength = 3;
-        } else {
-            if (d >= 1) strength = 3;
-        }
-    }
-
-    return strength;
-}
-
-static void filter_edge(pixel *const out, const int sz, const pixel *const in,
-                        const int from, const int to, const unsigned strength)
-{
-    static const uint8_t kernel[3][5] = {
-        { 0, 4, 8, 4, 0 },
-        { 0, 5, 6, 5, 0 },
-        { 2, 4, 4, 4, 2 }
-    };
-
-    assert(strength > 0);
-    for (int i = 0; i < sz; i++) {
-        int s = 0;
-        for (int j = 0; j < 5; j++)
-            s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j];
-        out[i] = (s + 8) >> 4;
-    }
-}
-
-static int get_upsample(const int blk_wh, const unsigned d, const int type) {
-    if (d >= 40) return 0;
-    return type ? (blk_wh <= 8) : (blk_wh <= 16);
-}
-
-static void upsample_edge(pixel *const out, const int hsz,
-                          const pixel *const in, const int from, const int to)
-{
-    static const int8_t kernel[4] = { -1, 9, 9, -1 };
-    int i;
-    for (i = 0; i < hsz - 1; i++) {
-        out[i * 2] = in[iclip(i, from, to - 1)];
-
-        int s = 0;
-        for (int j = 0; j < 4; j++)
-            s += in[iclip(i + j - 1, from, to - 1)] * kernel[j];
-        out[i * 2 + 1] = iclip_pixel((s + 8) >> 4);
-    }
-    out[i * 2] = in[iclip(i, from, to - 1)];
-}
-
-static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
-                       const pixel *const topleft_in,
-                       const int width, const int height, int angle)
-{
-    const int is_sm = angle >> 9;
-    angle &= 511;
-    assert(angle < 90);
-    const int dx = dav1d_dr_intra_derivative[angle];
-    pixel top_out[(64 + 64) * 2];
-    const pixel *top;
-    int max_base_x;
-    const int upsample_above = get_upsample(width + height, 90 - angle, is_sm);
-    if (upsample_above) {
-        upsample_edge(top_out, width + height,
-                      &topleft_in[1], -1, width + imin(width, height));
-        top = top_out;
-        max_base_x = 2 * (width + height) - 2;
-    } else {
-        const int filter_strength =
-            get_filter_strength(width + height, 90 - angle, is_sm);
-
-        if (filter_strength) {
-            filter_edge(top_out, width + height,
-                        &topleft_in[1], -1, width + imin(width, height),
-                        filter_strength);
-            top = top_out;
-            max_base_x = width + height - 1;
-        } else {
-            top = &topleft_in[1];
-            max_base_x = width + imin(width, height) - 1;
-        }
-    }
-    const int frac_bits = 6 - upsample_above;
-    const int base_inc = 1 << upsample_above;
-    for (int y = 0, xpos = dx; y < height;
-         y++, dst += PXSTRIDE(stride), xpos += dx)
-    {
-        int base = xpos >> frac_bits;
-        const int frac = ((xpos << upsample_above) & 0x3F) >> 1;
-
-        for (int x = 0; x < width; x++, base += base_inc) {
-            if (base < max_base_x) {
-                const int v = top[base] * (32 - frac) + top[base + 1] * frac;
-                dst[x] = iclip_pixel((v + 16) >> 5);
-            } else {
-                pixel_set(&dst[x], top[max_base_x], width - x);
-                break;
-            }
-        }
-    }
-}
-
-static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
-                       const pixel *const topleft_in,
-                       const int width, const int height, int angle)
-{
-    const int is_sm = angle >> 9;
-    angle &= 511;
-    assert(angle > 90 && angle < 180);
-    const int dy = dav1d_dr_intra_derivative[angle - 90];
-    const int dx = dav1d_dr_intra_derivative[180 - angle];
-    const int upsample_left = get_upsample(width + height, 180 - angle, is_sm);
-    const int upsample_above = get_upsample(width + height, angle - 90, is_sm);
-    pixel edge[64 * 2 + 64 * 2 + 1];
-    pixel *const topleft = &edge[height * 2];
-
-    if (upsample_above) {
-        upsample_edge(topleft, width + 1, topleft_in, 0, width + 1);
-    } else {
-        const int filter_strength =
-            get_filter_strength(width + height, angle - 90, is_sm);
-
-        if (filter_strength) {
-            filter_edge(&topleft[1], width, &topleft_in[1], -1, width,
-                        filter_strength);
-        } else {
-            pixel_copy(&topleft[1], &topleft_in[1], width);
-        }
-    }
-    if (upsample_left) {
-        upsample_edge(edge, height + 1, &topleft_in[-height], 0, height + 1);
-    } else {
-        const int filter_strength =
-            get_filter_strength(width + height, 180 - angle, is_sm);
-
-        if (filter_strength) {
-            filter_edge(&topleft[-height], height, &topleft_in[-height],
-                        0, height + 1, filter_strength);
-        } else {
-            pixel_copy(&topleft[-height], &topleft_in[-height], height);
-        }
-    }
-    *topleft = *topleft_in;
-
-    const int min_base_x = -(1 << upsample_above);
-    const int frac_bits_y = 6 - upsample_left, frac_bits_x = 6 - upsample_above;
-    const int base_inc_x = 1 << upsample_above;
-    const pixel *const left = &topleft[-(1 << upsample_left)];
-    const pixel *const top = &topleft[1 << upsample_above];
-    for (int y = 0, xpos = -dx; y < height;
-         y++, xpos -= dx, dst += PXSTRIDE(stride))
-    {
-        int base_x = xpos >> frac_bits_x;
-        const int frac_x = ((xpos * (1 << upsample_above)) & 0x3F) >> 1;
-
-        for (int x = 0, ypos = (y << 6) - dy; x < width;
-             x++, base_x += base_inc_x, ypos -= dy)
-        {
-            int v;
-
-            if (base_x >= min_base_x) {
-                v = top[base_x] * (32 - frac_x) + top[base_x + 1] * frac_x;
-            } else {
-                const int base_y = ypos >> frac_bits_y;
-                assert(base_y >= -(1 << upsample_left));
-                const int frac_y = ((ypos * (1 << upsample_left)) & 0x3F) >> 1;
-                v = left[-base_y] * (32 - frac_y) + left[-(base_y + 1)] * frac_y;
-            }
-            dst[x] = iclip_pixel((v + 16) >> 5);
-        }
-    }
-}
-
-static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
-                       const pixel *const topleft_in,
-                       const int width, const int height, int angle)
-{
-    const int is_sm = angle >> 9;
-    angle &= 511;
-    assert(angle > 180);
-    const int dy = dav1d_dr_intra_derivative[270 - angle];
-    pixel left_out[(64 + 64) * 2];
-    const pixel *left;
-    int max_base_y;
-    const int upsample_left = get_upsample(width + height, angle - 180, is_sm);
-    if (upsample_left) {
-        upsample_edge(left_out, width + height,
-                      &topleft_in[-(width + height)],
-                      imax(width - height, 0), width + height + 1);
-        left = &left_out[2 * (width + height) - 2];
-        max_base_y = 2 * (width + height) - 2;
-    } else {
-        const int filter_strength =
-            get_filter_strength(width + height, angle - 180, is_sm);
-
-        if (filter_strength) {
-            filter_edge(left_out, width + height,
-                        &topleft_in[-(width + height)],
-                        imax(width - height, 0), width + height + 1,
-                        filter_strength);
-            left = &left_out[width + height - 1];
-            max_base_y = width + height - 1;
-        } else {
-            left = &topleft_in[-1];
-            max_base_y = height + imin(width, height) - 1;
-        }
-    }
-    const int frac_bits = 6 - upsample_left;
-    const int base_inc = 1 << upsample_left;
-    for (int x = 0, ypos = dy; x < width; x++, ypos += dy) {
-        int base = ypos >> frac_bits;
-        const int frac = ((ypos << upsample_left) & 0x3F) >> 1;
-
-        for (int y = 0; y < height; y++, base += base_inc) {
-            if (base < max_base_y) {
-                const int v = left[-base] * (32 - frac) +
-                              left[-(base + 1)] * frac;
-                dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 16) >> 5);
-            } else {
-                do {
-                    dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
-                } while (++y < height);
-                break;
-            }
-        }
-    }
-}
-
-/* Up to 32x32 only */
-static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
-                           const pixel *const topleft_in,
-                           const int width, const int height, int filt_idx)
-{
-    filt_idx &= 511;
-    assert(filt_idx < 5);
-
-    const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
-    int x, y;
-    ptrdiff_t left_stride;
-    const pixel *left, *topleft, *top;
-
-    top = &topleft_in[1];
-    for (y = 0; y < height; y += 2) {
-        topleft = &topleft_in[-y];
-        left = &topleft[-1];
-        left_stride = -1;
-        for (x = 0; x < width; x += 4) {
-            const int p0 = *topleft;
-            const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
-            const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
-            pixel *ptr = &dst[x];
-            const int8_t *flt_ptr = filter;
-
-            for (int yy = 0; yy < 2; yy++) {
-                for (int xx = 0; xx < 4; xx++, flt_ptr += 2) {
-                    int acc = flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +
-                              flt_ptr[16] * p2 + flt_ptr[17] * p3 +
-                              flt_ptr[32] * p4 + flt_ptr[33] * p5 +
-                              flt_ptr[48] * p6;
-                    ptr[xx] = iclip_pixel((acc + 8) >> 4);
-                }
-                ptr += PXSTRIDE(stride);
-            }
-            left = &dst[x + 4 - 1];
-            left_stride = PXSTRIDE(stride);
-            top += 4;
-            topleft = &top[-1];
-        }
-        top = &dst[PXSTRIDE(stride)];
-        dst = &dst[PXSTRIDE(stride) * 2];
-    }
-}
-
-static NOINLINE void
-cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
-         const int w_pad, const int h_pad, const int width, const int height,
-         const int ss_hor, const int ss_ver, const int log2sz)
-{
-    int y, x;
-    int16_t *const ac_orig = ac;
-
-    assert(w_pad >= 0 && w_pad * 4 < width);
-    assert(h_pad >= 0 && h_pad * 4 < height);
-
-    for (y = 0; y < height - 4 * h_pad; y++) {
-        for (x = 0; x < width - 4 * w_pad; x++) {
-            int ac_sum = ypx[x << ss_hor];
-            if (ss_hor) ac_sum += ypx[x * 2 + 1];
-            if (ss_ver) {
-                ac_sum += ypx[(x << ss_hor) + PXSTRIDE(stride)];
-                if (ss_hor) ac_sum += ypx[x * 2 + 1 + PXSTRIDE(stride)];
-            }
-            ac[x] = ac_sum << (1 + !ss_ver + !ss_hor);
-        }
-        for (; x < width; x++)
-            ac[x] = ac[x - 1];
-        ac += width;
-        ypx += PXSTRIDE(stride) << ss_ver;
-    }
-    for (; y < height; y++) {
-        memcpy(ac, &ac[-width], width * sizeof(*ac));
-        ac += width;
-    }
-
-    int sum = (1 << log2sz) >> 1;
-    for (ac = ac_orig, y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            sum += ac[x];
-        ac += width;
-    }
-    sum >>= log2sz;
-
-    // subtract DC
-    for (ac = ac_orig, y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            ac[x] -= sum;
-        ac += width;
-    }
-}
-
-#define cfl_ac_fn(lw, lh, cw, ch, ss_hor, ss_ver, log2sz) \
-static void cfl_ac_##lw##x##lh##_to_##cw##x##ch##_c(int16_t *const ac, \
-                                                    const pixel *const ypx, \
-                                                    const ptrdiff_t stride, \
-                                                    const int w_pad, \
-                                                    const int h_pad) \
-{ \
-    cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver, log2sz); \
-}
-
-cfl_ac_fn( 8,  8,  4,  4, 1, 1, 4)
-cfl_ac_fn( 8, 16,  4,  8, 1, 1, 5)
-cfl_ac_fn( 8, 32,  4, 16, 1, 1, 6)
-cfl_ac_fn(16,  8,  8,  4, 1, 1, 5)
-cfl_ac_fn(16, 16,  8,  8, 1, 1, 6)
-cfl_ac_fn(16, 32,  8, 16, 1, 1, 7)
-cfl_ac_fn(32,  8, 16,  4, 1, 1, 6)
-cfl_ac_fn(32, 16, 16,  8, 1, 1, 7)
-cfl_ac_fn(32, 32, 16, 16, 1, 1, 8)
-
-cfl_ac_fn( 8,  4,  4,  4, 1, 0, 4)
-cfl_ac_fn( 8,  8,  4,  8, 1, 0, 5)
-cfl_ac_fn(16,  4,  8,  4, 1, 0, 5)
-cfl_ac_fn(16,  8,  8,  8, 1, 0, 6)
-cfl_ac_fn(16, 16,  8, 16, 1, 0, 7)
-cfl_ac_fn(32,  8, 16,  8, 1, 0, 7)
-cfl_ac_fn(32, 16, 16, 16, 1, 0, 8)
-cfl_ac_fn(32, 32, 16, 32, 1, 0, 9)
-
-cfl_ac_fn( 4,  4,  4,  4, 0, 0, 4)
-cfl_ac_fn( 4,  8,  4,  8, 0, 0, 5)
-cfl_ac_fn( 4, 16,  4, 16, 0, 0, 6)
-cfl_ac_fn( 8,  4,  8,  4, 0, 0, 5)
-cfl_ac_fn( 8,  8,  8,  8, 0, 0, 6)
-cfl_ac_fn( 8, 16,  8, 16, 0, 0, 7)
-cfl_ac_fn( 8, 32,  8, 32, 0, 0, 8)
-cfl_ac_fn(16,  4, 16,  4, 0, 0, 6)
-cfl_ac_fn(16,  8, 16,  8, 0, 0, 7)
-cfl_ac_fn(16, 16, 16, 16, 0, 0, 8)
-cfl_ac_fn(16, 32, 16, 32, 0, 0, 9)
-cfl_ac_fn(32,  8, 32,  8, 0, 0, 8)
-cfl_ac_fn(32, 16, 32, 16, 0, 0, 9)
-cfl_ac_fn(32, 32, 32, 32, 0, 0, 10)
-
-static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
-                       const uint16_t *const pal, const uint8_t *idx,
-                       const int w, const int h)
-{
-    for (int y = 0; y < h; y++) {
-        for (int x = 0; x < w; x++)
-            dst[x] = pal[idx[x]];
-        idx += w;
-        dst += PXSTRIDE(stride);
-    }
-}
-
-void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
-    c->intra_pred[DC_PRED      ] = ipred_dc_c;
-    c->intra_pred[DC_128_PRED  ] = ipred_dc_128_c;
-    c->intra_pred[TOP_DC_PRED  ] = ipred_dc_top_c;
-    c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
-    c->intra_pred[HOR_PRED     ] = ipred_h_c;
-    c->intra_pred[VERT_PRED    ] = ipred_v_c;
-    c->intra_pred[PAETH_PRED   ] = ipred_paeth_c;
-    c->intra_pred[SMOOTH_PRED  ] = ipred_smooth_c;
-    c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
-    c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
-    c->intra_pred[Z1_PRED      ] = ipred_z1_c;
-    c->intra_pred[Z2_PRED      ] = ipred_z2_c;
-    c->intra_pred[Z3_PRED      ] = ipred_z3_c;
-    c->intra_pred[FILTER_PRED  ] = ipred_filter_c;
-
-    // cfl functions are split per chroma subsampling type
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_4X4  ] = cfl_ac_8x8_to_4x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_4X8  ] = cfl_ac_8x16_to_4x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_4X16 ] = cfl_ac_8x32_to_4x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_8X4  ] = cfl_ac_16x8_to_8x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_8X8  ] = cfl_ac_16x16_to_8x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_8X16 ] = cfl_ac_16x32_to_8x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_16X4 ] = cfl_ac_32x8_to_16x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_16X8 ] = cfl_ac_32x16_to_16x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_16X16] = cfl_ac_32x32_to_16x16_c;
-
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_4X4  ] = cfl_ac_8x4_to_4x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_4X8  ] = cfl_ac_8x8_to_4x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X4  ] = cfl_ac_16x4_to_8x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_8X8  ] = cfl_ac_16x8_to_8x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X16 ] = cfl_ac_16x16_to_8x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X8 ] = cfl_ac_32x8_to_16x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_16X16] = cfl_ac_32x16_to_16x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X32] = cfl_ac_32x32_to_16x32_c;
-
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_4X4  ] = cfl_ac_4x4_to_4x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_4X8  ] = cfl_ac_4x8_to_4x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_4X16 ] = cfl_ac_4x16_to_4x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X4  ] = cfl_ac_8x4_to_8x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_8X8  ] = cfl_ac_8x8_to_8x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X16 ] = cfl_ac_8x16_to_8x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X32 ] = cfl_ac_8x32_to_8x32_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X4 ] = cfl_ac_16x4_to_16x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X8 ] = cfl_ac_16x8_to_16x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_16X16] = cfl_ac_16x16_to_16x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X32] = cfl_ac_16x32_to_16x32_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_32X8 ] = cfl_ac_32x8_to_32x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_32X16] = cfl_ac_32x16_to_32x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_32X32] = cfl_ac_32x32_to_32x32_c;
-
-    c->cfl_pred[DC_PRED     ] = ipred_cfl_c;
-    c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
-    c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
-    c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;
-
-    c->pal_pred = pal_pred_c;
-
-#if HAVE_ASM && ARCH_X86
-    bitfn(dav1d_intra_pred_dsp_init_x86)(c);
-#endif
-}
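
The intra-prediction code above is removed here and re-added further down as src/ipred_tmpl.c; such templated files are compiled once per enabled bit depth, with bitfn() pasting a per-bitdepth suffix onto each exported symbol such as dav1d_intra_pred_dsp_init, and the _tmpl file suffix makes it obvious at the file level which sources get that treatment. A minimal, self-contained sketch of the name-templating trick follows; CAT, CAT2 and the demo_* names are illustrative stand-ins, not dav1d's actual macros from common/bitdepth.h.

#include <stdio.h>

#define BITDEPTH 8                     /* set per compilation unit by the build */
#define CAT2(name, bd) name##_##bd##bpc
#define CAT(name, bd)  CAT2(name, bd)  /* two levels so BITDEPTH expands first */
#define demo_bitfn(name) CAT(name, BITDEPTH)

static void demo_bitfn(demo_init)(void) {  /* expands to demo_init_8bpc */
    printf("intra pred DSP init for %d bpc\n", BITDEPTH);
}

int main(void) {
    demo_bitfn(demo_init)();  /* calls demo_init_8bpc */
    return 0;
}

Building the same source again with BITDEPTH defined to 16 yields a second, independently named set of symbols, which is why the templated files need their own distinguishable names in the build.
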
--- a/src/ipred_prepare.c
+++ /dev/null
@@ -1,209 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <stdint.h>
-#include <string.h>
-
-#include "common/intops.h"
-
-#include "src/ipred_prepare.h"
-
-static const uint8_t av1_mode_conv[N_INTRA_PRED_MODES]
-                                  [2 /* have_left */][2 /* have_top */] =
-{
-    [DC_PRED]    = { { DC_128_PRED,  TOP_DC_PRED },
-                     { LEFT_DC_PRED, DC_PRED     } },
-    [PAETH_PRED] = { { DC_128_PRED,  VERT_PRED   },
-                     { HOR_PRED,     PAETH_PRED  } },
-};
-
-static const uint8_t av1_mode_to_angle_map[8] = {
-    90, 180, 45, 135, 113, 157, 203, 67
-};
-
-static const struct {
-    uint8_t needs_left:1;
-    uint8_t needs_top:1;
-    uint8_t needs_topleft:1;
-    uint8_t needs_topright:1;
-    uint8_t needs_bottomleft:1;
-} av1_intra_prediction_edges[N_IMPL_INTRA_PRED_MODES] = {
-    [DC_PRED]       = { .needs_top  = 1, .needs_left = 1 },
-    [VERT_PRED]     = { .needs_top  = 1 },
-    [HOR_PRED]      = { .needs_left = 1 },
-    [LEFT_DC_PRED]  = { .needs_left = 1 },
-    [TOP_DC_PRED]   = { .needs_top  = 1 },
-    [DC_128_PRED]   = { 0 },
-    [Z1_PRED]       = { .needs_top = 1, .needs_topright = 1,
-                        .needs_topleft = 1 },
-    [Z2_PRED]       = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
-    [Z3_PRED]       = { .needs_left = 1, .needs_bottomleft = 1,
-                        .needs_topleft = 1 },
-    [SMOOTH_PRED]   = { .needs_left = 1, .needs_top = 1 },
-    [SMOOTH_V_PRED] = { .needs_left = 1, .needs_top = 1 },
-    [SMOOTH_H_PRED] = { .needs_left = 1, .needs_top = 1 },
-    [PAETH_PRED]    = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
-    [FILTER_PRED]   = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
-};
-
-enum IntraPredMode
-bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
-                                  const int y, const int have_top,
-                                  const int w, const int h,
-                                  const enum EdgeFlags edge_flags,
-                                  const pixel *const dst,
-                                  const ptrdiff_t stride,
-                                  const pixel *prefilter_toplevel_sb_edge,
-                                  enum IntraPredMode mode, int *const angle,
-                                  const int tw, const int th,
-                                  pixel *const topleft_out)
-{
-    assert(y < h && x < w);
-
-    switch (mode) {
-    case VERT_PRED:
-    case HOR_PRED:
-    case DIAG_DOWN_LEFT_PRED:
-    case DIAG_DOWN_RIGHT_PRED:
-    case VERT_RIGHT_PRED:
-    case HOR_DOWN_PRED:
-    case HOR_UP_PRED:
-    case VERT_LEFT_PRED: {
-        *angle = av1_mode_to_angle_map[mode - VERT_PRED] + 3 * *angle;
-
-        if (*angle < 90) {
-            mode = have_top ? Z1_PRED : VERT_PRED;
-        } else if (*angle == 90) {
-            mode = VERT_PRED;
-        } else if (*angle < 180) {
-            mode = Z2_PRED;
-        } else if (*angle == 180) {
-            mode = HOR_PRED;
-        } else {
-            mode = have_left ? Z3_PRED : HOR_PRED;
-        }
-        break;
-    }
-    case DC_PRED:
-    case PAETH_PRED:
-        mode = av1_mode_conv[mode][have_left][have_top];
-        break;
-    default:
-        break;
-    }
-
-    const pixel *dst_top;
-    if (have_top &&
-        (av1_intra_prediction_edges[mode].needs_top ||
-         av1_intra_prediction_edges[mode].needs_topleft ||
-         (av1_intra_prediction_edges[mode].needs_left && !have_left)))
-    {
-        if (prefilter_toplevel_sb_edge) {
-            dst_top = &prefilter_toplevel_sb_edge[x * 4];
-        } else {
-            dst_top = &dst[-PXSTRIDE(stride)];
-        }
-    }
-
-    if (av1_intra_prediction_edges[mode].needs_left) {
-        const int sz = th << 2;
-        pixel *const left = &topleft_out[-sz];
-
-        if (have_left) {
-            const int px_have = imin(sz, (h - y) << 2);
-
-            for (int i = 0; i < px_have; i++)
-                left[sz - 1 - i] = dst[PXSTRIDE(stride) * i - 1];
-            if (px_have < sz)
-                pixel_set(left, left[sz - px_have], sz - px_have);
-        } else {
-            pixel_set(left, have_top ? *dst_top : ((1 << BITDEPTH) >> 1) + 1, sz);
-        }
-
-        if (av1_intra_prediction_edges[mode].needs_bottomleft) {
-            const int have_bottomleft = (!have_left || y + th >= h) ? 0 :
-                                        (edge_flags & EDGE_I444_LEFT_HAS_BOTTOM);
-
-            if (have_bottomleft) {
-                const int px_have = imin(sz, (h - y - th) << 2);
-
-                for (int i = 0; i < px_have; i++)
-                    left[-(i + 1)] = dst[(sz + i) * PXSTRIDE(stride) - 1];
-                if (px_have < sz)
-                    pixel_set(left - sz, left[-px_have], sz - px_have);
-            } else {
-                pixel_set(left - sz, left[0], sz);
-            }
-        }
-    }
-
-    if (av1_intra_prediction_edges[mode].needs_top) {
-        const int sz = tw << 2;
-        pixel *const top = &topleft_out[1];
-
-        if (have_top) {
-            const int px_have = imin(sz, (w - x) << 2);
-            pixel_copy(top, dst_top, px_have);
-            if (px_have < sz)
-                pixel_set(top + px_have, top[px_have - 1], sz - px_have);
-        } else {
-            pixel_set(top, have_left ? dst[-1] : ((1 << BITDEPTH) >> 1) - 1, sz);
-        }
-
-        if (av1_intra_prediction_edges[mode].needs_topright) {
-            const int have_topright = (!have_top || x + tw >= w) ? 0 :
-                                      (edge_flags & EDGE_I444_TOP_HAS_RIGHT);
-
-            if (have_topright) {
-                const int px_have = imin(sz, (w - x - tw) << 2);
-
-                pixel_copy(top + sz, &dst_top[sz], px_have);
-                if (px_have < sz)
-                    pixel_set(top + sz + px_have, top[sz + px_have - 1],
-                              sz - px_have);
-            } else {
-                pixel_set(top + sz, top[sz - 1], sz);
-            }
-        }
-    }
-
-    if (av1_intra_prediction_edges[mode].needs_topleft) {
-        if (have_left) {
-            *topleft_out = have_top ? dst_top[-1] : dst[-1];
-        } else {
-            *topleft_out = have_top ? *dst_top : (1 << BITDEPTH) >> 1;
-        }
-        if (mode == Z2_PRED && tw + th >= 6)
-            *topleft_out = (topleft_out[-1] * 5 + topleft_out[0] * 6 +
-                            topleft_out[1] * 5 + 8) >> 4;
-    }
-
-    return mode;
-}
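
In dav1d_prepare_intra_edges above, *angle arrives as a small signed delta (nominally -3..+3) around the mode's base direction; the base angle from av1_mode_to_angle_map plus 3 degrees per delta step then selects the plain vertical/horizontal cases or one of the three zone predictors. A standalone check of that arithmetic, reusing the table values from the code above (the chosen index and delta are just an example):

#include <stdio.h>

/* Base angles for the eight directional modes, starting at VERT_PRED,
 * copied from av1_mode_to_angle_map above. */
static const int base_angle[8] = { 90, 180, 45, 135, 113, 157, 203, 67 };

int main(void) {
    const int mode_idx = 2;  /* third directional mode, base angle 45 */
    const int delta = 3;     /* signed angle_delta step */
    const int angle = base_angle[mode_idx] + 3 * delta;  /* 45 + 9 = 54 */

    const char *pred = angle <  90 ? "Z1 (top edge only)" :
                       angle == 90 ? "plain vertical" :
                       angle < 180 ? "Z2 (top and left edges)" :
                       angle == 180 ? "plain horizontal" :
                                      "Z3 (left edge only)";
    printf("effective angle %d degrees -> %s\n", angle, pred);
    return 0;
}

The zone predictors themselves additionally unpack a smoothness hint from bit 9 of the packed angle (the angle >> 9 / angle &= 511 prologue in ipred_z1/z2/z3 above) before doing the directional interpolation.
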
--- /dev/null
+++ b/src/ipred_prepare_tmpl.c
@@ -1,0 +1,209 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/ipred_prepare.h"
+
+static const uint8_t av1_mode_conv[N_INTRA_PRED_MODES]
+                                  [2 /* have_left */][2 /* have_top */] =
+{
+    [DC_PRED]    = { { DC_128_PRED,  TOP_DC_PRED },
+                     { LEFT_DC_PRED, DC_PRED     } },
+    [PAETH_PRED] = { { DC_128_PRED,  VERT_PRED   },
+                     { HOR_PRED,     PAETH_PRED  } },
+};
+
+static const uint8_t av1_mode_to_angle_map[8] = {
+    90, 180, 45, 135, 113, 157, 203, 67
+};
+
+static const struct {
+    uint8_t needs_left:1;
+    uint8_t needs_top:1;
+    uint8_t needs_topleft:1;
+    uint8_t needs_topright:1;
+    uint8_t needs_bottomleft:1;
+} av1_intra_prediction_edges[N_IMPL_INTRA_PRED_MODES] = {
+    [DC_PRED]       = { .needs_top  = 1, .needs_left = 1 },
+    [VERT_PRED]     = { .needs_top  = 1 },
+    [HOR_PRED]      = { .needs_left = 1 },
+    [LEFT_DC_PRED]  = { .needs_left = 1 },
+    [TOP_DC_PRED]   = { .needs_top  = 1 },
+    [DC_128_PRED]   = { 0 },
+    [Z1_PRED]       = { .needs_top = 1, .needs_topright = 1,
+                        .needs_topleft = 1 },
+    [Z2_PRED]       = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+    [Z3_PRED]       = { .needs_left = 1, .needs_bottomleft = 1,
+                        .needs_topleft = 1 },
+    [SMOOTH_PRED]   = { .needs_left = 1, .needs_top = 1 },
+    [SMOOTH_V_PRED] = { .needs_left = 1, .needs_top = 1 },
+    [SMOOTH_H_PRED] = { .needs_left = 1, .needs_top = 1 },
+    [PAETH_PRED]    = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+    [FILTER_PRED]   = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+};
+
+enum IntraPredMode
+bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
+                                  const int y, const int have_top,
+                                  const int w, const int h,
+                                  const enum EdgeFlags edge_flags,
+                                  const pixel *const dst,
+                                  const ptrdiff_t stride,
+                                  const pixel *prefilter_toplevel_sb_edge,
+                                  enum IntraPredMode mode, int *const angle,
+                                  const int tw, const int th,
+                                  pixel *const topleft_out)
+{
+    assert(y < h && x < w);
+
+    switch (mode) {
+    case VERT_PRED:
+    case HOR_PRED:
+    case DIAG_DOWN_LEFT_PRED:
+    case DIAG_DOWN_RIGHT_PRED:
+    case VERT_RIGHT_PRED:
+    case HOR_DOWN_PRED:
+    case HOR_UP_PRED:
+    case VERT_LEFT_PRED: {
+        *angle = av1_mode_to_angle_map[mode - VERT_PRED] + 3 * *angle;
+
+        if (*angle < 90) {
+            mode = have_top ? Z1_PRED : VERT_PRED;
+        } else if (*angle == 90) {
+            mode = VERT_PRED;
+        } else if (*angle < 180) {
+            mode = Z2_PRED;
+        } else if (*angle == 180) {
+            mode = HOR_PRED;
+        } else {
+            mode = have_left ? Z3_PRED : HOR_PRED;
+        }
+        break;
+    }
+    case DC_PRED:
+    case PAETH_PRED:
+        mode = av1_mode_conv[mode][have_left][have_top];
+        break;
+    default:
+        break;
+    }
+
+    const pixel *dst_top;
+    if (have_top &&
+        (av1_intra_prediction_edges[mode].needs_top ||
+         av1_intra_prediction_edges[mode].needs_topleft ||
+         (av1_intra_prediction_edges[mode].needs_left && !have_left)))
+    {
+        if (prefilter_toplevel_sb_edge) {
+            dst_top = &prefilter_toplevel_sb_edge[x * 4];
+        } else {
+            dst_top = &dst[-PXSTRIDE(stride)];
+        }
+    }
+
+    if (av1_intra_prediction_edges[mode].needs_left) {
+        const int sz = th << 2;
+        pixel *const left = &topleft_out[-sz];
+
+        if (have_left) {
+            const int px_have = imin(sz, (h - y) << 2);
+
+            for (int i = 0; i < px_have; i++)
+                left[sz - 1 - i] = dst[PXSTRIDE(stride) * i - 1];
+            if (px_have < sz)
+                pixel_set(left, left[sz - px_have], sz - px_have);
+        } else {
+            pixel_set(left, have_top ? *dst_top : ((1 << BITDEPTH) >> 1) + 1, sz);
+        }
+
+        if (av1_intra_prediction_edges[mode].needs_bottomleft) {
+            const int have_bottomleft = (!have_left || y + th >= h) ? 0 :
+                                        (edge_flags & EDGE_I444_LEFT_HAS_BOTTOM);
+
+            if (have_bottomleft) {
+                const int px_have = imin(sz, (h - y - th) << 2);
+
+                for (int i = 0; i < px_have; i++)
+                    left[-(i + 1)] = dst[(sz + i) * PXSTRIDE(stride) - 1];
+                if (px_have < sz)
+                    pixel_set(left - sz, left[-px_have], sz - px_have);
+            } else {
+                pixel_set(left - sz, left[0], sz);
+            }
+        }
+    }
+
+    if (av1_intra_prediction_edges[mode].needs_top) {
+        const int sz = tw << 2;
+        pixel *const top = &topleft_out[1];
+
+        if (have_top) {
+            const int px_have = imin(sz, (w - x) << 2);
+            pixel_copy(top, dst_top, px_have);
+            if (px_have < sz)
+                pixel_set(top + px_have, top[px_have - 1], sz - px_have);
+        } else {
+            pixel_set(top, have_left ? dst[-1] : ((1 << BITDEPTH) >> 1) - 1, sz);
+        }
+
+        if (av1_intra_prediction_edges[mode].needs_topright) {
+            const int have_topright = (!have_top || x + tw >= w) ? 0 :
+                                      (edge_flags & EDGE_I444_TOP_HAS_RIGHT);
+
+            if (have_topright) {
+                const int px_have = imin(sz, (w - x - tw) << 2);
+
+                pixel_copy(top + sz, &dst_top[sz], px_have);
+                if (px_have < sz)
+                    pixel_set(top + sz + px_have, top[sz + px_have - 1],
+                              sz - px_have);
+            } else {
+                pixel_set(top + sz, top[sz - 1], sz);
+            }
+        }
+    }
+
+    if (av1_intra_prediction_edges[mode].needs_topleft) {
+        if (have_left) {
+            *topleft_out = have_top ? dst_top[-1] : dst[-1];
+        } else {
+            *topleft_out = have_top ? *dst_top : (1 << BITDEPTH) >> 1;
+        }
+        if (mode == Z2_PRED && tw + th >= 6)
+            *topleft_out = (topleft_out[-1] * 5 + topleft_out[0] * 6 +
+                            topleft_out[1] * 5 + 8) >> 4;
+    }
+
+    return mode;
+}
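
In dc_gen in the re-added src/ipred_tmpl.c below, a rectangular block sums width + height edge pixels, but the shift by ctz(width + height) only divides out the power-of-two factor; the MULTIPLIER_1x2 / MULTIPLIER_1x4 constants then act as a fixed-point divide by the remaining 3 or 5. A small arithmetic check of the 8 bpc constants, with the values taken from the file below and the block size chosen just for illustration:

#include <stdio.h>

int main(void) {
    /* 4x8 block whose 12 edge pixels are all 100, so the true mean is 100. */
    const int width = 4, height = 8, px = 100;

    unsigned dc = (width + height) >> 1;    /* rounding bias: 6                */
    dc += (unsigned)(width + height) * px;  /* sum of the edge pixels: 1200    */
    dc >>= 2;                               /* ctz(12) == 2: 1206 >> 2 = 301   */
    dc = (dc * 0x5556) >> 16;               /* MULTIPLIER_1x2, ~dc/3: 100      */

    printf("dc = %u\n", dc);                /* prints 100 */
    return 0;
}

The 16 bpc path uses the wider 0xAAAB / 0x6667 constants with BASE_SHIFT 17, which encode the same 1/3 and 1/5 fractions at one extra bit of precision for the larger pixel sums.
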
--- /dev/null
+++ b/src/ipred_tmpl.c
@@ -1,0 +1,757 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/ipred.h"
+#include "src/tables.h"
+
+static NOINLINE void
+splat_dc(pixel *dst, const ptrdiff_t stride,
+         const int width, const int height, const unsigned dc)
+{
+    assert(dc <= (1 << BITDEPTH) - 1);
+#if BITDEPTH == 8
+    if (width > 4) {
+        const uint64_t dcN = dc * 0x0101010101010101ULL;
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x += sizeof(dcN))
+                *((uint64_t *) &dst[x]) = dcN;
+            dst += PXSTRIDE(stride);
+        }
+    } else {
+        const unsigned dcN = dc * 0x01010101U;
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x += sizeof(dcN))
+                *((unsigned *) &dst[x]) = dcN;
+            dst += PXSTRIDE(stride);
+        }
+    }
+#else
+    const uint64_t dcN = dc * 0x0001000100010001ULL;
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x += sizeof(dcN) >> 1)
+            *((uint64_t *) &dst[x]) = dcN;
+        dst += PXSTRIDE(stride);
+    }
+#endif
+}
+
+static NOINLINE void
+cfl_pred(pixel *dst, const ptrdiff_t stride,
+         const int width, const int height, const unsigned dc,
+         const int16_t *ac, const int alpha)
+{
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            const int diff = alpha * ac[x];
+            dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff));
+        }
+        ac += width;
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static unsigned dc_gen_top(const pixel *const topleft, const int width)
+{
+    unsigned dc = width >> 1;
+    for (int i = 0; i < width; i++)
+       dc += topleft[1 + i];
+    return dc >> ctz(width);
+}
+
+static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a)
+{
+    splat_dc(dst, stride, width, height, dc_gen_top(topleft, width));
+}
+
+static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
+                            const pixel *const topleft,
+                            const int width, const int height,
+                            const int16_t *ac, const int alpha)
+{
+    cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha);
+}
+
+static unsigned dc_gen_left(const pixel *const topleft, const int height)
+{
+    unsigned dc = height >> 1;
+    for (int i = 0; i < height; i++)
+       dc += topleft[-(1 + i)];
+    return dc >> ctz(height);
+}
+
+static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
+                            const pixel *const topleft,
+                            const int width, const int height, const int a)
+{
+    splat_dc(dst, stride, width, height, dc_gen_left(topleft, height));
+}
+
+static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
+                             const pixel *const topleft,
+                             const int width, const int height,
+                             const int16_t *ac, const int alpha)
+{
+    unsigned dc = dc_gen_left(topleft, height);
+    cfl_pred(dst, stride, width, height, dc, ac, alpha);
+}
+
+#if BITDEPTH == 8
+#define MULTIPLIER_1x2 0x5556
+#define MULTIPLIER_1x4 0x3334
+#define BASE_SHIFT 16
+#else
+#define MULTIPLIER_1x2 0xAAAB
+#define MULTIPLIER_1x4 0x6667
+#define BASE_SHIFT 17
+#endif
+
+static unsigned
+dc_gen(const pixel *const topleft, const int width, const int height)
+{
+    unsigned dc = (width + height) >> 1;
+    for (int i = 0; i < width; i++)
+       dc += topleft[i + 1];
+    for (int i = 0; i < height; i++)
+       dc += topleft[-(i + 1)];
+    dc >>= ctz(width + height);
+
+    if (width != height) {
+        dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 :
+                                                           MULTIPLIER_1x2;
+        dc >>= BASE_SHIFT;
+    }
+    return dc;
+}
+
+static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft,
+                       const int width, const int height, const int a)
+{
+    splat_dc(dst, stride, width, height, dc_gen(topleft, width, height));
+}
+
+static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
+                        const pixel *const topleft,
+                        const int width, const int height,
+                        const int16_t *ac, const int alpha)
+{
+    unsigned dc = dc_gen(topleft, width, height);
+    cfl_pred(dst, stride, width, height, dc, ac, alpha);
+}
+
+#undef MULTIPLIER_1x2
+#undef MULTIPLIER_1x4
+#undef BASE_SHIFT
+
+static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a)
+{
+    splat_dc(dst, stride, width, height, 1 << (BITDEPTH - 1));
+}
+
+static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
+                            const pixel *const topleft,
+                            const int width, const int height,
+                            const int16_t *ac, const int alpha)
+{
+    cfl_pred(dst, stride, width, height, 1 << (BITDEPTH - 1), ac, alpha);
+}
+
+static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
+                      const pixel *const topleft,
+                      const int width, const int height, const int a)
+{
+    for (int y = 0; y < height; y++) {
+        pixel_copy(dst, topleft + 1, width);
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
+                      const pixel *const topleft,
+                      const int width, const int height, const int a)
+{
+    for (int y = 0; y < height; y++) {
+        pixel_set(dst, topleft[-(1 + y)], width);
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
+                          const pixel *const tl_ptr,
+                          const int width, const int height, const int a)
+{
+    const int topleft = tl_ptr[0];
+    for (int y = 0; y < height; y++) {
+        const int left = tl_ptr[-(y + 1)];
+        for (int x = 0; x < width; x++) {
+            const int top = tl_ptr[1 + x];
+            const int base = left + top - topleft;
+            const int ldiff = abs(left - base);
+            const int tdiff = abs(top - base);
+            const int tldiff = abs(topleft - base);
+
+            dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left :
+                     tdiff <= tldiff ? top : topleft;
+        }
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a)
+{
+    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
+    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
+    const int right = topleft[width], bottom = topleft[-height];
+
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            const int pred = weights_ver[y]  * topleft[1 + x] +
+                      (256 - weights_ver[y]) * bottom +
+                             weights_hor[x]  * topleft[-(1 + y)] +
+                      (256 - weights_hor[x]) * right;
+            dst[x] = (pred + 256) >> 9;
+        }
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
+                             const pixel *const topleft,
+                             const int width, const int height, const int a)
+{
+    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
+    const int bottom = topleft[-height];
+
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            const int pred = weights_ver[y]  * topleft[1 + x] +
+                      (256 - weights_ver[y]) * bottom;
+            dst[x] = (pred + 128) >> 8;
+        }
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
+                             const pixel *const topleft,
+                             const int width, const int height, const int a)
+{
+    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
+    const int right = topleft[width];
+
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            const int pred = weights_hor[x]  * topleft[-(y + 1)] +
+                      (256 - weights_hor[x]) * right;
+            dst[x] = (pred + 128) >> 8;
+        }
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static int get_filter_strength(const unsigned blk_wh, const unsigned d,
+                               const int type)
+{
+    int strength = 0;
+
+    if (type == 0) {
+        if (blk_wh <= 8) {
+            if (d >= 56) strength = 1;
+        } else if (blk_wh <= 12) {
+            if (d >= 40) strength = 1;
+        } else if (blk_wh <= 16) {
+            if (d >= 40) strength = 1;
+        } else if (blk_wh <= 24) {
+            if (d >= 8) strength = 1;
+            if (d >= 16) strength = 2;
+            if (d >= 32) strength = 3;
+        } else if (blk_wh <= 32) {
+            if (d >= 1) strength = 1;
+            if (d >= 4) strength = 2;
+            if (d >= 32) strength = 3;
+        } else {
+            if (d >= 1) strength = 3;
+        }
+    } else {
+        if (blk_wh <= 8) {
+            if (d >= 40) strength = 1;
+            if (d >= 64) strength = 2;
+        } else if (blk_wh <= 16) {
+            if (d >= 20) strength = 1;
+            if (d >= 48) strength = 2;
+        } else if (blk_wh <= 24) {
+            if (d >= 4) strength = 3;
+        } else {
+            if (d >= 1) strength = 3;
+        }
+    }
+
+    return strength;
+}
+
+static void filter_edge(pixel *const out, const int sz, const pixel *const in,
+                        const int from, const int to, const unsigned strength)
+{
+    static const uint8_t kernel[3][5] = {
+        { 0, 4, 8, 4, 0 },
+        { 0, 5, 6, 5, 0 },
+        { 2, 4, 4, 4, 2 }
+    };
+
+    assert(strength > 0);
+    for (int i = 0; i < sz; i++) {
+        int s = 0;
+        for (int j = 0; j < 5; j++)
+            s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j];
+        out[i] = (s + 8) >> 4;
+    }
+}
+
+static int get_upsample(const int blk_wh, const unsigned d, const int type) {
+    if (d >= 40) return 0;
+    return type ? (blk_wh <= 8) : (blk_wh <= 16);
+}
+
+static void upsample_edge(pixel *const out, const int hsz,
+                          const pixel *const in, const int from, const int to)
+{
+    static const int8_t kernel[4] = { -1, 9, 9, -1 };
+    int i;
+    for (i = 0; i < hsz - 1; i++) {
+        out[i * 2] = in[iclip(i, from, to - 1)];
+
+        int s = 0;
+        for (int j = 0; j < 4; j++)
+            s += in[iclip(i + j - 1, from, to - 1)] * kernel[j];
+        out[i * 2 + 1] = iclip_pixel((s + 8) >> 4);
+    }
+    out[i * 2] = in[iclip(i, from, to - 1)];
+}
+
+static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft_in,
+                       const int width, const int height, int angle)
+{
+    const int is_sm = angle >> 9;
+    angle &= 511;
+    assert(angle < 90);
+    const int dx = dav1d_dr_intra_derivative[angle];
+    pixel top_out[(64 + 64) * 2];
+    const pixel *top;
+    int max_base_x;
+    const int upsample_above = get_upsample(width + height, 90 - angle, is_sm);
+    if (upsample_above) {
+        upsample_edge(top_out, width + height,
+                      &topleft_in[1], -1, width + imin(width, height));
+        top = top_out;
+        max_base_x = 2 * (width + height) - 2;
+    } else {
+        const int filter_strength =
+            get_filter_strength(width + height, 90 - angle, is_sm);
+
+        if (filter_strength) {
+            filter_edge(top_out, width + height,
+                        &topleft_in[1], -1, width + imin(width, height),
+                        filter_strength);
+            top = top_out;
+            max_base_x = width + height - 1;
+        } else {
+            top = &topleft_in[1];
+            max_base_x = width + imin(width, height) - 1;
+        }
+    }
+    const int frac_bits = 6 - upsample_above;
+    const int base_inc = 1 << upsample_above;
+    for (int y = 0, xpos = dx; y < height;
+         y++, dst += PXSTRIDE(stride), xpos += dx)
+    {
+        int base = xpos >> frac_bits;
+        const int frac = ((xpos << upsample_above) & 0x3F) >> 1;
+
+        for (int x = 0; x < width; x++, base += base_inc) {
+            if (base < max_base_x) {
+                const int v = top[base] * (32 - frac) + top[base + 1] * frac;
+                dst[x] = iclip_pixel((v + 16) >> 5);
+            } else {
+                pixel_set(&dst[x], top[max_base_x], width - x);
+                break;
+            }
+        }
+    }
+}
+
+static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft_in,
+                       const int width, const int height, int angle)
+{
+    const int is_sm = angle >> 9;
+    angle &= 511;
+    assert(angle > 90 && angle < 180);
+    const int dy = dav1d_dr_intra_derivative[angle - 90];
+    const int dx = dav1d_dr_intra_derivative[180 - angle];
+    const int upsample_left = get_upsample(width + height, 180 - angle, is_sm);
+    const int upsample_above = get_upsample(width + height, angle - 90, is_sm);
+    pixel edge[64 * 2 + 64 * 2 + 1];
+    pixel *const topleft = &edge[height * 2];
+
+    if (upsample_above) {
+        upsample_edge(topleft, width + 1, topleft_in, 0, width + 1);
+    } else {
+        const int filter_strength =
+            get_filter_strength(width + height, angle - 90, is_sm);
+
+        if (filter_strength) {
+            filter_edge(&topleft[1], width, &topleft_in[1], -1, width,
+                        filter_strength);
+        } else {
+            pixel_copy(&topleft[1], &topleft_in[1], width);
+        }
+    }
+    if (upsample_left) {
+        upsample_edge(edge, height + 1, &topleft_in[-height], 0, height + 1);
+    } else {
+        const int filter_strength =
+            get_filter_strength(width + height, 180 - angle, is_sm);
+
+        if (filter_strength) {
+            filter_edge(&topleft[-height], height, &topleft_in[-height],
+                        0, height + 1, filter_strength);
+        } else {
+            pixel_copy(&topleft[-height], &topleft_in[-height], height);
+        }
+    }
+    *topleft = *topleft_in;
+
+    const int min_base_x = -(1 << upsample_above);
+    const int frac_bits_y = 6 - upsample_left, frac_bits_x = 6 - upsample_above;
+    const int base_inc_x = 1 << upsample_above;
+    const pixel *const left = &topleft[-(1 << upsample_left)];
+    const pixel *const top = &topleft[1 << upsample_above];
+    for (int y = 0, xpos = -dx; y < height;
+         y++, xpos -= dx, dst += PXSTRIDE(stride))
+    {
+        int base_x = xpos >> frac_bits_x;
+        const int frac_x = ((xpos * (1 << upsample_above)) & 0x3F) >> 1;
+
+        for (int x = 0, ypos = (y << 6) - dy; x < width;
+             x++, base_x += base_inc_x, ypos -= dy)
+        {
+            int v;
+
+            if (base_x >= min_base_x) {
+                v = top[base_x] * (32 - frac_x) + top[base_x + 1] * frac_x;
+            } else {
+                const int base_y = ypos >> frac_bits_y;
+                assert(base_y >= -(1 << upsample_left));
+                const int frac_y = ((ypos * (1 << upsample_left)) & 0x3F) >> 1;
+                v = left[-base_y] * (32 - frac_y) + left[-(base_y + 1)] * frac_y;
+            }
+            dst[x] = iclip_pixel((v + 16) >> 5);
+        }
+    }
+}
+
+static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft_in,
+                       const int width, const int height, int angle)
+{
+    const int is_sm = angle >> 9;
+    angle &= 511;
+    assert(angle > 180);
+    const int dy = dav1d_dr_intra_derivative[270 - angle];
+    pixel left_out[(64 + 64) * 2];
+    const pixel *left;
+    int max_base_y;
+    const int upsample_left = get_upsample(width + height, angle - 180, is_sm);
+    if (upsample_left) {
+        upsample_edge(left_out, width + height,
+                      &topleft_in[-(width + height)],
+                      imax(width - height, 0), width + height + 1);
+        left = &left_out[2 * (width + height) - 2];
+        max_base_y = 2 * (width + height) - 2;
+    } else {
+        const int filter_strength =
+            get_filter_strength(width + height, angle - 180, is_sm);
+
+        if (filter_strength) {
+            filter_edge(left_out, width + height,
+                        &topleft_in[-(width + height)],
+                        imax(width - height, 0), width + height + 1,
+                        filter_strength);
+            left = &left_out[width + height - 1];
+            max_base_y = width + height - 1;
+        } else {
+            left = &topleft_in[-1];
+            max_base_y = height + imin(width, height) - 1;
+        }
+    }
+    const int frac_bits = 6 - upsample_left;
+    const int base_inc = 1 << upsample_left;
+    for (int x = 0, ypos = dy; x < width; x++, ypos += dy) {
+        int base = ypos >> frac_bits;
+        const int frac = ((ypos << upsample_left) & 0x3F) >> 1;
+
+        for (int y = 0; y < height; y++, base += base_inc) {
+            if (base < max_base_y) {
+                const int v = left[-base] * (32 - frac) +
+                              left[-(base + 1)] * frac;
+                dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 16) >> 5);
+            } else {
+                do {
+                    dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
+                } while (++y < height);
+                break;
+            }
+        }
+    }
+}
+
+/* Up to 32x32 only */
+static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft_in,
+                           const int width, const int height, int filt_idx)
+{
+    filt_idx &= 511;
+    assert(filt_idx < 5);
+
+    const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
+    int x, y;
+    ptrdiff_t left_stride;
+    const pixel *left, *topleft, *top;
+
+    top = &topleft_in[1];
+    for (y = 0; y < height; y += 2) {
+        topleft = &topleft_in[-y];
+        left = &topleft[-1];
+        left_stride = -1;
+        for (x = 0; x < width; x += 4) {
+            const int p0 = *topleft;
+            const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
+            const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
+            pixel *ptr = &dst[x];
+            const int8_t *flt_ptr = filter;
+
+            for (int yy = 0; yy < 2; yy++) {
+                for (int xx = 0; xx < 4; xx++, flt_ptr += 2) {
+                    int acc = flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +
+                              flt_ptr[16] * p2 + flt_ptr[17] * p3 +
+                              flt_ptr[32] * p4 + flt_ptr[33] * p5 +
+                              flt_ptr[48] * p6;
+                    ptr[xx] = iclip_pixel((acc + 8) >> 4);
+                }
+                ptr += PXSTRIDE(stride);
+            }
+            left = &dst[x + 4 - 1];
+            left_stride = PXSTRIDE(stride);
+            top += 4;
+            topleft = &top[-1];
+        }
+        top = &dst[PXSTRIDE(stride)];
+        dst = &dst[PXSTRIDE(stride) * 2];
+    }
+}
+
+static NOINLINE void
+cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
+         const int w_pad, const int h_pad, const int width, const int height,
+         const int ss_hor, const int ss_ver, const int log2sz)
+{
+    int y, x;
+    int16_t *const ac_orig = ac;
+
+    assert(w_pad >= 0 && w_pad * 4 < width);
+    assert(h_pad >= 0 && h_pad * 4 < height);
+
+    for (y = 0; y < height - 4 * h_pad; y++) {
+        for (x = 0; x < width - 4 * w_pad; x++) {
+            int ac_sum = ypx[x << ss_hor];
+            if (ss_hor) ac_sum += ypx[x * 2 + 1];
+            if (ss_ver) {
+                ac_sum += ypx[(x << ss_hor) + PXSTRIDE(stride)];
+                if (ss_hor) ac_sum += ypx[x * 2 + 1 + PXSTRIDE(stride)];
+            }
+            ac[x] = ac_sum << (1 + !ss_ver + !ss_hor);
+        }
+        for (; x < width; x++)
+            ac[x] = ac[x - 1];
+        ac += width;
+        ypx += PXSTRIDE(stride) << ss_ver;
+    }
+    for (; y < height; y++) {
+        memcpy(ac, &ac[-width], width * sizeof(*ac));
+        ac += width;
+    }
+
+    int sum = (1 << log2sz) >> 1;
+    for (ac = ac_orig, y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            sum += ac[x];
+        ac += width;
+    }
+    sum >>= log2sz;
+
+    // subtract DC
+    for (ac = ac_orig, y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            ac[x] -= sum;
+        ac += width;
+    }
+}
+
+#define cfl_ac_fn(lw, lh, cw, ch, ss_hor, ss_ver, log2sz) \
+static void cfl_ac_##lw##x##lh##_to_##cw##x##ch##_c(int16_t *const ac, \
+                                                    const pixel *const ypx, \
+                                                    const ptrdiff_t stride, \
+                                                    const int w_pad, \
+                                                    const int h_pad) \
+{ \
+    cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver, log2sz); \
+}
+
+cfl_ac_fn( 8,  8,  4,  4, 1, 1, 4)
+cfl_ac_fn( 8, 16,  4,  8, 1, 1, 5)
+cfl_ac_fn( 8, 32,  4, 16, 1, 1, 6)
+cfl_ac_fn(16,  8,  8,  4, 1, 1, 5)
+cfl_ac_fn(16, 16,  8,  8, 1, 1, 6)
+cfl_ac_fn(16, 32,  8, 16, 1, 1, 7)
+cfl_ac_fn(32,  8, 16,  4, 1, 1, 6)
+cfl_ac_fn(32, 16, 16,  8, 1, 1, 7)
+cfl_ac_fn(32, 32, 16, 16, 1, 1, 8)
+
+cfl_ac_fn( 8,  4,  4,  4, 1, 0, 4)
+cfl_ac_fn( 8,  8,  4,  8, 1, 0, 5)
+cfl_ac_fn(16,  4,  8,  4, 1, 0, 5)
+cfl_ac_fn(16,  8,  8,  8, 1, 0, 6)
+cfl_ac_fn(16, 16,  8, 16, 1, 0, 7)
+cfl_ac_fn(32,  8, 16,  8, 1, 0, 7)
+cfl_ac_fn(32, 16, 16, 16, 1, 0, 8)
+cfl_ac_fn(32, 32, 16, 32, 1, 0, 9)
+
+cfl_ac_fn( 4,  4,  4,  4, 0, 0, 4)
+cfl_ac_fn( 4,  8,  4,  8, 0, 0, 5)
+cfl_ac_fn( 4, 16,  4, 16, 0, 0, 6)
+cfl_ac_fn( 8,  4,  8,  4, 0, 0, 5)
+cfl_ac_fn( 8,  8,  8,  8, 0, 0, 6)
+cfl_ac_fn( 8, 16,  8, 16, 0, 0, 7)
+cfl_ac_fn( 8, 32,  8, 32, 0, 0, 8)
+cfl_ac_fn(16,  4, 16,  4, 0, 0, 6)
+cfl_ac_fn(16,  8, 16,  8, 0, 0, 7)
+cfl_ac_fn(16, 16, 16, 16, 0, 0, 8)
+cfl_ac_fn(16, 32, 16, 32, 0, 0, 9)
+cfl_ac_fn(32,  8, 32,  8, 0, 0, 8)
+cfl_ac_fn(32, 16, 32, 16, 0, 0, 9)
+cfl_ac_fn(32, 32, 32, 32, 0, 0, 10)
+
+static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
+                       const uint16_t *const pal, const uint8_t *idx,
+                       const int w, const int h)
+{
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++)
+            dst[x] = pal[idx[x]];
+        idx += w;
+        dst += PXSTRIDE(stride);
+    }
+}
+
+void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
+    c->intra_pred[DC_PRED      ] = ipred_dc_c;
+    c->intra_pred[DC_128_PRED  ] = ipred_dc_128_c;
+    c->intra_pred[TOP_DC_PRED  ] = ipred_dc_top_c;
+    c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
+    c->intra_pred[HOR_PRED     ] = ipred_h_c;
+    c->intra_pred[VERT_PRED    ] = ipred_v_c;
+    c->intra_pred[PAETH_PRED   ] = ipred_paeth_c;
+    c->intra_pred[SMOOTH_PRED  ] = ipred_smooth_c;
+    c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
+    c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
+    c->intra_pred[Z1_PRED      ] = ipred_z1_c;
+    c->intra_pred[Z2_PRED      ] = ipred_z2_c;
+    c->intra_pred[Z3_PRED      ] = ipred_z3_c;
+    c->intra_pred[FILTER_PRED  ] = ipred_filter_c;
+
+    // cfl functions are split per chroma subsampling type
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_4X4  ] = cfl_ac_8x8_to_4x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_4X8  ] = cfl_ac_8x16_to_4x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_4X16 ] = cfl_ac_8x32_to_4x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_8X4  ] = cfl_ac_16x8_to_8x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_8X8  ] = cfl_ac_16x16_to_8x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_8X16 ] = cfl_ac_16x32_to_8x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_16X4 ] = cfl_ac_32x8_to_16x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_16X8 ] = cfl_ac_32x16_to_16x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_16X16] = cfl_ac_32x32_to_16x16_c;
+
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_4X4  ] = cfl_ac_8x4_to_4x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_4X8  ] = cfl_ac_8x8_to_4x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X4  ] = cfl_ac_16x4_to_8x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_8X8  ] = cfl_ac_16x8_to_8x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X16 ] = cfl_ac_16x16_to_8x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X8 ] = cfl_ac_32x8_to_16x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_16X16] = cfl_ac_32x16_to_16x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X32] = cfl_ac_32x32_to_16x32_c;
+
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_4X4  ] = cfl_ac_4x4_to_4x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_4X8  ] = cfl_ac_4x8_to_4x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_4X16 ] = cfl_ac_4x16_to_4x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X4  ] = cfl_ac_8x4_to_8x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_8X8  ] = cfl_ac_8x8_to_8x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X16 ] = cfl_ac_8x16_to_8x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X32 ] = cfl_ac_8x32_to_8x32_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X4 ] = cfl_ac_16x4_to_16x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X8 ] = cfl_ac_16x8_to_16x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_16X16] = cfl_ac_16x16_to_16x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X32] = cfl_ac_16x32_to_16x32_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_32X8 ] = cfl_ac_32x8_to_32x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_32X16] = cfl_ac_32x16_to_32x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_32X32] = cfl_ac_32x32_to_32x32_c;
+
+    c->cfl_pred[DC_PRED     ] = ipred_cfl_c;
+    c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
+    c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
+    c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;
+
+    c->pal_pred = pal_pred_c;
+
+#if HAVE_ASM && ARCH_X86
+    bitfn(dav1d_intra_pred_dsp_init_x86)(c);
+#endif
+}
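
Note on the CfL helpers above: cfl_ac_c() maps the co-located luma block onto the chroma grid by summing the 1, 2 or 4 luma samples per chroma position (depending on ss_hor/ss_ver) and scaling with << (1 + !ss_ver + !ss_hor) so every layout lands in the same fixed-point range, replicates the last column/row into any padded area, and finally subtracts the rounded block mean so the buffer holds a zero-mean AC signal for chroma-from-luma prediction. A minimal standalone sketch of that idea, restricted to 8-bit 4:2:0 input for clarity and not part of the patch itself (function name is illustrative):

#include <stdint.h>
#include <stddef.h>

static void cfl_ac_420_sketch(int16_t *ac, const uint8_t *luma,
                              const ptrdiff_t stride, const int cw, const int ch)
{
    int sum = cw * ch / 2;                   /* rounding bias for the mean */
    for (int y = 0; y < ch; y++, luma += 2 * stride) {
        for (int x = 0; x < cw; x++) {
            /* sum of the 2x2 co-located luma samples */
            const int s = luma[2 * x] + luma[2 * x + 1] +
                          luma[2 * x + stride] + luma[2 * x + 1 + stride];
            ac[y * cw + x] = s << 1;         /* == luma << 3, matching cfl_ac_c's scaling */
            sum += s << 1;
        }
    }
    const int avg = sum / (cw * ch);         /* rounded block DC */
    for (int n = 0; n < cw * ch; n++)
        ac[n] -= avg;                        /* subtract DC -> zero-mean AC */
}
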
--- a/src/itx.c
+++ /dev/null
@@ -1,233 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-
-#include "common/attributes.h"
-#include "common/intops.h"
-
-#include "src/itx.h"
-
-#include "src/itx_1d.c"
-
-typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
-                          coef *out, ptrdiff_t out_s);
-
-static void NOINLINE
-inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
-               coef *const coeff, const int eob,
-               const int w, const int h, const int shift1, const int shift2,
-               const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn)
-{
-    int i, j;
-    const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);
-    assert((h >= 4 || h <= 64) && (w >= 4 || w <= 64));
-    // Maximum value for h and w is 64
-    coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
-    const int is_rect2 = w * 2 == h || h * 2 == w;
-
-    if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
-    const int rnd1 = (1 << shift1) >> 1;
-    for (i = 0; i < sh; i++) {
-        if (w != sw || is_rect2) {
-            for (j = 0; j < sw; j++) {
-                in_mem[j] = coeff[i + j * sh];
-                if (is_rect2)
-                    in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
-            }
-            first_1d_fn(in_mem, 1, &tmp[i * w], 1);
-        } else {
-            first_1d_fn(&coeff[i], sh, &tmp[i * w], 1);
-        }
-        for (j = 0; j < w; j++)
-            tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
-    }
-
-    if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
-    const int rnd2 = (1 << shift2) >> 1;
-    for (i = 0; i < w; i++) {
-        second_1d_fn(&tmp[i], w, out, 1);
-        for (j = 0; j < h; j++)
-            dst[i + j * PXSTRIDE(stride)] =
-                iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
-                            ((out[j] + (rnd2)) >> shift2));
-    }
-    memset(coeff, 0, sizeof(*coeff) * sh * sw);
-}
-
-#define inv_txfm_fn(type1, type2, w, h, shift1, shift2) \
-static void \
-inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
-                                               const ptrdiff_t stride, \
-                                               coef *const coeff, \
-                                               const int eob) \
-{ \
-    inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift1, shift2, \
-                   inv_##type1##w##_1d, inv_##type2##h##_1d); \
-}
-
-#define inv_txfm_fn64(w, h, shift1, shift2) \
-inv_txfm_fn(dct, dct, w, h, shift1, shift2)
-
-#define inv_txfm_fn32(w, h, shift1, shift2) \
-inv_txfm_fn64(w, h, shift1, shift2) \
-inv_txfm_fn(identity, identity, w, h, shift1, shift2)
-
-#define inv_txfm_fn16(w, h, shift1, shift2) \
-inv_txfm_fn32(w, h, shift1, shift2) \
-inv_txfm_fn(adst,     dct,      w, h, shift1, shift2) \
-inv_txfm_fn(dct,      adst,     w, h, shift1, shift2) \
-inv_txfm_fn(adst,     adst,     w, h, shift1, shift2) \
-inv_txfm_fn(dct,      flipadst, w, h, shift1, shift2) \
-inv_txfm_fn(flipadst, dct,      w, h, shift1, shift2) \
-inv_txfm_fn(adst,     flipadst, w, h, shift1, shift2) \
-inv_txfm_fn(flipadst, adst,     w, h, shift1, shift2) \
-inv_txfm_fn(flipadst, flipadst, w, h, shift1, shift2) \
-inv_txfm_fn(identity, dct,      w, h, shift1, shift2) \
-inv_txfm_fn(dct,      identity, w, h, shift1, shift2) \
-
-#define inv_txfm_fn84(w, h, shift1, shift2) \
-inv_txfm_fn16(w, h, shift1, shift2) \
-inv_txfm_fn(identity, flipadst, w, h, shift1, shift2) \
-inv_txfm_fn(flipadst, identity, w, h, shift1, shift2) \
-inv_txfm_fn(identity, adst,     w, h, shift1, shift2) \
-inv_txfm_fn(adst,     identity, w, h, shift1, shift2) \
-
-inv_txfm_fn84( 4,  4, 0, 4)
-inv_txfm_fn84( 4,  8, 0, 4)
-inv_txfm_fn84( 4, 16, 1, 4)
-inv_txfm_fn84( 8,  4, 0, 4)
-inv_txfm_fn84( 8,  8, 1, 4)
-inv_txfm_fn84( 8, 16, 1, 4)
-inv_txfm_fn32( 8, 32, 2, 4)
-inv_txfm_fn84(16,  4, 1, 4)
-inv_txfm_fn84(16,  8, 1, 4)
-inv_txfm_fn16(16, 16, 2, 4)
-inv_txfm_fn32(16, 32, 1, 4)
-inv_txfm_fn64(16, 64, 2, 4)
-inv_txfm_fn32(32,  8, 2, 4)
-inv_txfm_fn32(32, 16, 1, 4)
-inv_txfm_fn32(32, 32, 2, 4)
-inv_txfm_fn64(32, 64, 1, 4)
-inv_txfm_fn64(64, 16, 2, 4)
-inv_txfm_fn64(64, 32, 1, 4)
-inv_txfm_fn64(64, 64, 2, 4)
-
-static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
-                                       coef *const coeff, const int eob)
-{
-    int i, j;
-    coef tmp[4 * 4], out[4];
-
-    for (i = 0; i < 4; i++)
-        inv_wht4_1d(&coeff[i], 4, &tmp[i * 4], 1, 0);
-
-    for (i = 0; i < 4; i++) {
-        inv_wht4_1d(&tmp[i], 4, out, 1, 1);
-        for (j = 0; j < 4; j++)
-            dst[i + j * PXSTRIDE(stride)] =
-                iclip_pixel(dst[i + j * PXSTRIDE(stride)] + out[j]);
-    }
-    memset(coeff, 0, sizeof(*coeff) * 4 * 4);
-}
-
-void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {
-#define assign_itx_all_fn64(w, h, pfx) \
-    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
-        inv_txfm_add_dct_dct_##w##x##h##_c
-
-#define assign_itx_all_fn32(w, h, pfx) \
-    assign_itx_all_fn64(w, h, pfx); \
-    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
-        inv_txfm_add_identity_identity_##w##x##h##_c
-
-#define assign_itx_all_fn16(w, h, pfx) \
-    assign_itx_all_fn32(w, h, pfx); \
-    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
-        inv_txfm_add_adst_dct_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
-        inv_txfm_add_dct_adst_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
-        inv_txfm_add_adst_adst_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
-        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
-        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
-        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
-        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
-        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
-        inv_txfm_add_dct_identity_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
-        inv_txfm_add_identity_dct_##w##x##h##_c
-
-#define assign_itx_all_fn84(w, h, pfx) \
-    assign_itx_all_fn16(w, h, pfx); \
-    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
-        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
-        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
-        inv_txfm_add_adst_identity_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
-        inv_txfm_add_identity_adst_##w##x##h##_c; \
-
-    memset(c, 0, sizeof(*c)); /* Zero unused function pointer elements. */
-
-    c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
-    assign_itx_all_fn84( 4,  4, );
-    assign_itx_all_fn84( 4,  8, R);
-    assign_itx_all_fn84( 4, 16, R);
-    assign_itx_all_fn84( 8,  4, R);
-    assign_itx_all_fn84( 8,  8, );
-    assign_itx_all_fn84( 8, 16, R);
-    assign_itx_all_fn32( 8, 32, R);
-    assign_itx_all_fn84(16,  4, R);
-    assign_itx_all_fn84(16,  8, R);
-    assign_itx_all_fn16(16, 16, );
-    assign_itx_all_fn32(16, 32, R);
-    assign_itx_all_fn64(16, 64, R);
-    assign_itx_all_fn32(32,  8, R);
-    assign_itx_all_fn32(32, 16, R);
-    assign_itx_all_fn32(32, 32, );
-    assign_itx_all_fn64(32, 64, R);
-    assign_itx_all_fn64(64, 16, R);
-    assign_itx_all_fn64(64, 32, R);
-    assign_itx_all_fn64(64, 64, );
-
-#if HAVE_ASM && ARCH_X86
-    bitfn(dav1d_itx_dsp_init_x86)(c);
-#endif
-}
--- /dev/null
+++ b/src/itx_tmpl.c
@@ -1,0 +1,233 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/itx.h"
+
+#include "src/itx_1d.c"
+
+typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
+                          coef *out, ptrdiff_t out_s);
+
+static void NOINLINE
+inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
+               coef *const coeff, const int eob,
+               const int w, const int h, const int shift1, const int shift2,
+               const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn)
+{
+    int i, j;
+    const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);
+    assert(h >= 4 && h <= 64 && w >= 4 && w <= 64);
+    // Maximum value for h and w is 64
+    coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
+    const int is_rect2 = w * 2 == h || h * 2 == w;
+
+    if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
+    const int rnd1 = (1 << shift1) >> 1;
+    for (i = 0; i < sh; i++) {
+        if (w != sw || is_rect2) {
+            for (j = 0; j < sw; j++) {
+                in_mem[j] = coeff[i + j * sh];
+                if (is_rect2)
+                    in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
+            }
+            first_1d_fn(in_mem, 1, &tmp[i * w], 1);
+        } else {
+            first_1d_fn(&coeff[i], sh, &tmp[i * w], 1);
+        }
+        for (j = 0; j < w; j++)
+            tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
+    }
+
+    if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
+    const int rnd2 = (1 << shift2) >> 1;
+    for (i = 0; i < w; i++) {
+        second_1d_fn(&tmp[i], w, out, 1);
+        for (j = 0; j < h; j++)
+            dst[i + j * PXSTRIDE(stride)] =
+                iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
+                            ((out[j] + (rnd2)) >> shift2));
+    }
+    memset(coeff, 0, sizeof(*coeff) * sh * sw);
+}
+
+#define inv_txfm_fn(type1, type2, w, h, shift1, shift2) \
+static void \
+inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
+                                               const ptrdiff_t stride, \
+                                               coef *const coeff, \
+                                               const int eob) \
+{ \
+    inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift1, shift2, \
+                   inv_##type1##w##_1d, inv_##type2##h##_1d); \
+}
+
+#define inv_txfm_fn64(w, h, shift1, shift2) \
+inv_txfm_fn(dct, dct, w, h, shift1, shift2)
+
+#define inv_txfm_fn32(w, h, shift1, shift2) \
+inv_txfm_fn64(w, h, shift1, shift2) \
+inv_txfm_fn(identity, identity, w, h, shift1, shift2)
+
+#define inv_txfm_fn16(w, h, shift1, shift2) \
+inv_txfm_fn32(w, h, shift1, shift2) \
+inv_txfm_fn(adst,     dct,      w, h, shift1, shift2) \
+inv_txfm_fn(dct,      adst,     w, h, shift1, shift2) \
+inv_txfm_fn(adst,     adst,     w, h, shift1, shift2) \
+inv_txfm_fn(dct,      flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, dct,      w, h, shift1, shift2) \
+inv_txfm_fn(adst,     flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, adst,     w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(identity, dct,      w, h, shift1, shift2) \
+inv_txfm_fn(dct,      identity, w, h, shift1, shift2) \
+
+#define inv_txfm_fn84(w, h, shift1, shift2) \
+inv_txfm_fn16(w, h, shift1, shift2) \
+inv_txfm_fn(identity, flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, identity, w, h, shift1, shift2) \
+inv_txfm_fn(identity, adst,     w, h, shift1, shift2) \
+inv_txfm_fn(adst,     identity, w, h, shift1, shift2) \
+
+inv_txfm_fn84( 4,  4, 0, 4)
+inv_txfm_fn84( 4,  8, 0, 4)
+inv_txfm_fn84( 4, 16, 1, 4)
+inv_txfm_fn84( 8,  4, 0, 4)
+inv_txfm_fn84( 8,  8, 1, 4)
+inv_txfm_fn84( 8, 16, 1, 4)
+inv_txfm_fn32( 8, 32, 2, 4)
+inv_txfm_fn84(16,  4, 1, 4)
+inv_txfm_fn84(16,  8, 1, 4)
+inv_txfm_fn16(16, 16, 2, 4)
+inv_txfm_fn32(16, 32, 1, 4)
+inv_txfm_fn64(16, 64, 2, 4)
+inv_txfm_fn32(32,  8, 2, 4)
+inv_txfm_fn32(32, 16, 1, 4)
+inv_txfm_fn32(32, 32, 2, 4)
+inv_txfm_fn64(32, 64, 1, 4)
+inv_txfm_fn64(64, 16, 2, 4)
+inv_txfm_fn64(64, 32, 1, 4)
+inv_txfm_fn64(64, 64, 2, 4)
+
+static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
+                                       coef *const coeff, const int eob)
+{
+    int i, j;
+    coef tmp[4 * 4], out[4];
+
+    for (i = 0; i < 4; i++)
+        inv_wht4_1d(&coeff[i], 4, &tmp[i * 4], 1, 0);
+
+    for (i = 0; i < 4; i++) {
+        inv_wht4_1d(&tmp[i], 4, out, 1, 1);
+        for (j = 0; j < 4; j++)
+            dst[i + j * PXSTRIDE(stride)] =
+                iclip_pixel(dst[i + j * PXSTRIDE(stride)] + out[j]);
+    }
+    memset(coeff, 0, sizeof(*coeff) * 4 * 4);
+}
+
+void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {
+#define assign_itx_all_fn64(w, h, pfx) \
+    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
+        inv_txfm_add_dct_dct_##w##x##h##_c
+
+#define assign_itx_all_fn32(w, h, pfx) \
+    assign_itx_all_fn64(w, h, pfx); \
+    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
+        inv_txfm_add_identity_identity_##w##x##h##_c
+
+#define assign_itx_all_fn16(w, h, pfx) \
+    assign_itx_all_fn32(w, h, pfx); \
+    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
+        inv_txfm_add_adst_dct_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
+        inv_txfm_add_dct_adst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
+        inv_txfm_add_adst_adst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
+        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
+        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
+        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
+        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
+        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
+        inv_txfm_add_dct_identity_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
+        inv_txfm_add_identity_dct_##w##x##h##_c
+
+#define assign_itx_all_fn84(w, h, pfx) \
+    assign_itx_all_fn16(w, h, pfx); \
+    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
+        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
+        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
+        inv_txfm_add_adst_identity_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
+        inv_txfm_add_identity_adst_##w##x##h##_c; \
+
+    memset(c, 0, sizeof(*c)); /* Zero unused function pointer elements. */
+
+    c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
+    assign_itx_all_fn84( 4,  4, );
+    assign_itx_all_fn84( 4,  8, R);
+    assign_itx_all_fn84( 4, 16, R);
+    assign_itx_all_fn84( 8,  4, R);
+    assign_itx_all_fn84( 8,  8, );
+    assign_itx_all_fn84( 8, 16, R);
+    assign_itx_all_fn32( 8, 32, R);
+    assign_itx_all_fn84(16,  4, R);
+    assign_itx_all_fn84(16,  8, R);
+    assign_itx_all_fn16(16, 16, );
+    assign_itx_all_fn32(16, 32, R);
+    assign_itx_all_fn64(16, 64, R);
+    assign_itx_all_fn32(32,  8, R);
+    assign_itx_all_fn32(32, 16, R);
+    assign_itx_all_fn32(32, 32, );
+    assign_itx_all_fn64(32, 64, R);
+    assign_itx_all_fn64(64, 16, R);
+    assign_itx_all_fn64(64, 32, R);
+    assign_itx_all_fn64(64, 64, );
+
+#if HAVE_ASM && ARCH_X86
+    bitfn(dav1d_itx_dsp_init_x86)(c);
+#endif
+}
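
Note on inv_txfm_add_c() above: the 2D inverse transform is separable — a first 1D pass over the (at most 32) coefficient rows, with the extra 1/sqrt(2) scaling (* 2896 + 2048 >> 12) applied to 1:2 rectangular blocks, followed by a rounding shift, then a second 1D pass down each column whose output is rounded again, added to the destination and clipped to the pixel range. An illustrative standalone sketch of that two-pass structure for a hypothetical 8-bit 4x4 block, using an identity kernel in place of the real DCT/ADST 1D transforms (not part of the patch):

#include <stdint.h>
#include <stddef.h>

static void id4_1d(const int32_t *in, const ptrdiff_t in_s,
                   int32_t *out, const ptrdiff_t out_s)
{
    for (int i = 0; i < 4; i++) out[i * out_s] = in[i * in_s];
}

static void itx_add_4x4_sketch(uint8_t *dst, const ptrdiff_t stride,
                               int32_t coeff[16], const int shift1, const int shift2)
{
    int32_t tmp[16], col[4];
    for (int i = 0; i < 4; i++) {            /* first pass: one 1D transform per row */
        id4_1d(&coeff[i], 4, &tmp[i * 4], 1);
        for (int j = 0; j < 4; j++)
            tmp[i * 4 + j] = (tmp[i * 4 + j] + ((1 << shift1) >> 1)) >> shift1;
    }
    for (int i = 0; i < 4; i++) {            /* second pass: one 1D transform per column */
        id4_1d(&tmp[i], 4, col, 1);
        for (int j = 0; j < 4; j++) {
            const int px = dst[i + j * stride] +
                           ((col[j] + ((1 << shift2) >> 1)) >> shift2);
            dst[i + j * stride] = px < 0 ? 0 : px > 255 ? 255 : px;  /* add + clip */
        }
    }
}
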
--- a/src/lf_apply.c
+++ /dev/null
@@ -1,306 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <string.h>
-
-#include "common/intops.h"
-
-#include "src/lf_apply.h"
-
-static inline void filter_plane_cols_y(const Dav1dFrameContext *const f,
-                                       const int have_left,
-                                       const uint8_t (*lvl)[4],
-                                       const ptrdiff_t b4_stride,
-                                       const uint16_t (*const mask)[3][2],
-                                       pixel *dst, const ptrdiff_t ls,
-                                       const int w,
-                                       const int starty4, const int endy4)
-{
-    const Dav1dDSPContext *const dsp = f->dsp;
-
-    // filter edges between columns (e.g. block1 | block2)
-    for (int x = 0; x < w; x++) {
-        if (!have_left && !x) continue;
-        uint32_t hmask[4];
-        if (!starty4) {
-            hmask[0] = mask[x][0][0];
-            hmask[1] = mask[x][1][0];
-            hmask[2] = mask[x][2][0];
-            if (endy4 > 16) {
-                hmask[0] |= mask[x][0][1] << 16;
-                hmask[1] |= mask[x][1][1] << 16;
-                hmask[2] |= mask[x][2][1] << 16;
-            }
-        } else {
-            hmask[0] = mask[x][0][1];
-            hmask[1] = mask[x][1][1];
-            hmask[2] = mask[x][2][1];
-        }
-        hmask[3] = 0;
-        dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls, hmask,
-                                     (const uint8_t(*)[4]) &lvl[x][0], b4_stride,
-                                     &f->lf.lim_lut, endy4 - starty4);
-    }
-}
-
-static inline void filter_plane_rows_y(const Dav1dFrameContext *const f,
-                                       const int have_top,
-                                       const uint8_t (*lvl)[4],
-                                       const ptrdiff_t b4_stride,
-                                       const uint16_t (*const mask)[3][2],
-                                       pixel *dst, const ptrdiff_t ls,
-                                       const int w,
-                                       const int starty4, const int endy4)
-{
-    const Dav1dDSPContext *const dsp = f->dsp;
-
-    //                                 block1
-    // filter edges between rows (e.g. ------)
-    //                                 block2
-    for (int y = starty4; y < endy4;
-         y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
-    {
-        if (!have_top && !y) continue;
-        const uint32_t vmask[4] = {
-            mask[y][0][0] | (mask[y][0][1] << 16),
-            mask[y][1][0] | (mask[y][1][1] << 16),
-            mask[y][2][0] | (mask[y][2][1] << 16),
-            0,
-        };
-        dsp->lf.loop_filter_sb[0][1](dst, ls, vmask,
-                                     (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
-                                     &f->lf.lim_lut, w);
-    }
-}
-
-static inline void filter_plane_cols_uv(const Dav1dFrameContext *const f,
-                                        const int have_left,
-                                        const uint8_t (*lvl)[4],
-                                        const ptrdiff_t b4_stride,
-                                        const uint16_t (*const mask)[2][2],
-                                        pixel *const u, pixel *const v,
-                                        const ptrdiff_t ls, const int w,
-                                        const int starty4, const int endy4,
-                                        const int ss_ver)
-{
-    const Dav1dDSPContext *const dsp = f->dsp;
-
-    // filter edges between columns (e.g. block1 | block2)
-    for (int x = 0; x < w; x++) {
-        if (!have_left && !x) continue;
-        uint32_t hmask[3];
-        if (!starty4) {
-            hmask[0] = mask[x][0][0];
-            hmask[1] = mask[x][1][0];
-            if (endy4 > (16 >> ss_ver)) {
-                hmask[0] |= mask[x][0][1] << (16 >> ss_ver);
-                hmask[1] |= mask[x][1][1] << (16 >> ss_ver);
-            }
-        } else {
-            hmask[0] = mask[x][0][1];
-            hmask[1] = mask[x][1][1];
-        }
-        hmask[2] = 0;
-        dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls, hmask,
-                                     (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
-                                     &f->lf.lim_lut, endy4 - starty4);
-        dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls, hmask,
-                                     (const uint8_t(*)[4]) &lvl[x][3], b4_stride,
-                                     &f->lf.lim_lut, endy4 - starty4);
-    }
-}
-
-static inline void filter_plane_rows_uv(const Dav1dFrameContext *const f,
-                                        const int have_top,
-                                        const uint8_t (*lvl)[4],
-                                        const ptrdiff_t b4_stride,
-                                        const uint16_t (*const mask)[2][2],
-                                        pixel *const u, pixel *const v,
-                                        const ptrdiff_t ls, const int w,
-                                        const int starty4, const int endy4,
-                                        const int ss_hor)
-{
-    const Dav1dDSPContext *const dsp = f->dsp;
-    ptrdiff_t off_l = 0;
-
-    //                                 block1
-    // filter edges between rows (e.g. ------)
-    //                                 block2
-    for (int y = starty4; y < endy4;
-         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
-    {
-        if (!have_top && !y) continue;
-        const uint32_t vmask[3] = {
-            mask[y][0][0] | (mask[y][0][1] << (16 >> ss_hor)),
-            mask[y][1][0] | (mask[y][1][1] << (16 >> ss_hor)),
-            0,
-        };
-        dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, vmask,
-                                     (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
-                                     &f->lf.lim_lut, w);
-        dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, vmask,
-                                     (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
-                                     &f->lf.lim_lut, w);
-    }
-}
-
-void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
-                                    pixel *const p[3], Av1Filter *const lflvl,
-                                    int sby, const int start_of_tile_row)
-{
-    int x, have_left;
-    // Don't filter outside the frame
-    const int hy4 = (f->cur.p.p.h + 3) >> 2;
-    const int have_top = sby > 0;
-    const int is_sb64 = !f->seq_hdr.sb128;
-    const int starty4 = (sby & is_sb64) << 4;
-    const int sbsz = 32 >> is_sb64;
-    const int sbl2 = 5 - is_sb64;
-    const int halign = (f->bh + 31) & ~31;
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
-    const unsigned vmax = 1 << vmask, hmax = 1 << hmask;
-    const unsigned endy4 = starty4 + imin(hy4 - sby * sbsz, sbsz);
-    const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
-
-    // fix lpf strength at tile col boundaries
-    const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2];
-    const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)];
-    for (int tile_col = 1;; tile_col++) {
-        x = f->frame_hdr.tiling.col_start_sb[tile_col];
-        if ((x << sbl2) >= f->bw) break;
-        const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
-        x >>= is_sb64;
-
-        uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4];
-        for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
-            const int sidx = mask >= 0x10000;
-            const unsigned smask = mask >> (sidx << 4);
-            const int idx = 2 * !!(y_hmask[2][sidx] & smask) +
-                                !!(y_hmask[1][sidx] & smask);
-            y_hmask[2][sidx] &= ~smask;
-            y_hmask[1][sidx] &= ~smask;
-            y_hmask[0][sidx] &= ~smask;
-            y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask;
-        }
-
-        if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
-            uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4];
-            for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
-                 y++, uv_mask <<= 1)
-            {
-                const int sidx = uv_mask >= vmax;
-                const unsigned smask = uv_mask >> (sidx << (4 - ss_ver));
-                const int idx = !!(uv_hmask[1][sidx] & smask);
-                uv_hmask[1][sidx] &= ~smask;
-                uv_hmask[0][sidx] &= ~smask;
-                uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask;
-            }
-        }
-        lpf_y  += halign;
-        lpf_uv += halign >> ss_ver;
-    }
-
-    // fix lpf strength at tile row boundaries
-    if (start_of_tile_row) {
-        const BlockContext *a;
-        for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)];
-             x < f->sb128w; x++, a++)
-        {
-            uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4];
-            for (unsigned mask = 1, i = 0; i < 32; mask <<= 1, i++) {
-                const int sidx = mask >= 0x10000;
-                const unsigned smask = mask >> (sidx << 4);
-                const int idx = 2 * !!(y_vmask[2][sidx] & smask) +
-                                    !!(y_vmask[1][sidx] & smask);
-                y_vmask[2][sidx] &= ~smask;
-                y_vmask[1][sidx] &= ~smask;
-                y_vmask[0][sidx] &= ~smask;
-                y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask;
-            }
-
-            if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
-                uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
-                for (unsigned uv_mask = 1, i = 0; i < (32U >> ss_hor); uv_mask <<= 1, i++) {
-                    const int sidx = uv_mask >= hmax;
-                    const unsigned smask = uv_mask >> (sidx << (4 - ss_hor));
-                    const int idx = !!(uv_vmask[1][sidx] & smask);
-                    uv_vmask[1][sidx] &= ~smask;
-                    uv_vmask[0][sidx] &= ~smask;
-                    uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask;
-                }
-            }
-        }
-    }
-
-    pixel *ptr;
-    uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
-    for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w;
-         x++, have_left = 1, ptr += 128, level_ptr += 32)
-    {
-        filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
-                            lflvl[x].filter_y[0], ptr, f->cur.p.stride[0],
-                            imin(32, f->bw - x * 32), starty4, endy4);
-    }
-
-    level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
-    for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
-        filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
-                            lflvl[x].filter_y[1], ptr, f->cur.p.stride[0],
-                            imin(32, f->bw - x * 32), starty4, endy4);
-    }
-
-    if (!f->frame_hdr.loopfilter.level_u && !f->frame_hdr.loopfilter.level_v)
-        return;
-
-    ptrdiff_t uv_off;
-    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
-    for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w;
-         x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
-    {
-        filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
-                             lflvl[x].filter_uv[0],
-                             &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
-                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
-                             starty4 >> ss_ver, uv_endy4, ss_ver);
-    }
-
-    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
-    for (uv_off = 0, x = 0; x < f->sb128w;
-         x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
-    {
-        filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
-                             lflvl[x].filter_uv[1],
-                             &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
-                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
-                             starty4 >> ss_ver, uv_endy4, ss_hor);
-    }
-}
--- /dev/null
+++ b/src/lf_apply_tmpl.c
@@ -1,0 +1,306 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/lf_apply.h"
+
+static inline void filter_plane_cols_y(const Dav1dFrameContext *const f,
+                                       const int have_left,
+                                       const uint8_t (*lvl)[4],
+                                       const ptrdiff_t b4_stride,
+                                       const uint16_t (*const mask)[3][2],
+                                       pixel *dst, const ptrdiff_t ls,
+                                       const int w,
+                                       const int starty4, const int endy4)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+
+    // filter edges between columns (e.g. block1 | block2)
+    for (int x = 0; x < w; x++) {
+        if (!have_left && !x) continue;
+        uint32_t hmask[4];
+        if (!starty4) {
+            hmask[0] = mask[x][0][0];
+            hmask[1] = mask[x][1][0];
+            hmask[2] = mask[x][2][0];
+            if (endy4 > 16) {
+                hmask[0] |= mask[x][0][1] << 16;
+                hmask[1] |= mask[x][1][1] << 16;
+                hmask[2] |= mask[x][2][1] << 16;
+            }
+        } else {
+            hmask[0] = mask[x][0][1];
+            hmask[1] = mask[x][1][1];
+            hmask[2] = mask[x][2][1];
+        }
+        hmask[3] = 0;
+        dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][0], b4_stride,
+                                     &f->lf.lim_lut, endy4 - starty4);
+    }
+}
+
+static inline void filter_plane_rows_y(const Dav1dFrameContext *const f,
+                                       const int have_top,
+                                       const uint8_t (*lvl)[4],
+                                       const ptrdiff_t b4_stride,
+                                       const uint16_t (*const mask)[3][2],
+                                       pixel *dst, const ptrdiff_t ls,
+                                       const int w,
+                                       const int starty4, const int endy4)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+
+    //                                 block1
+    // filter edges between rows (e.g. ------)
+    //                                 block2
+    for (int y = starty4; y < endy4;
+         y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
+    {
+        if (!have_top && !y) continue;
+        const uint32_t vmask[4] = {
+            mask[y][0][0] | (mask[y][0][1] << 16),
+            mask[y][1][0] | (mask[y][1][1] << 16),
+            mask[y][2][0] | (mask[y][2][1] << 16),
+            0,
+        };
+        dsp->lf.loop_filter_sb[0][1](dst, ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
+                                     &f->lf.lim_lut, w);
+    }
+}
+
+static inline void filter_plane_cols_uv(const Dav1dFrameContext *const f,
+                                        const int have_left,
+                                        const uint8_t (*lvl)[4],
+                                        const ptrdiff_t b4_stride,
+                                        const uint16_t (*const mask)[2][2],
+                                        pixel *const u, pixel *const v,
+                                        const ptrdiff_t ls, const int w,
+                                        const int starty4, const int endy4,
+                                        const int ss_ver)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+
+    // filter edges between columns (e.g. block1 | block2)
+    for (int x = 0; x < w; x++) {
+        if (!have_left && !x) continue;
+        uint32_t hmask[3];
+        if (!starty4) {
+            hmask[0] = mask[x][0][0];
+            hmask[1] = mask[x][1][0];
+            if (endy4 > (16 >> ss_ver)) {
+                hmask[0] |= mask[x][0][1] << (16 >> ss_ver);
+                hmask[1] |= mask[x][1][1] << (16 >> ss_ver);
+            }
+        } else {
+            hmask[0] = mask[x][0][1];
+            hmask[1] = mask[x][1][1];
+        }
+        hmask[2] = 0;
+        dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
+                                     &f->lf.lim_lut, endy4 - starty4);
+        dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][3], b4_stride,
+                                     &f->lf.lim_lut, endy4 - starty4);
+    }
+}
+
+static inline void filter_plane_rows_uv(const Dav1dFrameContext *const f,
+                                        const int have_top,
+                                        const uint8_t (*lvl)[4],
+                                        const ptrdiff_t b4_stride,
+                                        const uint16_t (*const mask)[2][2],
+                                        pixel *const u, pixel *const v,
+                                        const ptrdiff_t ls, const int w,
+                                        const int starty4, const int endy4,
+                                        const int ss_hor)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    ptrdiff_t off_l = 0;
+
+    //                                 block1
+    // filter edges between rows (e.g. ------)
+    //                                 block2
+    for (int y = starty4; y < endy4;
+         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
+    {
+        if (!have_top && !y) continue;
+        const uint32_t vmask[3] = {
+            mask[y][0][0] | (mask[y][0][1] << (16 >> ss_hor)),
+            mask[y][1][0] | (mask[y][1][1] << (16 >> ss_hor)),
+            0,
+        };
+        dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
+                                     &f->lf.lim_lut, w);
+        dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
+                                     &f->lf.lim_lut, w);
+    }
+}
+
+void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
+                                    pixel *const p[3], Av1Filter *const lflvl,
+                                    int sby, const int start_of_tile_row)
+{
+    int x, have_left;
+    // Don't filter outside the frame
+    const int hy4 = (f->cur.p.p.h + 3) >> 2;
+    const int have_top = sby > 0;
+    const int is_sb64 = !f->seq_hdr.sb128;
+    const int starty4 = (sby & is_sb64) << 4;
+    const int sbsz = 32 >> is_sb64;
+    const int sbl2 = 5 - is_sb64;
+    const int halign = (f->bh + 31) & ~31;
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
+    const unsigned vmax = 1 << vmask, hmax = 1 << hmask;
+    const unsigned endy4 = starty4 + imin(hy4 - sby * sbsz, sbsz);
+    const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
+
+    // fix lpf strength at tile col boundaries
+    const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2];
+    const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)];
+    for (int tile_col = 1;; tile_col++) {
+        x = f->frame_hdr.tiling.col_start_sb[tile_col];
+        if ((x << sbl2) >= f->bw) break;
+        const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
+        x >>= is_sb64;
+
+        uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4];
+        for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
+            const int sidx = mask >= 0x10000;
+            const unsigned smask = mask >> (sidx << 4);
+            const int idx = 2 * !!(y_hmask[2][sidx] & smask) +
+                                !!(y_hmask[1][sidx] & smask);
+            y_hmask[2][sidx] &= ~smask;
+            y_hmask[1][sidx] &= ~smask;
+            y_hmask[0][sidx] &= ~smask;
+            y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask;
+        }
+
+        if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+            uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4];
+            for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
+                 y++, uv_mask <<= 1)
+            {
+                const int sidx = uv_mask >= vmax;
+                const unsigned smask = uv_mask >> (sidx << (4 - ss_ver));
+                const int idx = !!(uv_hmask[1][sidx] & smask);
+                uv_hmask[1][sidx] &= ~smask;
+                uv_hmask[0][sidx] &= ~smask;
+                uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask;
+            }
+        }
+        lpf_y  += halign;
+        lpf_uv += halign >> ss_ver;
+    }
+
+    // fix lpf strength at tile row boundaries
+    if (start_of_tile_row) {
+        const BlockContext *a;
+        for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)];
+             x < f->sb128w; x++, a++)
+        {
+            uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4];
+            for (unsigned mask = 1, i = 0; i < 32; mask <<= 1, i++) {
+                const int sidx = mask >= 0x10000;
+                const unsigned smask = mask >> (sidx << 4);
+                const int idx = 2 * !!(y_vmask[2][sidx] & smask) +
+                                    !!(y_vmask[1][sidx] & smask);
+                y_vmask[2][sidx] &= ~smask;
+                y_vmask[1][sidx] &= ~smask;
+                y_vmask[0][sidx] &= ~smask;
+                y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask;
+            }
+
+            if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+                uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
+                for (unsigned uv_mask = 1, i = 0; i < (32U >> ss_hor); uv_mask <<= 1, i++) {
+                    const int sidx = uv_mask >= hmax;
+                    const unsigned smask = uv_mask >> (sidx << (4 - ss_hor));
+                    const int idx = !!(uv_vmask[1][sidx] & smask);
+                    uv_vmask[1][sidx] &= ~smask;
+                    uv_vmask[0][sidx] &= ~smask;
+                    uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask;
+                }
+            }
+        }
+    }
+
+    pixel *ptr;
+    uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
+    for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w;
+         x++, have_left = 1, ptr += 128, level_ptr += 32)
+    {
+        filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
+                            lflvl[x].filter_y[0], ptr, f->cur.p.stride[0],
+                            imin(32, f->bw - x * 32), starty4, endy4);
+    }
+
+    level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
+    for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
+        filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
+                            lflvl[x].filter_y[1], ptr, f->cur.p.stride[0],
+                            imin(32, f->bw - x * 32), starty4, endy4);
+    }
+
+    if (!f->frame_hdr.loopfilter.level_u && !f->frame_hdr.loopfilter.level_v)
+        return;
+
+    ptrdiff_t uv_off;
+    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
+    for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w;
+         x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
+    {
+        filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
+                             lflvl[x].filter_uv[0],
+                             &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
+                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
+                             starty4 >> ss_ver, uv_endy4, ss_ver);
+    }
+
+    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
+    for (uv_off = 0, x = 0; x < f->sb128w;
+         x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
+    {
+        filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
+                             lflvl[x].filter_uv[1],
+                             &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
+                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
+                             starty4 >> ss_ver, uv_endy4, ss_hor);
+    }
+}
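
Note on the mask-driven filtering above: dav1d_loopfilter_sbrow() first patches the per-edge filter strength at tile column/row boundaries, then walks each 128-pixel superblock column and hands the per-plane bitmasks to dsp->lf.loop_filter_sb[plane][dir]. In those masks, bit n of word k flags the n-th 4-pixel edge as needing a filter of width 4 << k, with the widest applicable class taking precedence. A small sketch of that convention (not part of the patch, helper name is illustrative):

#include <stdint.h>

static int edge_filter_width(const uint32_t vmask[3], const int n)
{
    const uint32_t bit = 1u << n;
    if (vmask[2] & bit) return 16;   /* wide filter   */
    if (vmask[1] & bit) return 8;    /* medium filter */
    if (vmask[0] & bit) return 4;    /* narrow filter */
    return 0;                        /* edge not filtered */
}
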
--- a/src/loopfilter.c
+++ /dev/null
@@ -1,246 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <stdlib.h>
-
-#include "common/attributes.h"
-#include "common/intops.h"
-
-#include "src/loopfilter.h"
-
-static NOINLINE void
-loop_filter(pixel *dst, int E, int I, int H,
-            const ptrdiff_t stridea, const ptrdiff_t strideb, const int wd)
-{
-    const int F = 1 << (BITDEPTH - 8);
-    E <<= BITDEPTH - 8;
-    I <<= BITDEPTH - 8;
-    H <<= BITDEPTH - 8;
-
-    for (int i = 0; i < 4; i++, dst += stridea) {
-        int p6, p5, p4, p3, p2;
-        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
-        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
-        int q2, q3, q4, q5, q6;
-        int fm, flat8out, flat8in;
-
-        fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
-             abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
-
-        if (wd > 4) {
-            p2 = dst[strideb * -3];
-            q2 = dst[strideb * +2];
-
-            fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I;
-
-            if (wd > 6) {
-                p3 = dst[strideb * -4];
-                q3 = dst[strideb * +3];
-
-                fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;
-            }
-        }
-        if (!fm) continue;
-
-        if (wd >= 16) {
-            p6 = dst[strideb * -7];
-            p5 = dst[strideb * -6];
-            p4 = dst[strideb * -5];
-            q4 = dst[strideb * +4];
-            q5 = dst[strideb * +5];
-            q6 = dst[strideb * +6];
-
-            flat8out = abs(p6 - p0) <= F && abs(p5 - p0) <= F &&
-                       abs(p4 - p0) <= F && abs(q4 - q0) <= F &&
-                       abs(q5 - q0) <= F && abs(q6 - q0) <= F;
-        }
-
-        if (wd >= 6)
-            flat8in = abs(p2 - p0) <= F && abs(p1 - p0) <= F &&
-                      abs(q1 - q0) <= F && abs(q2 - q0) <= F;
-
-        if (wd >= 8)
-            flat8in &= abs(p3 - p0) <= F && abs(q3 - q0) <= F;
-
-        if (wd >= 16 && (flat8out & flat8in)) {
-            dst[strideb * -6] = (p6 + p6 + p6 + p6 + p6 + p6 * 2 + p5 * 2 +
-                                 p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
-            dst[strideb * -5] = (p6 + p6 + p6 + p6 + p6 + p5 * 2 + p4 * 2 +
-                                 p3 * 2 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
-            dst[strideb * -4] = (p6 + p6 + p6 + p6 + p5 + p4 * 2 + p3 * 2 +
-                                 p2 * 2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
-            dst[strideb * -3] = (p6 + p6 + p6 + p5 + p4 + p3 * 2 + p2 * 2 +
-                                 p1 * 2 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
-            dst[strideb * -2] = (p6 + p6 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
-                                 p0 * 2 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
-            dst[strideb * -1] = (p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
-                                 q0 * 2 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
-            dst[strideb * +0] = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
-                                 q1 * 2 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
-            dst[strideb * +1] = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
-                                 q2 * 2 + q3 + q4 + q5 + q6 + q6 + 8) >> 4;
-            dst[strideb * +2] = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
-                                 q3 * 2 + q4 + q5 + q6 + q6 + q6 + 8) >> 4;
-            dst[strideb * +3] = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
-                                 q4 * 2 + q5 + q6 + q6 + q6 + q6 + 8) >> 4;
-            dst[strideb * +4] = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
-                                 q5 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
-            dst[strideb * +5] = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 +
-                                 q6 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
-        } else if (wd >= 8 && flat8in) {
-            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
-            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
-            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
-            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
-            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
-            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
-        } else if (wd == 6 && flat8in) {
-            dst[strideb * -2] = (p2 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3;
-            dst[strideb * -1] = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
-            dst[strideb * +0] = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
-            dst[strideb * +1] = (p0 + 2 * q0 + 2 * q1 + 2 * q2 + q2 + 4) >> 3;
-        } else {
-            const int hev = abs(p1 - p0) > H || abs(q1 - q0) > H;
-
-#define iclip_diff(v) iclip(v, -128 * (1 << (BITDEPTH - 8)), \
-                                128 * (1 << (BITDEPTH - 8)) - 1)
-
-            if (hev) {
-                int f = iclip_diff(p1 - q1), f1, f2;
-                f = iclip_diff(3 * (q0 - p0) + f);
-
-                f1 = imin(f + 4, (128 << (BITDEPTH - 8)) - 1) >> 3;
-                f2 = imin(f + 3, (128 << (BITDEPTH - 8)) - 1) >> 3;
-
-                dst[strideb * -1] = iclip_pixel(p0 + f2);
-                dst[strideb * +0] = iclip_pixel(q0 - f1);
-            } else {
-                int f = iclip_diff(3 * (q0 - p0)), f1, f2;
-
-                f1 = imin(f + 4, (128 << (BITDEPTH - 8)) - 1) >> 3;
-                f2 = imin(f + 3, (128 << (BITDEPTH - 8)) - 1) >> 3;
-
-                dst[strideb * -1] = iclip_pixel(p0 + f2);
-                dst[strideb * +0] = iclip_pixel(q0 - f1);
-
-                f = (f1 + 1) >> 1;
-                dst[strideb * -2] = iclip_pixel(p1 + f);
-                dst[strideb * +1] = iclip_pixel(q1 - f);
-            }
-#undef iclip_diff
-        }
-    }
-}
-
-static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,
-                                   const uint32_t *const vmask,
-                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
-                                   const Av1FilterLUT *lut, const int h)
-{
-    const unsigned vm = (vmask[0] | vmask[1] | vmask[2]) & ((1ULL << h) - 1);
-    for (unsigned y = 1; vm & ~(y - 1);
-         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
-    {
-        if (vm & y) {
-            const int L = l[0][0] ? l[0][0] : l[-1][0];
-            if (!L) continue;
-            const int H = L >> 4;
-            const int E = lut->e[L], I = lut->i[L];
-            const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y);
-            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx);
-        }
-    }
-}
-
-static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
-                                   const uint32_t *const vmask,
-                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
-                                   const Av1FilterLUT *lut, const int w)
-{
-    const unsigned vm = vmask[0] | vmask[1] | vmask[2];
-    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
-        if (vm & x) {
-            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
-            if (!L) continue;
-            const int H = L >> 4;
-            const int E = lut->e[L], I = lut->i[L];
-            const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x);
-            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx);
-        }
-    }
-}
-
-static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,
-                                    const uint32_t *const vmask,
-                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
-                                    const Av1FilterLUT *lut, const int h)
-{
-    const unsigned vm = (vmask[0] | vmask[1]) & ((1ULL << h) - 1);
-    for (unsigned y = 1; vm & ~(y - 1);
-         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
-    {
-        if (vm & y) {
-            const int L = l[0][0] ? l[0][0] : l[-1][0];
-            if (!L) continue;
-            const int H = L >> 4;
-            const int E = lut->e[L], I = lut->i[L];
-            const int idx = !!(vmask[1] & y);
-            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx);
-        }
-    }
-}
-
-static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
-                                    const uint32_t *const vmask,
-                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
-                                    const Av1FilterLUT *lut, const int w)
-{
-    const unsigned vm = vmask[0] | vmask[1];
-    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
-        if (vm & x) {
-            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
-            if (!L) continue;
-            const int H = L >> 4;
-            const int E = lut->e[L], I = lut->i[L];
-            const int idx = !!(vmask[1] & x);
-            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx);
-        }
-    }
-}
-
-void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
-    c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
-    c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
-    c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
-    c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
-
-#if HAVE_ASM && ARCH_X86
-    bitfn(dav1d_loop_filter_dsp_init_x86)(c);
-#endif
-}
--- /dev/null
+++ b/src/loopfilter_tmpl.c
@@ -1,0 +1,246 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/loopfilter.h"
+
+static NOINLINE void
+loop_filter(pixel *dst, int E, int I, int H,
+            const ptrdiff_t stridea, const ptrdiff_t strideb, const int wd)
+{
+    const int F = 1 << (BITDEPTH - 8);
+    E <<= BITDEPTH - 8;
+    I <<= BITDEPTH - 8;
+    H <<= BITDEPTH - 8;
+
+    for (int i = 0; i < 4; i++, dst += stridea) {
+        int p6, p5, p4, p3, p2;
+        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
+        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
+        int q2, q3, q4, q5, q6;
+        int fm, flat8out, flat8in;
+
+        fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
+             abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
+
+        if (wd > 4) {
+            p2 = dst[strideb * -3];
+            q2 = dst[strideb * +2];
+
+            fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I;
+
+            if (wd > 6) {
+                p3 = dst[strideb * -4];
+                q3 = dst[strideb * +3];
+
+                fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;
+            }
+        }
+        if (!fm) continue;
+
+        if (wd >= 16) {
+            p6 = dst[strideb * -7];
+            p5 = dst[strideb * -6];
+            p4 = dst[strideb * -5];
+            q4 = dst[strideb * +4];
+            q5 = dst[strideb * +5];
+            q6 = dst[strideb * +6];
+
+            flat8out = abs(p6 - p0) <= F && abs(p5 - p0) <= F &&
+                       abs(p4 - p0) <= F && abs(q4 - q0) <= F &&
+                       abs(q5 - q0) <= F && abs(q6 - q0) <= F;
+        }
+
+        if (wd >= 6)
+            flat8in = abs(p2 - p0) <= F && abs(p1 - p0) <= F &&
+                      abs(q1 - q0) <= F && abs(q2 - q0) <= F;
+
+        if (wd >= 8)
+            flat8in &= abs(p3 - p0) <= F && abs(q3 - q0) <= F;
+
+        if (wd >= 16 && (flat8out & flat8in)) {
+            dst[strideb * -6] = (p6 + p6 + p6 + p6 + p6 + p6 * 2 + p5 * 2 +
+                                 p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
+            dst[strideb * -5] = (p6 + p6 + p6 + p6 + p6 + p5 * 2 + p4 * 2 +
+                                 p3 * 2 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
+            dst[strideb * -4] = (p6 + p6 + p6 + p6 + p5 + p4 * 2 + p3 * 2 +
+                                 p2 * 2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
+            dst[strideb * -3] = (p6 + p6 + p6 + p5 + p4 + p3 * 2 + p2 * 2 +
+                                 p1 * 2 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
+            dst[strideb * -2] = (p6 + p6 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
+                                 p0 * 2 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
+            dst[strideb * -1] = (p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+                                 q0 * 2 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
+            dst[strideb * +0] = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+                                 q1 * 2 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+            dst[strideb * +1] = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+                                 q2 * 2 + q3 + q4 + q5 + q6 + q6 + 8) >> 4;
+            dst[strideb * +2] = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
+                                 q3 * 2 + q4 + q5 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +3] = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
+                                 q4 * 2 + q5 + q6 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +4] = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
+                                 q5 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +5] = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 +
+                                 q6 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
+        } else if (wd >= 8 && flat8in) {
+            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
+            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
+            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
+            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
+            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
+            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
+        } else if (wd == 6 && flat8in) {
+            dst[strideb * -2] = (p2 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3;
+            dst[strideb * -1] = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
+            dst[strideb * +0] = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
+            dst[strideb * +1] = (p0 + 2 * q0 + 2 * q1 + 2 * q2 + q2 + 4) >> 3;
+        } else {
+            const int hev = abs(p1 - p0) > H || abs(q1 - q0) > H;
+
+#define iclip_diff(v) iclip(v, -128 * (1 << (BITDEPTH - 8)), \
+                                128 * (1 << (BITDEPTH - 8)) - 1)
+
+            if (hev) {
+                int f = iclip_diff(p1 - q1), f1, f2;
+                f = iclip_diff(3 * (q0 - p0) + f);
+
+                f1 = imin(f + 4, (128 << (BITDEPTH - 8)) - 1) >> 3;
+                f2 = imin(f + 3, (128 << (BITDEPTH - 8)) - 1) >> 3;
+
+                dst[strideb * -1] = iclip_pixel(p0 + f2);
+                dst[strideb * +0] = iclip_pixel(q0 - f1);
+            } else {
+                int f = iclip_diff(3 * (q0 - p0)), f1, f2;
+
+                f1 = imin(f + 4, (128 << (BITDEPTH - 8)) - 1) >> 3;
+                f2 = imin(f + 3, (128 << (BITDEPTH - 8)) - 1) >> 3;
+
+                dst[strideb * -1] = iclip_pixel(p0 + f2);
+                dst[strideb * +0] = iclip_pixel(q0 - f1);
+
+                f = (f1 + 1) >> 1;
+                dst[strideb * -2] = iclip_pixel(p1 + f);
+                dst[strideb * +1] = iclip_pixel(q1 - f);
+            }
+#undef iclip_diff
+        }
+    }
+}
+
+static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,
+                                   const uint32_t *const vmask,
+                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                   const Av1FilterLUT *lut, const int h)
+{
+    const unsigned vm = (vmask[0] | vmask[1] | vmask[2]) & ((1ULL << h) - 1);
+    for (unsigned y = 1; vm & ~(y - 1);
+         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
+    {
+        if (vm & y) {
+            const int L = l[0][0] ? l[0][0] : l[-1][0];
+            if (!L) continue;
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y);
+            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx);
+        }
+    }
+}
+
+static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
+                                   const uint32_t *const vmask,
+                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                   const Av1FilterLUT *lut, const int w)
+{
+    const unsigned vm = vmask[0] | vmask[1] | vmask[2];
+    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
+        if (vm & x) {
+            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
+            if (!L) continue;
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x);
+            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx);
+        }
+    }
+}
+
+static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,
+                                    const uint32_t *const vmask,
+                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                    const Av1FilterLUT *lut, const int h)
+{
+    const unsigned vm = (vmask[0] | vmask[1]) & ((1ULL << h) - 1);
+    for (unsigned y = 1; vm & ~(y - 1);
+         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
+    {
+        if (vm & y) {
+            const int L = l[0][0] ? l[0][0] : l[-1][0];
+            if (!L) continue;
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = !!(vmask[1] & y);
+            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx);
+        }
+    }
+}
+
+static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
+                                    const uint32_t *const vmask,
+                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                    const Av1FilterLUT *lut, const int w)
+{
+    const unsigned vm = vmask[0] | vmask[1];
+    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
+        if (vm & x) {
+            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
+            if (!L) continue;
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = !!(vmask[1] & x);
+            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx);
+        }
+    }
+}
+
+void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
+    c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
+    c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
+    c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
+    c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
+
+#if HAVE_ASM && ARCH_X86
+    bitfn(dav1d_loop_filter_dsp_init_x86)(c);
+#endif
+}
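For context on how these templated files are consumed (background, not part of this patch): each *_tmpl.c unit is expected to be built once per supported bit depth with BITDEPTH predefined, so the pixel, PXSTRIDE and bitfn macros used above take on per-depth meanings and the same source yields both an 8bpc and a 16bpc dav1d_loop_filter_dsp_init. The sketch below is a minimal illustration modeled on dav1d's common/bitdepth.h; treat the exact definitions as assumptions, not a verbatim copy of that header.

#include <stdint.h>

#if BITDEPTH == 8
typedef uint8_t pixel;
#define PXSTRIDE(x) (x)          /* byte stride equals pixel stride at 8 bpc */
#define bitfn(x) x##_8bpc        /* e.g. dav1d_loop_filter_dsp_init_8bpc */
#else
typedef uint16_t pixel;
#define PXSTRIDE(x) ((x) >> 1)   /* convert byte stride to pixel stride */
#define bitfn(x) x##_16bpc       /* e.g. dav1d_loop_filter_dsp_init_16bpc */
#endif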
--- a/src/looprestoration.c
+++ /dev/null
@@ -1,577 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <stdlib.h>
-
-#include "common/intops.h"
-
-#include "src/looprestoration.h"
-#include "src/tables.h"
-
-// 256 * 1.5 + 3 + 3 = 390
-#define REST_UNIT_STRIDE (390)
-
-// TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
-// TODO Chroma only requires 2 rows of padding.
-static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
-                    const pixel (*left)[4],
-                    const pixel *lpf, const ptrdiff_t lpf_stride,
-                    int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
-{
-    const int have_left = !!(edges & LR_HAVE_LEFT);
-    const int have_right = !!(edges & LR_HAVE_RIGHT);
-
-    // Copy more pixels if we don't have to pad them
-    unit_w += 3 * have_left + 3 * have_right;
-    pixel *dst_l = dst + 3 * !have_left;
-    p -= 3 * have_left;
-    lpf -= 3 * have_left;
-
-    if (edges & LR_HAVE_TOP) {
-        // Copy previous loop filtered rows
-        const pixel *const above_1 = lpf;
-        const pixel *const above_2 = above_1 + PXSTRIDE(lpf_stride);
-        pixel_copy(dst_l, above_1, unit_w);
-        pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
-        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
-    } else {
-        // Pad with first row
-        pixel_copy(dst_l, p, unit_w);
-        pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
-        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
-        if (have_left) {
-            pixel_copy(dst_l, &left[0][1], 3);
-            pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
-            pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
-        }
-    }
-
-    pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
-    if (edges & LR_HAVE_BOTTOM) {
-        // Copy next loop filtered rows
-        const pixel *const below_1 = lpf + 6 * PXSTRIDE(lpf_stride);
-        const pixel *const below_2 = below_1 + PXSTRIDE(lpf_stride);
-        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
-        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
-        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
-    } else {
-        // Pad with last row
-        const pixel *const src = p + (stripe_h - 1) * PXSTRIDE(p_stride);
-        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
-        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
-        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
-        if (have_left) {
-            pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
-            pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
-            pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
-        }
-    }
-
-    // Inner UNIT_WxSTRIPE_H
-    for (int j = 0; j < stripe_h; j++) {
-        pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
-        dst_tl += REST_UNIT_STRIDE;
-        p += PXSTRIDE(p_stride);
-    }
-
-    if (!have_right) {
-        pixel *pad = dst_l + unit_w;
-        pixel *row_last = &dst_l[unit_w - 1];
-        // Pad 3x(STRIPE_H+6) with last column
-        for (int j = 0; j < stripe_h + 6; j++) {
-            pixel_set(pad, *row_last, 3);
-            pad += REST_UNIT_STRIDE;
-            row_last += REST_UNIT_STRIDE;
-        }
-    }
-
-    if (!have_left) {
-        // Pad 3x(STRIPE_H+6) with first column
-        for (int j = 0; j < stripe_h + 6; j++) {
-            pixel_set(dst, *dst_l, 3);
-            dst += REST_UNIT_STRIDE;
-            dst_l += REST_UNIT_STRIDE;
-        }
-    } else {
-        dst += 3 * REST_UNIT_STRIDE;
-        for (int j = 0; j < stripe_h; j++) {
-            pixel_copy(dst, &left[j][1], 3);
-            dst += REST_UNIT_STRIDE;
-        }
-    }
-}
-
-// FIXME Could split into luma and chroma specific functions,
-// (since first and last taps are always 0 for chroma)
-// FIXME Could implement a version that requires less temporary memory
-// (should be possible to implement with only 6 rows of temp storage)
-static void wiener_c(pixel *p, const ptrdiff_t p_stride,
-                     const pixel (*const left)[4],
-                     const pixel *lpf, const ptrdiff_t lpf_stride,
-                     const int w, const int h,
-                     const int16_t filterh[7], const int16_t filterv[7],
-                     const enum LrEdgeFlags edges)
-{
-    // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
-    // of padding above and below
-    pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
-    pixel *tmp_ptr = tmp;
-
-    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
-
-    // Values stored between horizontal and vertical filtering don't
-    // fit in a uint8_t.
-    uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
-    uint16_t *hor_ptr = hor;
-
-    const int round_bits_h = 3 + (BITDEPTH == 12) * 2;
-    const int rounding_off_h = 1 << (round_bits_h - 1);
-    const int clip_limit = 1 << ((BITDEPTH) + 1 + 7 - round_bits_h);
-    for (int j = 0; j < h + 6; j++) {
-        for (int i = 0; i < w; i++) {
-            int sum = (tmp_ptr[i + 3] << 7) + (1 << (BITDEPTH + 6));
-
-            for (int k = 0; k < 7; k++) {
-                sum += tmp_ptr[i + k] * filterh[k];
-            }
-
-            hor_ptr[i] =
-                iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
-        }
-        tmp_ptr += REST_UNIT_STRIDE;
-        hor_ptr += REST_UNIT_STRIDE;
-    }
-
-    const int round_bits_v = 11 - (BITDEPTH == 12) * 2;
-    const int rounding_off_v = 1 << (round_bits_v - 1);
-    const int round_offset = 1 << (BITDEPTH + (round_bits_v - 1));
-    for (int i = 0; i < w; i++) {
-        for (int j = 0; j < h; j++) {
-            int sum = (hor[(j + 3) * REST_UNIT_STRIDE + i] << 7) - round_offset;
-
-            for (int k = 0; k < 7; k++) {
-                sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k];
-            }
-
-            p[j * PXSTRIDE(p_stride) + i] =
-                iclip_pixel((sum + rounding_off_v) >> round_bits_v);
-        }
-    }
-}
-
-// Sum over a 3x3 area
-// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
-// left of the top left corner. However, the self guided filter only needs 1
-// pixel above and one pixel to the left. As for the pixels below and to the
-// right they must be computed in the sums, but don't need to be stored.
-//
-// Example for a 4x4 block:
-//      x x x x x x x x x x
-//      x c c c c c c c c x
-//      x i s s s s s s i x
-//      x i s s s s s s i x
-//      x i s s s s s s i x
-//      x i s s s s s s i x
-//      x i s s s s s s i x
-//      x i s s s s s s i x
-//      x c c c c c c c c x
-//      x x x x x x x x x x
-//
-// s: Pixel summed and stored
-// i: Pixel summed and stored (between loops)
-// c: Pixel summed not stored
-// x: Pixel not summed not stored
-static void boxsum3(coef *dst, const pixel *src, const int w, const int h) {
-    // We skip the first row, as it is never used
-    src += REST_UNIT_STRIDE;
-    dst += REST_UNIT_STRIDE;
-
-    // We skip the first and last columns, as they are never used
-    for (int x = 1; x < w - 1; x++) {
-        coef *ds = dst + x;
-        const pixel *s = src + x;
-        int a = s[0], b = s[REST_UNIT_STRIDE];
-
-        // We skip the first 2 rows, as they are skipped in the next loop and
-        // we don't need the last 2 rows as they are skipped in the next loop
-        for (int y = 2; y < h - 2; y++) {
-            s += REST_UNIT_STRIDE;
-            const int c = s[REST_UNIT_STRIDE];
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c;
-            a = b;
-            b = c;
-        }
-     }
-
-    // We skip the first 2 rows as they are never read
-    dst += REST_UNIT_STRIDE;
-    // We skip the last 2 rows as they are never read
-    for (int y = 2; y < h - 2; y++) {
-        int a = dst[1], b = dst[2];
-
-        // We don't store the first column as it is never read and
-        // we don't store the last 2 columns as they are never read
-        for (int x = 2; x < w - 2; x++) {
-            const int c = dst[x + 1];
-            dst[x] = a + b + c;
-            a = b;
-            b = c;
-        }
-        dst += REST_UNIT_STRIDE;
-    }
-}
-
-// Sum over a 5x5 area
-// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
-// left of the top left corner. However, the self guided filter only needs 1
-// pixel above and one pixel to the left. As for the pixels below and to the
-// right they must be computed in the sums, but don't need to be stored.
-//
-// Example for a 4x4 block:
-//      c c c c c c c c c c
-//      c c c c c c c c c c
-//      i i s s s s s s i i
-//      i i s s s s s s i i
-//      i i s s s s s s i i
-//      i i s s s s s s i i
-//      i i s s s s s s i i
-//      i i s s s s s s i i
-//      c c c c c c c c c c
-//      c c c c c c c c c c
-//
-// s: Pixel summed and stored
-// i: Pixel summed and stored (between loops)
-// c: Pixel summed not stored
-// x: Pixel not summed not stored
-static void boxsum5(coef *dst, const pixel *const src, const int w, const int h) {
-    // We skip the first row, as it is never used
-    dst += REST_UNIT_STRIDE;
-
-    for (int x = 0; x < w; x++) {
-        coef *ds = dst + x;
-        const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
-        int a = s[-3 * REST_UNIT_STRIDE];
-        int b = s[-2 * REST_UNIT_STRIDE];
-        int c = s[-1 * REST_UNIT_STRIDE];
-        int d = s[0];
-
-        // We skip the first 2 rows, as they are skipped in the next loop and
-        // we don't need the last 2 rows as they are skipped in the next loop
-        for (int y = 2; y < h - 2; y++) {
-            s += REST_UNIT_STRIDE;
-            const int e = *s;
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c + d + e;
-            a = b;
-            b = c;
-            c = d;
-            d = e;
-        }
-    }
-
-    // We skip the first 2 rows as they are never read
-    dst += REST_UNIT_STRIDE;
-    for (int y = 2; y < h - 2; y++) {
-        int a = dst[0];
-        int b = dst[1];
-        int c = dst[2];
-        int d = dst[3];
-
-        for (int x = 2; x < w - 2; x++) {
-            const int e = dst[x + 2];
-            dst[x] = a + b + c + d + e;
-            a = b;
-            b = c;
-            c = d;
-            d = e;
-        }
-        dst += REST_UNIT_STRIDE;
-    }
-}
-
-// See boxsum3 function comments for details on row and column skipping
-static void boxsum3sqr(int32_t *dst, const pixel *src, const int w, const int h) {
-    // We skip the first row, as it is never used
-    src += REST_UNIT_STRIDE;
-    dst += REST_UNIT_STRIDE;
-
-    // We skip the first and last columns, as they are never used
-    for (int x = 1; x < w - 1; x++) {
-        int *ds = dst + x;
-        const pixel *s = src + x;
-        int a = s[0] * s[0];
-        int b = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
-
-        // We skip the first row, as it is skipped in the next loop and
-        // we don't need the last row as it is skipped in the next loop
-        for (int y = 2; y < h - 2; y++) {
-            s += REST_UNIT_STRIDE;
-            const int c = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c;
-            a = b;
-            b = c;
-        }
-     }
-
-    // We skip the first row as it is never read
-    dst += REST_UNIT_STRIDE;
-    // We skip the last row as it is never read
-    for (int y = 2; y < h - 2; y++) {
-        int a = dst[1], b = dst[2];
-
-        // We don't store the first column as it is never read and
-        // we don't store the last 2 columns as they are never read
-        for (int x = 2; x < w - 2; x++) {
-            const int c = dst[x + 1];
-            dst[x] = a + b + c;
-            a = b;
-            b = c;
-        }
-        dst += REST_UNIT_STRIDE;
-    }
-}
-
-// See boxsum5 function comments for details on row and column skipping
-static void boxsum5sqr(int32_t *dst, const pixel *const src, const int w,
-                       const int h)
-{
-    // We skip the first row, as it is never used
-    dst += REST_UNIT_STRIDE;
-
-    for (int x = 0; x < w; x++) {
-        int *ds = dst + x;
-        const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
-        int a = s[-3 * REST_UNIT_STRIDE] * s[-3 * REST_UNIT_STRIDE];
-        int b = s[-2 * REST_UNIT_STRIDE] * s[-2 * REST_UNIT_STRIDE];
-        int c = s[-1 * REST_UNIT_STRIDE] * s[-1 * REST_UNIT_STRIDE];
-        int d = s[0] * s[0];
-
-        // We skip the first 2 rows, as they are skipped in the next loop and
-        // we don't need the last 2 rows as they are skipped in the next loop
-        for (int y = 2; y < h - 2; y++) {
-            s += REST_UNIT_STRIDE;
-            const int e = s[0] * s[0];
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c + d + e;
-            a = b;
-            b = c;
-            c = d;
-            d = e;
-        }
-    }
-
-    // We skip the first 2 rows as they are never read
-    dst += REST_UNIT_STRIDE;
-    for (int y = 2; y < h - 2; y++) {
-        int a = dst[0];
-        int b = dst[1];
-        int c = dst[2];
-        int d = dst[3];
-
-        for (int x = 2; x < w - 2; x++) {
-            const int e = dst[x + 2];
-            dst[x] = a + b + c + d + e;
-            a = b;
-            b = c;
-            c = d;
-            d = e;
-        }
-        dst += REST_UNIT_STRIDE;
-    }
-}
-
-static void selfguided_filter(int32_t *dst, const pixel *src,
-                              const ptrdiff_t src_stride, const int w,
-                              const int h, const int n, const int s)
-{
-    // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
-    // of padding above and below
-    int32_t A_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
-    int32_t *A = A_ + 3 * REST_UNIT_STRIDE + 3;
-    // By inverting A and B after the boxsums, B can be of size coef instead
-    // of int32_t
-    coef B_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
-    coef *B = B_ + 3 * REST_UNIT_STRIDE + 3;
-
-    const int step = (n == 25) + 1;
-    if (n == 25) {
-        boxsum5(B_, src, w + 6, h + 6);
-        boxsum5sqr(A_, src, w + 6, h + 6);
-    } else {
-        boxsum3(B_, src, w + 6, h + 6);
-        boxsum3sqr(A_, src, w + 6, h + 6);
-    }
-
-    int32_t *AA = A - REST_UNIT_STRIDE;
-    coef *BB = B - REST_UNIT_STRIDE;
-    for (int j = -1; j < h + 1; j+= step) {
-        for (int i = -1; i < w + 1; i++) {
-            const int a =
-                (AA[i] + (1 << (2 * (BITDEPTH - 8)) >> 1)) >> (2 * (BITDEPTH - 8));
-            const int b =
-                (BB[i] + (1 << (BITDEPTH - 8) >> 1)) >> (BITDEPTH - 8);
-
-            const uint32_t p = (a * n >= b * b) * (a * n - b * b);
-            const uint32_t z = (p * s + (1 << 19)) >> 20;
-
-            const int x = dav1d_sgr_x_by_xplus1[imin(z, 255)];
-            // This is where we invert A and B, so that B is of size coef.
-            AA[i] = (((1 << 8) - x) * BB[i] * dav1d_sgr_one_by_x[n - 1] + (1 << 11)) >> 12;
-            BB[i] = x;
-        }
-        AA += step * REST_UNIT_STRIDE;
-        BB += step * REST_UNIT_STRIDE;
-    }
-
-    src += 3 * REST_UNIT_STRIDE + 3;
-    if (n == 25) {
-        int j = 0;
-#define SIX_NEIGHBORS(P, i)\
-    ((P[i - REST_UNIT_STRIDE]     + P[i + REST_UNIT_STRIDE]) * 6 +   \
-     (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] +    \
-      P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 5)
-        for (; j < h - 1; j+=2) {
-            for (int i = 0; i < w; i++) {
-                const int32_t a = SIX_NEIGHBORS(B, i);
-                const int32_t b = SIX_NEIGHBORS(A, i);
-                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
-            }
-            dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
-            src += REST_UNIT_STRIDE;
-            B += REST_UNIT_STRIDE;
-            A += REST_UNIT_STRIDE;
-            for (int i = 0; i < w; i++) {
-                const int32_t a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
-                const int32_t b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
-                dst[i] = (a * src[i] + b + (1 << 7)) >> 8;
-            }
-            dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
-            src += REST_UNIT_STRIDE;
-            B += REST_UNIT_STRIDE;
-            A += REST_UNIT_STRIDE;
-        }
-        if (j + 1 == h) { // Last row, when number of rows is odd
-            for (int i = 0; i < w; i++) {
-                const int32_t a = SIX_NEIGHBORS(B, i);
-                const int32_t b = SIX_NEIGHBORS(A, i);
-                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
-            }
-        }
-#undef SIX_NEIGHBORS
-    } else {
-#define EIGHT_NEIGHBORS(P, i)\
-    ((P[i] + P[i - 1] + P[i + 1] + P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 4 + \
-     (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] +                           \
-      P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 3)
-        for (int j = 0; j < h; j++) {
-            for (int i = 0; i < w; i++) {
-                const int32_t a = EIGHT_NEIGHBORS(B, i);
-                const int32_t b = EIGHT_NEIGHBORS(A, i);
-                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
-            }
-            dst += 384;
-            src += REST_UNIT_STRIDE;
-            B += REST_UNIT_STRIDE;
-            A += REST_UNIT_STRIDE;
-        }
-    }
-#undef EIGHT_NEIGHBORS
-}
-
-static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
-                         const pixel (*const left)[4],
-                         const pixel *lpf, const ptrdiff_t lpf_stride,
-                         const int w, const int h, const int sgr_idx,
-                         const int16_t sgr_w[2], const enum LrEdgeFlags edges)
-{
-    // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
-    // of padding above and below
-    pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
-
-    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
-
-    // Selfguided filter outputs to a maximum stripe height of 64 and a
-    // maximum restoration width of 384 (256 * 1.5)
-    int32_t dst[64 * 384];
-
-    // r0 and r1 cannot both be zero
-    if (!dav1d_sgr_params[sgr_idx][0]) {
-        const int s1 = dav1d_sgr_params[sgr_idx][3];
-        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, s1);
-        const int w1 = (1 << 7) - sgr_w[1];
-        for (int j = 0; j < h; j++) {
-            for (int i = 0; i < w; i++) {
-                const int32_t u = (p[i] << 4);
-                const int32_t v = (u << 7) + w1 * (dst[j * 384 + i] - u);
-                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
-            }
-            p += PXSTRIDE(p_stride);
-        }
-    } else if (!dav1d_sgr_params[sgr_idx][1]) {
-        const int s0 = dav1d_sgr_params[sgr_idx][2];
-        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0);
-        const int w0 = sgr_w[0];
-        for (int j = 0; j < h; j++) {
-            for (int i = 0; i < w; i++) {
-                const int32_t u = (p[i] << 4);
-                const int32_t v = (u << 7) + w0 * (dst[j * 384 + i] - u);
-                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
-            }
-            p += PXSTRIDE(p_stride);
-        }
-    } else {
-        int32_t dst1[64 * 384];
-        const int s0 = dav1d_sgr_params[sgr_idx][2];
-        const int s1 = dav1d_sgr_params[sgr_idx][3];
-        const int w0 = sgr_w[0];
-        const int w1 = (1 << 7) - w0 - sgr_w[1];
-        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0);
-        selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, s1);
-        for (int j = 0; j < h; j++) {
-            for (int i = 0; i < w; i++) {
-                const int32_t u = (p[i] << 4);
-                const int32_t v = (u << 7) + w0 * (dst[j * 384 + i] - u) +
-                                  w1 * (dst1[j * 384 + i] - u);
-                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
-            }
-            p += PXSTRIDE(p_stride);
-        }
-    }
-}
-
-void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) {
-    c->wiener = wiener_c;
-    c->selfguided = selfguided_c;
-
-#if HAVE_ASM && ARCH_X86 && BITDEPTH == 8
-    bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
-#endif
-}
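The per-pixel combination at the end of selfguided_c above reduces to a weighted blend around the source sample: u = p << 4, v = (u << 7) + w0 * (dst - u) + w1 * (dst1 - u), output = clip((v + 1024) >> 11). The following is a minimal, self-contained sketch of that arithmetic for the 8-bit case; the function name and parameters are invented for illustration and are not part of the patch.

#include <stdint.h>

/* Hypothetical helper mirroring the final blend in selfguided_c() at 8 bpc:
 * flt0/flt1 are the two self-guided filter outputs at pixel<<4 precision and
 * w0/w1 are 7-bit mixing weights (the remaining weight stays on the source). */
static int sgr_blend_px(const int px, const int32_t flt0, const int32_t flt1,
                        const int w0, const int w1)
{
    const int32_t u = px << 4;                      /* source at filter precision */
    const int32_t v = (u << 7) + w0 * (flt0 - u) + w1 * (flt1 - u);
    const int out = (v + (1 << 10)) >> 11;          /* round, drop 11 fractional bits */
    return out < 0 ? 0 : out > 255 ? 255 : out;     /* iclip_pixel for 8 bpc */
}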
--- /dev/null
+++ b/src/looprestoration_tmpl.c
@@ -1,0 +1,577 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/looprestoration.h"
+#include "src/tables.h"
+
+// 256 * 1.5 + 3 + 3 = 390
+#define REST_UNIT_STRIDE (390)
+
+// TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
+// TODO Chroma only requires 2 rows of padding.
+static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
+                    const pixel (*left)[4],
+                    const pixel *lpf, const ptrdiff_t lpf_stride,
+                    int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
+{
+    const int have_left = !!(edges & LR_HAVE_LEFT);
+    const int have_right = !!(edges & LR_HAVE_RIGHT);
+
+    // Copy more pixels if we don't have to pad them
+    unit_w += 3 * have_left + 3 * have_right;
+    pixel *dst_l = dst + 3 * !have_left;
+    p -= 3 * have_left;
+    lpf -= 3 * have_left;
+
+    if (edges & LR_HAVE_TOP) {
+        // Copy previous loop filtered rows
+        const pixel *const above_1 = lpf;
+        const pixel *const above_2 = above_1 + PXSTRIDE(lpf_stride);
+        pixel_copy(dst_l, above_1, unit_w);
+        pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
+        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
+    } else {
+        // Pad with first row
+        pixel_copy(dst_l, p, unit_w);
+        pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
+        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
+        if (have_left) {
+            pixel_copy(dst_l, &left[0][1], 3);
+            pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+            pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+        }
+    }
+
+    pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
+    if (edges & LR_HAVE_BOTTOM) {
+        // Copy next loop filtered rows
+        const pixel *const below_1 = lpf + 6 * PXSTRIDE(lpf_stride);
+        const pixel *const below_2 = below_1 + PXSTRIDE(lpf_stride);
+        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
+    } else {
+        // Pad with last row
+        const pixel *const src = p + (stripe_h - 1) * PXSTRIDE(p_stride);
+        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+        if (have_left) {
+            pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+            pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+            pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+        }
+    }
+
+    // Inner UNIT_WxSTRIPE_H
+    for (int j = 0; j < stripe_h; j++) {
+        pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
+        dst_tl += REST_UNIT_STRIDE;
+        p += PXSTRIDE(p_stride);
+    }
+
+    if (!have_right) {
+        pixel *pad = dst_l + unit_w;
+        pixel *row_last = &dst_l[unit_w - 1];
+        // Pad 3x(STRIPE_H+6) with last column
+        for (int j = 0; j < stripe_h + 6; j++) {
+            pixel_set(pad, *row_last, 3);
+            pad += REST_UNIT_STRIDE;
+            row_last += REST_UNIT_STRIDE;
+        }
+    }
+
+    if (!have_left) {
+        // Pad 3x(STRIPE_H+6) with first column
+        for (int j = 0; j < stripe_h + 6; j++) {
+            pixel_set(dst, *dst_l, 3);
+            dst += REST_UNIT_STRIDE;
+            dst_l += REST_UNIT_STRIDE;
+        }
+    } else {
+        dst += 3 * REST_UNIT_STRIDE;
+        for (int j = 0; j < stripe_h; j++) {
+            pixel_copy(dst, &left[j][1], 3);
+            dst += REST_UNIT_STRIDE;
+        }
+    }
+}
+
+// FIXME Could split into luma and chroma specific functions,
+// (since first and last taps are always 0 for chroma)
+// FIXME Could implement a version that requires less temporary memory
+// (should be possible to implement with only 6 rows of temp storage)
+static void wiener_c(pixel *p, const ptrdiff_t p_stride,
+                     const pixel (*const left)[4],
+                     const pixel *lpf, const ptrdiff_t lpf_stride,
+                     const int w, const int h,
+                     const int16_t filterh[7], const int16_t filterv[7],
+                     const enum LrEdgeFlags edges)
+{
+    // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
+    // of padding above and below
+    pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+    pixel *tmp_ptr = tmp;
+
+    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
+
+    // Values stored between horizontal and vertical filtering don't
+    // fit in a uint8_t.
+    uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+    uint16_t *hor_ptr = hor;
+
+    const int round_bits_h = 3 + (BITDEPTH == 12) * 2;
+    const int rounding_off_h = 1 << (round_bits_h - 1);
+    const int clip_limit = 1 << ((BITDEPTH) + 1 + 7 - round_bits_h);
+    for (int j = 0; j < h + 6; j++) {
+        for (int i = 0; i < w; i++) {
+            int sum = (tmp_ptr[i + 3] << 7) + (1 << (BITDEPTH + 6));
+
+            for (int k = 0; k < 7; k++) {
+                sum += tmp_ptr[i + k] * filterh[k];
+            }
+
+            hor_ptr[i] =
+                iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
+        }
+        tmp_ptr += REST_UNIT_STRIDE;
+        hor_ptr += REST_UNIT_STRIDE;
+    }
+
+    const int round_bits_v = 11 - (BITDEPTH == 12) * 2;
+    const int rounding_off_v = 1 << (round_bits_v - 1);
+    const int round_offset = 1 << (BITDEPTH + (round_bits_v - 1));
+    for (int i = 0; i < w; i++) {
+        for (int j = 0; j < h; j++) {
+            int sum = (hor[(j + 3) * REST_UNIT_STRIDE + i] << 7) - round_offset;
+
+            for (int k = 0; k < 7; k++) {
+                sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k];
+            }
+
+            p[j * PXSTRIDE(p_stride) + i] =
+                iclip_pixel((sum + rounding_off_v) >> round_bits_v);
+        }
+    }
+}
+
+// Sum over a 3x3 area
+// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
+// left of the top left corner. However, the self guided filter only needs 1
+// pixel above and one pixel to the left. As for the pixels below and to the
+// right they must be computed in the sums, but don't need to be stored.
+//
+// Example for a 4x4 block:
+//      x x x x x x x x x x
+//      x c c c c c c c c x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x c c c c c c c c x
+//      x x x x x x x x x x
+//
+// s: Pixel summed and stored
+// i: Pixel summed and stored (between loops)
+// c: Pixel summed not stored
+// x: Pixel not summed not stored
+static void boxsum3(coef *dst, const pixel *src, const int w, const int h) {
+    // We skip the first row, as it is never used
+    src += REST_UNIT_STRIDE;
+    dst += REST_UNIT_STRIDE;
+
+    // We skip the first and last columns, as they are never used
+    for (int x = 1; x < w - 1; x++) {
+        coef *ds = dst + x;
+        const pixel *s = src + x;
+        int a = s[0], b = s[REST_UNIT_STRIDE];
+
+        // We skip the first 2 rows, as they are skipped in the next loop and
+        // we don't need the last 2 rows as they are skipped in the next loop
+        for (int y = 2; y < h - 2; y++) {
+            s += REST_UNIT_STRIDE;
+            const int c = s[REST_UNIT_STRIDE];
+            ds += REST_UNIT_STRIDE;
+            *ds = a + b + c;
+            a = b;
+            b = c;
+        }
+     }
+
+    // We skip the first 2 rows as they are never read
+    dst += REST_UNIT_STRIDE;
+    // We skip the last 2 rows as they are never read
+    for (int y = 2; y < h - 2; y++) {
+        int a = dst[1], b = dst[2];
+
+        // We don't store the first column as it is never read and
+        // we don't store the last 2 columns as they are never read
+        for (int x = 2; x < w - 2; x++) {
+            const int c = dst[x + 1];
+            dst[x] = a + b + c;
+            a = b;
+            b = c;
+        }
+        dst += REST_UNIT_STRIDE;
+    }
+}
+
+// Sum over a 5x5 area
+// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
+// left of the top left corner. However, the self guided filter only needs 1
+// pixel above and one pixel to the left. As for the pixels below and to the
+// right they must be computed in the sums, but don't need to be stored.
+//
+// Example for a 4x4 block:
+//      c c c c c c c c c c
+//      c c c c c c c c c c
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      c c c c c c c c c c
+//      c c c c c c c c c c
+//
+// s: Pixel summed and stored
+// i: Pixel summed and stored (between loops)
+// c: Pixel summed not stored
+// x: Pixel not summed not stored
+static void boxsum5(coef *dst, const pixel *const src, const int w, const int h) {
+    // We skip the first row, as it is never used
+    dst += REST_UNIT_STRIDE;
+
+    for (int x = 0; x < w; x++) {
+        coef *ds = dst + x;
+        const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
+        int a = s[-3 * REST_UNIT_STRIDE];
+        int b = s[-2 * REST_UNIT_STRIDE];
+        int c = s[-1 * REST_UNIT_STRIDE];
+        int d = s[0];
+
+        // We skip the first 2 rows, as they are skipped in the next loop and
+        // we don't need the last 2 rows as they are skipped in the next loop
+        for (int y = 2; y < h - 2; y++) {
+            s += REST_UNIT_STRIDE;
+            const int e = *s;
+            ds += REST_UNIT_STRIDE;
+            *ds = a + b + c + d + e;
+            a = b;
+            b = c;
+            c = d;
+            d = e;
+        }
+    }
+
+    // We skip the first 2 rows as they are never read
+    dst += REST_UNIT_STRIDE;
+    for (int y = 2; y < h - 2; y++) {
+        int a = dst[0];
+        int b = dst[1];
+        int c = dst[2];
+        int d = dst[3];
+
+        for (int x = 2; x < w - 2; x++) {
+            const int e = dst[x + 2];
+            dst[x] = a + b + c + d + e;
+            a = b;
+            b = c;
+            c = d;
+            d = e;
+        }
+        dst += REST_UNIT_STRIDE;
+    }
+}
+
+// See boxsum3 function comments for details on row and column skipping
+static void boxsum3sqr(int32_t *dst, const pixel *src, const int w, const int h) {
+    // We skip the first row, as it is never used
+    src += REST_UNIT_STRIDE;
+    dst += REST_UNIT_STRIDE;
+
+    // We skip the first and last columns, as they are never used
+    for (int x = 1; x < w - 1; x++) {
+        int *ds = dst + x;
+        const pixel *s = src + x;
+        int a = s[0] * s[0];
+        int b = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
+
+        // We skip the first row, as it is skipped in the next loop and
+        // we don't need the last row as it is skipped in the next loop
+        for (int y = 2; y < h - 2; y++) {
+            s += REST_UNIT_STRIDE;
+            const int c = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
+            ds += REST_UNIT_STRIDE;
+            *ds = a + b + c;
+            a = b;
+            b = c;
+        }
+     }
+
+    // We skip the first row as it is never read
+    dst += REST_UNIT_STRIDE;
+    // We skip the last row as it is never read
+    for (int y = 2; y < h - 2; y++) {
+        int a = dst[1], b = dst[2];
+
+        // We don't store the first column as it is never read and
+        // we don't store the last 2 columns as they are never read
+        for (int x = 2; x < w - 2; x++) {
+            const int c = dst[x + 1];
+            dst[x] = a + b + c;
+            a = b;
+            b = c;
+        }
+        dst += REST_UNIT_STRIDE;
+    }
+}
+
+// See boxsum5 function comments for details on row and column skipping
+static void boxsum5sqr(int32_t *dst, const pixel *const src, const int w,
+                       const int h)
+{
+    // We skip the first row, as it is never used
+    dst += REST_UNIT_STRIDE;
+
+    for (int x = 0; x < w; x++) {
+        int *ds = dst + x;
+        const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
+        int a = s[-3 * REST_UNIT_STRIDE] * s[-3 * REST_UNIT_STRIDE];
+        int b = s[-2 * REST_UNIT_STRIDE] * s[-2 * REST_UNIT_STRIDE];
+        int c = s[-1 * REST_UNIT_STRIDE] * s[-1 * REST_UNIT_STRIDE];
+        int d = s[0] * s[0];
+
+        // We skip the first 2 rows, as they are skipped in the next loop and
+        // we don't need the last 2 rows as they are skipped in the next loop
+        for (int y = 2; y < h - 2; y++) {
+            s += REST_UNIT_STRIDE;
+            const int e = s[0] * s[0];
+            ds += REST_UNIT_STRIDE;
+            *ds = a + b + c + d + e;
+            a = b;
+            b = c;
+            c = d;
+            d = e;
+        }
+    }
+
+    // We skip the first 2 rows as they are never read
+    dst += REST_UNIT_STRIDE;
+    for (int y = 2; y < h - 2; y++) {
+        int a = dst[0];
+        int b = dst[1];
+        int c = dst[2];
+        int d = dst[3];
+
+        for (int x = 2; x < w - 2; x++) {
+            const int e = dst[x + 2];
+            dst[x] = a + b + c + d + e;
+            a = b;
+            b = c;
+            c = d;
+            d = e;
+        }
+        dst += REST_UNIT_STRIDE;
+    }
+}
+
+static void selfguided_filter(int32_t *dst, const pixel *src,
+                              const ptrdiff_t src_stride, const int w,
+                              const int h, const int n, const int s)
+{
+    // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
+    // of padding above and below
+    int32_t A_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+    int32_t *A = A_ + 3 * REST_UNIT_STRIDE + 3;
+    // By inverting A and B after the boxsums, B can be of size coef instead
+    // of int32_t
+    coef B_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+    coef *B = B_ + 3 * REST_UNIT_STRIDE + 3;
+
+    const int step = (n == 25) + 1;
+    if (n == 25) {
+        boxsum5(B_, src, w + 6, h + 6);
+        boxsum5sqr(A_, src, w + 6, h + 6);
+    } else {
+        boxsum3(B_, src, w + 6, h + 6);
+        boxsum3sqr(A_, src, w + 6, h + 6);
+    }
+
+    int32_t *AA = A - REST_UNIT_STRIDE;
+    coef *BB = B - REST_UNIT_STRIDE;
+    for (int j = -1; j < h + 1; j+= step) {
+        for (int i = -1; i < w + 1; i++) {
+            const int a =
+                (AA[i] + (1 << (2 * (BITDEPTH - 8)) >> 1)) >> (2 * (BITDEPTH - 8));
+            const int b =
+                (BB[i] + (1 << (BITDEPTH - 8) >> 1)) >> (BITDEPTH - 8);
+
+            const uint32_t p = (a * n >= b * b) * (a * n - b * b);
+            const uint32_t z = (p * s + (1 << 19)) >> 20;
+
+            const int x = dav1d_sgr_x_by_xplus1[imin(z, 255)];
+            // This is where we invert A and B, so that B is of size coef.
+            AA[i] = (((1 << 8) - x) * BB[i] * dav1d_sgr_one_by_x[n - 1] + (1 << 11)) >> 12;
+            BB[i] = x;
+        }
+        AA += step * REST_UNIT_STRIDE;
+        BB += step * REST_UNIT_STRIDE;
+    }
+
+    src += 3 * REST_UNIT_STRIDE + 3;
+    if (n == 25) {
+        int j = 0;
+#define SIX_NEIGHBORS(P, i)\
+    ((P[i - REST_UNIT_STRIDE]     + P[i + REST_UNIT_STRIDE]) * 6 +   \
+     (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] +    \
+      P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 5)
+        for (; j < h - 1; j+=2) {
+            for (int i = 0; i < w; i++) {
+                const int32_t a = SIX_NEIGHBORS(B, i);
+                const int32_t b = SIX_NEIGHBORS(A, i);
+                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
+            }
+            dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
+            src += REST_UNIT_STRIDE;
+            B += REST_UNIT_STRIDE;
+            A += REST_UNIT_STRIDE;
+            for (int i = 0; i < w; i++) {
+                const int32_t a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
+                const int32_t b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
+                dst[i] = (a * src[i] + b + (1 << 7)) >> 8;
+            }
+            dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
+            src += REST_UNIT_STRIDE;
+            B += REST_UNIT_STRIDE;
+            A += REST_UNIT_STRIDE;
+        }
+        if (j + 1 == h) { // Last row, when number of rows is odd
+            for (int i = 0; i < w; i++) {
+                const int32_t a = SIX_NEIGHBORS(B, i);
+                const int32_t b = SIX_NEIGHBORS(A, i);
+                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
+            }
+        }
+#undef SIX_NEIGHBORS
+    } else {
+#define EIGHT_NEIGHBORS(P, i)\
+    ((P[i] + P[i - 1] + P[i + 1] + P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 4 + \
+     (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] +                           \
+      P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 3)
+        for (int j = 0; j < h; j++) {
+            for (int i = 0; i < w; i++) {
+                const int32_t a = EIGHT_NEIGHBORS(B, i);
+                const int32_t b = EIGHT_NEIGHBORS(A, i);
+                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
+            }
+            dst += 384;
+            src += REST_UNIT_STRIDE;
+            B += REST_UNIT_STRIDE;
+            A += REST_UNIT_STRIDE;
+        }
+    }
+#undef EIGHT_NEIGHBORS
+}
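The SIX_NEIGHBORS/EIGHT_NEIGHBORS macros above are fixed 3x3 weightings of the A/B planes around each position. A minimal sketch of the EIGHT_NEIGHBORS pattern on a plain int plane (helper name and signature are illustrative only):

#include <stddef.h>

// Weighted 3x3 sum: centre cross weighted 4, corners weighted 3
// (weights sum to 32), matching the EIGHT_NEIGHBORS macro above.
static int eight_neighbors(const int *const p, const ptrdiff_t stride)
{
    return (p[0] + p[-1] + p[1] + p[-stride] + p[stride]) * 4 +
           (p[-1 - stride] + p[-1 + stride] +
            p[ 1 - stride] + p[ 1 + stride]) * 3;
}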
+
+static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
+                         const pixel (*const left)[4],
+                         const pixel *lpf, const ptrdiff_t lpf_stride,
+                         const int w, const int h, const int sgr_idx,
+                         const int16_t sgr_w[2], const enum LrEdgeFlags edges)
+{
+    // The selfguided filter is applied to a maximum stripe height of 64, plus
+    // 3 pixels of padding above and below
+    pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+
+    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
+
+    // Selfguided filter outputs to a maximum stripe height of 64 and a
+    // maximum restoration width of 384 (256 * 1.5)
+    int32_t dst[64 * 384];
+
+    // r0 and r1 cannot both be zero
+    if (!dav1d_sgr_params[sgr_idx][0]) {
+        const int s1 = dav1d_sgr_params[sgr_idx][3];
+        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, s1);
+        const int w1 = (1 << 7) - sgr_w[1];
+        for (int j = 0; j < h; j++) {
+            for (int i = 0; i < w; i++) {
+                const int32_t u = (p[i] << 4);
+                const int32_t v = (u << 7) + w1 * (dst[j * 384 + i] - u);
+                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
+            }
+            p += PXSTRIDE(p_stride);
+        }
+    } else if (!dav1d_sgr_params[sgr_idx][1]) {
+        const int s0 = dav1d_sgr_params[sgr_idx][2];
+        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0);
+        const int w0 = sgr_w[0];
+        for (int j = 0; j < h; j++) {
+            for (int i = 0; i < w; i++) {
+                const int32_t u = (p[i] << 4);
+                const int32_t v = (u << 7) + w0 * (dst[j * 384 + i] - u);
+                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
+            }
+            p += PXSTRIDE(p_stride);
+        }
+    } else {
+        int32_t dst1[64 * 384];
+        const int s0 = dav1d_sgr_params[sgr_idx][2];
+        const int s1 = dav1d_sgr_params[sgr_idx][3];
+        const int w0 = sgr_w[0];
+        const int w1 = (1 << 7) - w0 - sgr_w[1];
+        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0);
+        selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, s1);
+        for (int j = 0; j < h; j++) {
+            for (int i = 0; i < w; i++) {
+                const int32_t u = (p[i] << 4);
+                const int32_t v = (u << 7) + w0 * (dst[j * 384 + i] - u) +
+                                  w1 * (dst1[j * 384 + i] - u);
+                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
+            }
+            p += PXSTRIDE(p_stride);
+        }
+    }
+}
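The per-pixel blend above is easier to read with the scalings made explicit: u = p << 4 puts the source pixel on the same << 4 scale as the filter output, v = (u << 7) + w * (dst - u) is a 1/128-weighted mix of filtered and unfiltered values, and the final (v + (1 << 10)) >> 11 removes both the << 4 and << 7 factors. A worked example with hypothetical values:

// Hypothetical values: p = 100, filter output dst = 1664 (104 on the << 4 scale),
// weight w1 = 32 (out of 128):
//   u = 100 << 4                          = 1600
//   v = (1600 << 7) + 32 * (1664 - 1600)  = 204800 + 2048 = 206848
//   out = (206848 + 1024) >> 11           = 101
// i.e. 100 blended a quarter of the way towards 104.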
+
+void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) {
+    c->wiener = wiener_c;
+    c->selfguided = selfguided_c;
+
+#if HAVE_ASM && ARCH_X86 && BITDEPTH == 8
+    bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
+#endif
+}
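This init function is why the file is templated over BITDEPTH: the same source is compiled once per bit depth, and bitfn() gives each build a distinct symbol. A rough, simplified sketch of how such a template header can look (an assumption for illustration, not a copy of dav1d's bitdepth.h):

#include <stdint.h>

#if BITDEPTH == 8
typedef uint8_t  pixel;
typedef int16_t  coef;
#define bitfn(name) name##_8bpc
#else
typedef uint16_t pixel;
typedef int32_t  coef;
#define bitfn(name) name##_16bpc
#endif

// The *_tmpl.c translation unit is then built twice, e.g. with -DBITDEPTH=8
// and -DBITDEPTH=16, yielding dav1d_loop_restoration_dsp_init_8bpc() and a
// matching _16bpc variant.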
--- a/src/lr_apply.c
+++ /dev/null
@@ -1,296 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <stdio.h>
-
-#include "common/intops.h"
-
-#include "src/lr_apply.h"
-
-
-enum LrRestorePlanes {
-    LR_RESTORE_Y = 1 << 0,
-    LR_RESTORE_U = 1 << 1,
-    LR_RESTORE_V = 1 << 2,
-};
-
-// The loop filter buffer stores 12 rows of pixels. A superblock block will
-// contain at most 2 stripes. Each stripe requires 4 rows pixels (2 above
-// and 2 below) the final 4 rows are used to swap the bottom of the last
-// stripe with the top of the next super block row.
-static void backup_lpf(pixel *dst, ptrdiff_t dst_stride,
-                       const pixel *src, ptrdiff_t src_stride,
-                       const int ss_ver, const int sb128,
-                       int row, const int row_h, const int w)
-{
-    src_stride = PXSTRIDE(src_stride);
-    dst_stride = PXSTRIDE(dst_stride);
-
-    // The first stripe of the frame is shorter by 8 luma pixel rows.
-    int stripe_h = (64 - 8 * !row) >> ss_ver;
-
-    if (row) {
-        const int top = 4 << sb128;
-        // Copy the top part of the stored loop filtered pixels from the
-        // previous sb row needed above the first stripe of this sb row.
-        pixel_copy(&dst[dst_stride *  0], &dst[dst_stride *  top], w);
-        pixel_copy(&dst[dst_stride *  1], &dst[dst_stride * (top + 1)], w);
-        pixel_copy(&dst[dst_stride *  2], &dst[dst_stride * (top + 2)], w);
-        pixel_copy(&dst[dst_stride *  3], &dst[dst_stride * (top + 3)], w);
-    }
-
-    dst += 4 * dst_stride;
-    src += (stripe_h - 2) * src_stride;
-
-    for (; row + stripe_h <= row_h; row += stripe_h) {
-        for (int i = 0; i < 4; i++) {
-            pixel_copy(dst, src, w);
-            dst += dst_stride;
-            src += src_stride;
-        }
-        stripe_h = 64 >> ss_ver;
-        src += (stripe_h - 4) * src_stride;
-    }
-}
-
-void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
-                               /*const*/ pixel *const src[3], const int sby)
-{
-    const ptrdiff_t offset = 8 * !!sby;
-    const ptrdiff_t *const src_stride = f->cur.p.stride;
-
-    // TODO Also check block level restore type to reduce copying.
-    const int restore_planes =
-        ((f->frame_hdr.restoration.type[0] != RESTORATION_NONE) << 0) +
-        ((f->frame_hdr.restoration.type[1] != RESTORATION_NONE) << 1) +
-        ((f->frame_hdr.restoration.type[2] != RESTORATION_NONE) << 2);
-
-    if (restore_planes & LR_RESTORE_Y) {
-        const int h = f->bh << 2;
-        const int w = f->bw << 2;
-        const int row_h = imin((sby + 1) << (6 + f->seq_hdr.sb128), h);
-        const int y_stripe = (sby << (6 + f->seq_hdr.sb128)) - offset;
-        backup_lpf(f->lf.lr_lpf_line_ptr[0], sizeof(pixel) * f->b4_stride * 4,
-                   src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
-                   0, f->seq_hdr.sb128, y_stripe, row_h, w);
-    }
-    if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
-        const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-        const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-        const int h = f->bh << (2 - ss_ver);
-        const int w = f->bw << (2 - ss_hor);
-        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr.sb128), h);
-        const ptrdiff_t offset_uv = offset >> ss_ver;
-        const int y_stripe =
-            (sby << ((6 - ss_ver) + f->seq_hdr.sb128)) - offset_uv;
-
-        if (restore_planes & LR_RESTORE_U) {
-            backup_lpf(f->lf.lr_lpf_line_ptr[1], sizeof(pixel) * f->b4_stride * 4,
-                       src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
-                       ss_ver, f->seq_hdr.sb128, y_stripe, row_h, w);
-        }
-        if (restore_planes & LR_RESTORE_V) {
-            backup_lpf(f->lf.lr_lpf_line_ptr[2], sizeof(pixel) * f->b4_stride * 4,
-                       src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
-                       ss_ver, f->seq_hdr.sb128, y_stripe, row_h, w);
-        }
-    }
-}
-
-
-static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
-                      const pixel (*left)[4], int x, int y,
-                      const int plane, const int unit_w, const int row_h,
-                      const Av1RestorationUnit *const lr, enum LrEdgeFlags edges)
-{
-    const Dav1dDSPContext *const dsp = f->dsp;
-    const int chroma = !!plane;
-    const int ss_ver = chroma & (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
-    const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM);
-    const pixel *lpf = f->lf.lr_lpf_line_ptr[plane] + x;
-    const ptrdiff_t p_stride = f->cur.p.stride[chroma];
-    const ptrdiff_t lpf_stride = sizeof(pixel) * f->b4_stride * 4;
-
-    // The first stripe of the frame is shorter by 8 luma pixel rows.
-    int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
-
-    // FIXME [8] might be easier for SIMD
-    int16_t filterh[7], filterv[7];
-    if (lr->type == RESTORATION_WIENER) {
-        filterh[0] = filterh[6] = lr->filter_h[0];
-        filterh[1] = filterh[5] = lr->filter_h[1];
-        filterh[2] = filterh[4] = lr->filter_h[2];
-        filterh[3] = -((filterh[0] + filterh[1] + filterh[2]) * 2);
-
-        filterv[0] = filterv[6] = lr->filter_v[0];
-        filterv[1] = filterv[5] = lr->filter_v[1];
-        filterv[2] = filterv[4] = lr->filter_v[2];
-        filterv[3] = -((filterv[0] + filterv[1] + filterv[2]) * 2);
-    }
-
-    while (y + stripe_h <= row_h) {
-        // TODO Look into getting rid of the this if
-        if (y + stripe_h == row_h) {
-            edges &= ~LR_HAVE_BOTTOM;
-        } else {
-            edges |= LR_HAVE_BOTTOM;
-        }
-        if (lr->type == RESTORATION_WIENER) {
-            dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
-                           filterh, filterv, edges);
-        } else {
-            assert(lr->type == RESTORATION_SGRPROJ);
-            dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
-                               lr->sgr_idx, lr->sgr_weights, edges);
-        }
-
-        left += stripe_h;
-        y += stripe_h;
-        if (y + stripe_h > row_h && sbrow_has_bottom) break;
-        p += stripe_h * PXSTRIDE(p_stride);
-        edges |= LR_HAVE_TOP;
-        stripe_h = imin(64 >> ss_ver, row_h - y);
-        if (stripe_h == 0) break;
-        lpf += 4 * PXSTRIDE(lpf_stride);
-    }
-}
-
-static void backup4xU(pixel (*dst)[4], const pixel *src, const ptrdiff_t src_stride,
-                      int u)
-{
-    for (; u > 0; u--, dst++, src += PXSTRIDE(src_stride))
-        pixel_copy(dst, src, 4);
-}
-
-static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
-                     const int w, const int h, const int row_h, const int plane)
-{
-    const int chroma = !!plane;
-    const int ss_ver = chroma & (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
-    const int ss_hor = chroma & (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444);
-    const ptrdiff_t p_stride = f->cur.p.stride[chroma];
-
-    const int unit_size_log2 = f->frame_hdr.restoration.unit_size[!!plane];
-    const int unit_size = 1 << unit_size_log2;
-    const int half_unit_size = unit_size >> 1;
-    const int max_unit_size = unit_size + half_unit_size;
-
-    // Y coordinate of the sbrow (y is 8 luma pixel rows above row_y)
-    const int row_y = y + ((8 >> ss_ver) * !!y);
-
-    // FIXME This is an ugly hack to lookup the proper AV1Filter unit for
-    // chroma planes. Question: For Multithreaded decoding, is it better
-    // to store the chroma LR information with collocated Luma information?
-    // In other words. For a chroma restoration unit locate at 128,128 and
-    // with a 4:2:0 chroma subsampling, do we store the filter information at
-    // the AV1Filter unit located at (128,128) or (256,256)
-    // TODO Support chroma subsampling.
-    const int shift_ver = 7 - ss_ver;
-    const int shift_hor = 7 - ss_hor;
-
-    int ruy = (row_y >> unit_size_log2);
-    // Merge last restoration unit if its height is < half_unit_size
-    if (ruy > 0) ruy -= (ruy << unit_size_log2) + half_unit_size > h;
-
-    // The first stripe of the frame is shorter by 8 luma pixel rows.
-    const int filter_h =
-        imin(((1 << (6 + f->seq_hdr.sb128)) - 8 * !y) >> ss_ver, h - y);
-
-    pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
-
-    int unit_w = unit_size, bit = 0;
-
-    enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) |
-                             (row_h < h ? LR_HAVE_BOTTOM : 0);
-
-    for (int x = 0, rux = 0; x < w; x+= unit_w, rux++, edges |= LR_HAVE_LEFT, bit ^= 1) {
-        // TODO Clean up this if statement.
-        if (x + max_unit_size > w) {
-            unit_w = w - x;
-            edges &= ~LR_HAVE_RIGHT;
-        } else {
-            edges |= LR_HAVE_RIGHT;
-        }
-
-        // Based on the position of the restoration unit, find the corresponding
-        // AV1Filter unit.
-        const int unit_idx = ((ruy & 16) >> 3) + ((rux & 16) >> 4);
-        const Av1RestorationUnit *const lr =
-            &f->lf.mask[(((ruy << (unit_size_log2)) >> shift_ver) * f->sb128w) +
-                        (x >> shift_hor)].lr[plane][unit_idx];
-
-        // FIXME Don't backup if the next restoration unit is RESTORE_NONE
-        // This also requires not restoring in the same conditions.
-        if (edges & LR_HAVE_RIGHT) {
-            backup4xU(pre_lr_border[bit], p + unit_w - 4, p_stride, filter_h);
-        }
-        if (lr->type != RESTORATION_NONE) {
-            lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr, edges);
-        }
-        p += unit_w;
-    }
-}
-
-void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
-                            const int sby)
-{
-    const ptrdiff_t offset_y = 8 * !!sby;
-    const ptrdiff_t *const dst_stride = f->cur.p.stride;
-
-    const int restore_planes =
-        ((f->frame_hdr.restoration.type[0] != RESTORATION_NONE) << 0) +
-        ((f->frame_hdr.restoration.type[1] != RESTORATION_NONE) << 1) +
-        ((f->frame_hdr.restoration.type[2] != RESTORATION_NONE) << 2);
-
-    if (restore_planes & LR_RESTORE_Y) {
-        const int h = f->cur.p.p.h;
-        const int w = f->cur.p.p.w;
-        const int row_h = imin((sby + 1) << (6 + f->seq_hdr.sb128), h);
-        const int y_stripe = (sby << (6 + f->seq_hdr.sb128)) - offset_y;
-        lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
-                 h, row_h, 0);
-    }
-    if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
-        const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-        const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-        const int h = (f->cur.p.p.h + ss_ver) >> ss_ver;
-        const int w = (f->cur.p.p.w + ss_hor) >> ss_hor;
-        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr.sb128), h);
-        const ptrdiff_t offset_uv = offset_y >> ss_ver;
-        const int y_stripe =
-            (sby << ((6 - ss_ver) + f->seq_hdr.sb128)) - offset_uv;
-        if (restore_planes & LR_RESTORE_U)
-            lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
-                     w, h, row_h, 1);
-
-        if (restore_planes & LR_RESTORE_V)
-            lr_sbrow(f, dst[2] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
-                     w, h, row_h, 2);
-    }
-}
--- /dev/null
+++ b/src/lr_apply_tmpl.c
@@ -1,0 +1,296 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+
+#include "common/intops.h"
+
+#include "src/lr_apply.h"
+
+
+enum LrRestorePlanes {
+    LR_RESTORE_Y = 1 << 0,
+    LR_RESTORE_U = 1 << 1,
+    LR_RESTORE_V = 1 << 2,
+};
+
+// The loop filter buffer stores 12 rows of pixels. A superblock will
+// contain at most 2 stripes. Each stripe requires 4 rows of pixels (2 above
+// and 2 below); the final 4 rows are used to swap the bottom of the last
+// stripe with the top of the next superblock row.
+static void backup_lpf(pixel *dst, ptrdiff_t dst_stride,
+                       const pixel *src, ptrdiff_t src_stride,
+                       const int ss_ver, const int sb128,
+                       int row, const int row_h, const int w)
+{
+    src_stride = PXSTRIDE(src_stride);
+    dst_stride = PXSTRIDE(dst_stride);
+
+    // The first stripe of the frame is shorter by 8 luma pixel rows.
+    int stripe_h = (64 - 8 * !row) >> ss_ver;
+
+    if (row) {
+        const int top = 4 << sb128;
+        // Copy the top part of the stored loop filtered pixels from the
+        // previous sb row needed above the first stripe of this sb row.
+        pixel_copy(&dst[dst_stride *  0], &dst[dst_stride *  top], w);
+        pixel_copy(&dst[dst_stride *  1], &dst[dst_stride * (top + 1)], w);
+        pixel_copy(&dst[dst_stride *  2], &dst[dst_stride * (top + 2)], w);
+        pixel_copy(&dst[dst_stride *  3], &dst[dst_stride * (top + 3)], w);
+    }
+
+    dst += 4 * dst_stride;
+    src += (stripe_h - 2) * src_stride;
+
+    for (; row + stripe_h <= row_h; row += stripe_h) {
+        for (int i = 0; i < 4; i++) {
+            pixel_copy(dst, src, w);
+            dst += dst_stride;
+            src += src_stride;
+        }
+        stripe_h = 64 >> ss_ver;
+        src += (stripe_h - 4) * src_stride;
+    }
+}
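Following the pointer arithmetic above: per stripe, the 2 rows just above the stripe boundary and the 2 rows just below it are saved. A small sketch that only records which source rows (relative to the src pointer passed in) would be copied on the first call of a frame, for luma (ss_ver = 0); illustrative only:

#include <stdio.h>

static void backed_up_rows(int row, const int row_h)
{
    int stripe_h = 64 - 8 * !row;   // first stripe of the frame is 8 rows shorter
    int src_row = stripe_h - 2;     // start 2 rows above the stripe boundary
    for (; row + stripe_h <= row_h; row += stripe_h) {
        for (int i = 0; i < 4; i++)
            printf("save row %d\n", src_row + i);
        stripe_h = 64;
        src_row += stripe_h;        // next boundary is 64 rows further down
    }
}

// backed_up_rows(0, 128) prints rows 54..57 and 118..121: the last 2 rows of
// each stripe plus the first 2 rows of the following one.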
+
+void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
+                               /*const*/ pixel *const src[3], const int sby)
+{
+    const ptrdiff_t offset = 8 * !!sby;
+    const ptrdiff_t *const src_stride = f->cur.p.stride;
+
+    // TODO Also check block level restore type to reduce copying.
+    const int restore_planes =
+        ((f->frame_hdr.restoration.type[0] != RESTORATION_NONE) << 0) +
+        ((f->frame_hdr.restoration.type[1] != RESTORATION_NONE) << 1) +
+        ((f->frame_hdr.restoration.type[2] != RESTORATION_NONE) << 2);
+
+    if (restore_planes & LR_RESTORE_Y) {
+        const int h = f->bh << 2;
+        const int w = f->bw << 2;
+        const int row_h = imin((sby + 1) << (6 + f->seq_hdr.sb128), h);
+        const int y_stripe = (sby << (6 + f->seq_hdr.sb128)) - offset;
+        backup_lpf(f->lf.lr_lpf_line_ptr[0], sizeof(pixel) * f->b4_stride * 4,
+                   src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
+                   0, f->seq_hdr.sb128, y_stripe, row_h, w);
+    }
+    if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
+        const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int h = f->bh << (2 - ss_ver);
+        const int w = f->bw << (2 - ss_hor);
+        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr.sb128), h);
+        const ptrdiff_t offset_uv = offset >> ss_ver;
+        const int y_stripe =
+            (sby << ((6 - ss_ver) + f->seq_hdr.sb128)) - offset_uv;
+
+        if (restore_planes & LR_RESTORE_U) {
+            backup_lpf(f->lf.lr_lpf_line_ptr[1], sizeof(pixel) * f->b4_stride * 4,
+                       src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
+                       ss_ver, f->seq_hdr.sb128, y_stripe, row_h, w);
+        }
+        if (restore_planes & LR_RESTORE_V) {
+            backup_lpf(f->lf.lr_lpf_line_ptr[2], sizeof(pixel) * f->b4_stride * 4,
+                       src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
+                       ss_ver, f->seq_hdr.sb128, y_stripe, row_h, w);
+        }
+    }
+}
+
+
+static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
+                      const pixel (*left)[4], int x, int y,
+                      const int plane, const int unit_w, const int row_h,
+                      const Av1RestorationUnit *const lr, enum LrEdgeFlags edges)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const int chroma = !!plane;
+    const int ss_ver = chroma & (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
+    const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM);
+    const pixel *lpf = f->lf.lr_lpf_line_ptr[plane] + x;
+    const ptrdiff_t p_stride = f->cur.p.stride[chroma];
+    const ptrdiff_t lpf_stride = sizeof(pixel) * f->b4_stride * 4;
+
+    // The first stripe of the frame is shorter by 8 luma pixel rows.
+    int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
+
+    // FIXME [8] might be easier for SIMD
+    int16_t filterh[7], filterv[7];
+    if (lr->type == RESTORATION_WIENER) {
+        filterh[0] = filterh[6] = lr->filter_h[0];
+        filterh[1] = filterh[5] = lr->filter_h[1];
+        filterh[2] = filterh[4] = lr->filter_h[2];
+        filterh[3] = -((filterh[0] + filterh[1] + filterh[2]) * 2);
+
+        filterv[0] = filterv[6] = lr->filter_v[0];
+        filterv[1] = filterv[5] = lr->filter_v[1];
+        filterv[2] = filterv[4] = lr->filter_v[2];
+        filterv[3] = -((filterv[0] + filterv[1] + filterv[2]) * 2);
+    }
+
+    while (y + stripe_h <= row_h) {
+        // TODO Look into getting rid of this if
+        if (y + stripe_h == row_h) {
+            edges &= ~LR_HAVE_BOTTOM;
+        } else {
+            edges |= LR_HAVE_BOTTOM;
+        }
+        if (lr->type == RESTORATION_WIENER) {
+            dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
+                           filterh, filterv, edges);
+        } else {
+            assert(lr->type == RESTORATION_SGRPROJ);
+            dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
+                               lr->sgr_idx, lr->sgr_weights, edges);
+        }
+
+        left += stripe_h;
+        y += stripe_h;
+        if (y + stripe_h > row_h && sbrow_has_bottom) break;
+        p += stripe_h * PXSTRIDE(p_stride);
+        edges |= LR_HAVE_TOP;
+        stripe_h = imin(64 >> ss_ver, row_h - y);
+        if (stripe_h == 0) break;
+        lpf += 4 * PXSTRIDE(lpf_stride);
+    }
+}
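Only 3 Wiener taps are coded per direction; the 7-tap arrays above are rebuilt by mirroring them and choosing the centre tap as -2 * (t0 + t1 + t2), so the explicit taps sum to zero. In AV1 the nominal centre tap is 128 - 2 * (t0 + t1 + t2), so the remaining unit (DC) gain is presumably applied inside the wiener kernel itself, which is not part of this hunk. A tiny sketch of the construction:

#include <assert.h>
#include <stdint.h>

// Rebuild the symmetric 7-tap Wiener filter from the 3 coded taps t[0..2].
static void build_wiener_taps(int16_t f[7], const int16_t t[3])
{
    f[0] = f[6] = t[0];
    f[1] = f[5] = t[1];
    f[2] = f[4] = t[2];
    f[3] = -2 * (t[0] + t[1] + t[2]);
    // By construction the explicit taps cancel out.
    assert(f[0] + f[1] + f[2] + f[3] + f[4] + f[5] + f[6] == 0);
}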
+
+static void backup4xU(pixel (*dst)[4], const pixel *src, const ptrdiff_t src_stride,
+                      int u)
+{
+    for (; u > 0; u--, dst++, src += PXSTRIDE(src_stride))
+        pixel_copy(dst, src, 4);
+}
+
+static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
+                     const int w, const int h, const int row_h, const int plane)
+{
+    const int chroma = !!plane;
+    const int ss_ver = chroma & (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
+    const int ss_hor = chroma & (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444);
+    const ptrdiff_t p_stride = f->cur.p.stride[chroma];
+
+    const int unit_size_log2 = f->frame_hdr.restoration.unit_size[!!plane];
+    const int unit_size = 1 << unit_size_log2;
+    const int half_unit_size = unit_size >> 1;
+    const int max_unit_size = unit_size + half_unit_size;
+
+    // Y coordinate of the sbrow (y is 8 luma pixel rows above row_y)
+    const int row_y = y + ((8 >> ss_ver) * !!y);
+
+    // FIXME This is an ugly hack to look up the proper AV1Filter unit for
+    // chroma planes. Question: for multithreaded decoding, is it better
+    // to store the chroma LR information with the co-located luma information?
+    // In other words, for a chroma restoration unit located at (128,128)
+    // with 4:2:0 chroma subsampling, do we store the filter information at
+    // the AV1Filter unit located at (128,128) or at (256,256)?
+    // TODO Support chroma subsampling.
+    const int shift_ver = 7 - ss_ver;
+    const int shift_hor = 7 - ss_hor;
+
+    int ruy = (row_y >> unit_size_log2);
+    // Merge last restoration unit if its height is < half_unit_size
+    if (ruy > 0) ruy -= (ruy << unit_size_log2) + half_unit_size > h;
+
+    // The first stripe of the frame is shorter by 8 luma pixel rows.
+    const int filter_h =
+        imin(((1 << (6 + f->seq_hdr.sb128)) - 8 * !y) >> ss_ver, h - y);
+
+    pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
+
+    int unit_w = unit_size, bit = 0;
+
+    enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) |
+                             (row_h < h ? LR_HAVE_BOTTOM : 0);
+
+    for (int x = 0, rux = 0; x < w; x+= unit_w, rux++, edges |= LR_HAVE_LEFT, bit ^= 1) {
+        // TODO Clean up this if statement.
+        if (x + max_unit_size > w) {
+            unit_w = w - x;
+            edges &= ~LR_HAVE_RIGHT;
+        } else {
+            edges |= LR_HAVE_RIGHT;
+        }
+
+        // Based on the position of the restoration unit, find the corresponding
+        // AV1Filter unit.
+        const int unit_idx = ((ruy & 16) >> 3) + ((rux & 16) >> 4);
+        const Av1RestorationUnit *const lr =
+            &f->lf.mask[(((ruy << (unit_size_log2)) >> shift_ver) * f->sb128w) +
+                        (x >> shift_hor)].lr[plane][unit_idx];
+
+        // FIXME Don't backup if the next restoration unit is RESTORE_NONE
+        // This also requires not restoring in the same conditions.
+        if (edges & LR_HAVE_RIGHT) {
+            backup4xU(pre_lr_border[bit], p + unit_w - 4, p_stride, filter_h);
+        }
+        if (lr->type != RESTORATION_NONE) {
+            lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr, edges);
+        }
+        p += unit_w;
+    }
+}
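The width handling in the loop above mirrors the row merging: once fewer than one and a half units remain, the last unit absorbs the rest of the row. A small trace with hypothetical numbers (the helper is illustrative only):

#include <stdio.h>

static void trace_unit_widths(const int w, const int unit_size)
{
    const int max_unit_size = unit_size + (unit_size >> 1);
    int unit_w = unit_size;
    for (int x = 0; x < w; x += unit_w) {
        if (x + max_unit_size > w) unit_w = w - x; // last unit absorbs the tail
        printf("x=%d unit_w=%d\n", x, unit_w);
    }
}

// trace_unit_widths(200, 64) prints unit widths 64, 64, 72: the tail never
// becomes a unit narrower than half a restoration unit.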
+
+void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
+                            const int sby)
+{
+    const ptrdiff_t offset_y = 8 * !!sby;
+    const ptrdiff_t *const dst_stride = f->cur.p.stride;
+
+    const int restore_planes =
+        ((f->frame_hdr.restoration.type[0] != RESTORATION_NONE) << 0) +
+        ((f->frame_hdr.restoration.type[1] != RESTORATION_NONE) << 1) +
+        ((f->frame_hdr.restoration.type[2] != RESTORATION_NONE) << 2);
+
+    if (restore_planes & LR_RESTORE_Y) {
+        const int h = f->cur.p.p.h;
+        const int w = f->cur.p.p.w;
+        const int row_h = imin((sby + 1) << (6 + f->seq_hdr.sb128), h);
+        const int y_stripe = (sby << (6 + f->seq_hdr.sb128)) - offset_y;
+        lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
+                 h, row_h, 0);
+    }
+    if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
+        const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int h = (f->cur.p.p.h + ss_ver) >> ss_ver;
+        const int w = (f->cur.p.p.w + ss_hor) >> ss_hor;
+        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr.sb128), h);
+        const ptrdiff_t offset_uv = offset_y >> ss_ver;
+        const int y_stripe =
+            (sby << ((6 - ss_ver) + f->seq_hdr.sb128)) - offset_uv;
+        if (restore_planes & LR_RESTORE_U)
+            lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
+                     w, h, row_h, 1);
+
+        if (restore_planes & LR_RESTORE_V)
+            lr_sbrow(f, dst[2] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
+                     w, h, row_h, 2);
+    }
+}
--- a/src/mc.c
+++ /dev/null
@@ -1,542 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "common/attributes.h"
-#include "common/intops.h"
-
-#include "src/mc.h"
-#include "src/tables.h"
-
-static NOINLINE void
-put_c(pixel *dst, const ptrdiff_t dst_stride,
-      const pixel *src, const ptrdiff_t src_stride, const int w, int h)
-{
-    do {
-        pixel_copy(dst, src, w);
-
-        dst += dst_stride;
-        src += src_stride;
-    } while (--h);
-}
-
-static NOINLINE void
-prep_c(coef *tmp, const pixel *src, const ptrdiff_t src_stride,
-       const int w, int h)
-{
-    do {
-        for (int x = 0; x < w; x++)
-            tmp[x] = src[x] << 4;
-
-        tmp += w;
-        src += src_stride;
-    } while (--h);
-}
-
-#define FILTER_8TAP(src, x, F, stride) \
-    (F[0] * src[x + -3 * stride] + \
-     F[1] * src[x + -2 * stride] + \
-     F[2] * src[x + -1 * stride] + \
-     F[3] * src[x + +0 * stride] + \
-     F[4] * src[x + +1 * stride] + \
-     F[5] * src[x + +2 * stride] + \
-     F[6] * src[x + +3 * stride] + \
-     F[7] * src[x + +4 * stride])
-
-#define FILTER_8TAP_RND(src, x, F, stride, sh) \
-    ((FILTER_8TAP(src, x, F, stride) + ((1 << sh) >> 1)) >> sh)
-
-#define FILTER_8TAP_CLIP(src, x, F, stride, sh) \
-    iclip_pixel(FILTER_8TAP_RND(src, x, F, stride, sh))
-
-#define GET_FILTERS() \
-    const int8_t *const fh = !mx ? NULL : w > 4 ? \
-        dav1d_mc_subpel_filters[filter_type & 3][mx - 1] : \
-        dav1d_mc_subpel_filters[3 + (filter_type & 1)][mx - 1]; \
-    const int8_t *const fv = !my ? NULL : h > 4 ? \
-        dav1d_mc_subpel_filters[filter_type >> 2][my - 1] : \
-        dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][my - 1]; \
-
-static NOINLINE void
-put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
-           const pixel *src, ptrdiff_t src_stride,
-           const int w, int h, const int mx, const int my,
-           const int filter_type)
-{
-    GET_FILTERS();
-    dst_stride = PXSTRIDE(dst_stride);
-    src_stride = PXSTRIDE(src_stride);
-
-    if (fh) {
-        if (fv) {
-            int tmp_h = h + 7;
-            coef mid[128 * 135], *mid_ptr = mid;
-
-            src -= src_stride * 3;
-            do {
-                for (int x = 0; x < w; x++)
-                    mid_ptr[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
-
-                mid_ptr += 128;
-                src += src_stride;
-            } while (--tmp_h);
-
-            mid_ptr = mid + 128 * 3;
-            do {
-                for (int x = 0; x < w; x++)
-                    dst[x] = FILTER_8TAP_CLIP(mid_ptr, x, fv, 128, 10);
-
-                mid_ptr += 128;
-                dst += dst_stride;
-            } while (--h);
-        } else {
-            do {
-                for (int x = 0; x < w; x++) {
-                    const int px = FILTER_8TAP_RND(src, x, fh, 1, 2);
-                    dst[x] = iclip_pixel((px + 8) >> 4);
-                }
-
-                dst += dst_stride;
-                src += src_stride;
-            } while (--h);
-        }
-    } else if (fv) {
-        do {
-            for (int x = 0; x < w; x++)
-                dst[x] = FILTER_8TAP_CLIP(src, x, fv, src_stride, 6);
-
-            dst += dst_stride;
-            src += src_stride;
-        } while (--h);
-    } else
-        put_c(dst, dst_stride, src, src_stride, w, h);
-}
-
-static NOINLINE void
-prep_8tap_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,
-            const int w, int h, const int mx, const int my,
-            const int filter_type)
-{
-    GET_FILTERS();
-    src_stride = PXSTRIDE(src_stride);
-
-    if (fh) {
-        if (fv) {
-            int tmp_h = h + 7;
-            coef mid[128 * 135], *mid_ptr = mid;
-
-            src -= src_stride * 3;
-            do {
-                for (int x = 0; x < w; x++)
-                    mid_ptr[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
-
-                mid_ptr += 128;
-                src += src_stride;
-            } while (--tmp_h);
-
-            mid_ptr = mid + 128 * 3;
-            do {
-                for (int x = 0; x < w; x++)
-                    tmp[x] = FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6);
-
-                mid_ptr += 128;
-                tmp += w;
-            } while (--h);
-        } else {
-            do {
-                for (int x = 0; x < w; x++)
-                    tmp[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
-
-                tmp += w;
-                src += src_stride;
-            } while (--h);
-        }
-    } else if (fv) {
-        do {
-            for (int x = 0; x < w; x++)
-                tmp[x] = FILTER_8TAP_RND(src, x, fv, src_stride, 2);
-
-            tmp += w;
-            src += src_stride;
-        } while (--h);
-    } else
-        prep_c(tmp, src, src_stride, w, h);
-}
-
-#define filter_fns(type, type_h, type_v) \
-static void put_8tap_##type##_c(pixel *const dst, \
-                                const ptrdiff_t dst_stride, \
-                                const pixel *const src, \
-                                const ptrdiff_t src_stride, \
-                                const int w, const int h, \
-                                const int mx, const int my) \
-{ \
-    put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \
-               type_h | (type_v << 2)); \
-} \
-static void prep_8tap_##type##_c(coef *const tmp, \
-                                 const pixel *const src, \
-                                 const ptrdiff_t src_stride, \
-                                 const int w, const int h, \
-                                 const int mx, const int my) \
-{ \
-    prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \
-                type_h | (type_v << 2)); \
-}
-
-filter_fns(regular,        FILTER_8TAP_REGULAR, FILTER_8TAP_REGULAR)
-filter_fns(regular_sharp,  FILTER_8TAP_REGULAR, FILTER_8TAP_SHARP)
-filter_fns(regular_smooth, FILTER_8TAP_REGULAR, FILTER_8TAP_SMOOTH)
-filter_fns(smooth,         FILTER_8TAP_SMOOTH,  FILTER_8TAP_SMOOTH)
-filter_fns(smooth_regular, FILTER_8TAP_SMOOTH,  FILTER_8TAP_REGULAR)
-filter_fns(smooth_sharp,   FILTER_8TAP_SMOOTH,  FILTER_8TAP_SHARP)
-filter_fns(sharp,          FILTER_8TAP_SHARP,   FILTER_8TAP_SHARP)
-filter_fns(sharp_regular,  FILTER_8TAP_SHARP,   FILTER_8TAP_REGULAR)
-filter_fns(sharp_smooth,   FILTER_8TAP_SHARP,   FILTER_8TAP_SMOOTH)
-
-#define FILTER_BILIN(src, x, mxy, stride) \
-    (16 * src[x] + (mxy * (src[x + stride] - src[x])))
-
-#define FILTER_BILIN_RND(src, x, mxy, stride, sh) \
-    ((FILTER_BILIN(src, x, mxy, stride) + ((1 << sh) >> 1)) >> sh)
-
-#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
-    iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))
-
-static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride,
-                        const pixel *src, ptrdiff_t src_stride,
-                        const int w, int h, const int mx, const int my)
-{
-    dst_stride = PXSTRIDE(dst_stride);
-    src_stride = PXSTRIDE(src_stride);
-
-    if (mx) {
-        if (my) {
-            coef mid[128 * 129], *mid_ptr = mid;
-            int tmp_h = h + 1;
-
-            do {
-                for (int x = 0; x < w; x++)
-                    mid_ptr[x] = FILTER_BILIN(src, x, mx, 1);
-
-                mid_ptr += 128;
-                src += src_stride;
-            } while (--tmp_h);
-
-            mid_ptr = mid;
-            do {
-                for (int x = 0; x < w; x++)
-                    dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my, 128, 8);
-
-                mid_ptr += 128;
-                dst += dst_stride;
-            } while (--h);
-        } else {
-            do {
-                for (int x = 0; x < w; x++)
-                    dst[x] = FILTER_BILIN_CLIP(src, x, mx, 1, 4);
-
-                dst += dst_stride;
-                src += src_stride;
-            } while (--h);
-        }
-    } else if (my) {
-        do {
-            for (int x = 0; x < w; x++)
-                dst[x] = FILTER_BILIN_CLIP(src, x, my, src_stride, 4);
-
-            dst += dst_stride;
-            src += src_stride;
-        } while (--h);
-    } else
-        put_c(dst, dst_stride, src, src_stride, w, h);
-}
-
-static void prep_bilin_c(coef *tmp,
-                         const pixel *src, ptrdiff_t src_stride,
-                         const int w, int h, const int mx, const int my)
-{
-    src_stride = PXSTRIDE(src_stride);
-
-    if (mx) {
-        if (my) {
-            coef mid[128 * 129], *mid_ptr = mid;
-            int tmp_h = h + 1;
-
-            do {
-                for (int x = 0; x < w; x++)
-                    mid_ptr[x] = FILTER_BILIN(src, x, mx, 1);
-
-                mid_ptr += 128;
-                src += src_stride;
-            } while (--tmp_h);
-
-            mid_ptr = mid;
-            do {
-                for (int x = 0; x < w; x++)
-                    tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 128, 4);
-
-                mid_ptr += 128;
-                tmp += w;
-            } while (--h);
-        } else {
-            do {
-                for (int x = 0; x < w; x++)
-                    tmp[x] = FILTER_BILIN(src, x, mx, 1);
-
-                tmp += w;
-                src += src_stride;
-            } while (--h);
-        }
-    } else if (my) {
-        do {
-            for (int x = 0; x < w; x++)
-                tmp[x] = FILTER_BILIN(src, x, my, src_stride);
-
-            tmp += w;
-            src += src_stride;
-        } while (--h);
-    } else
-        prep_c(tmp, src, src_stride, w, h);
-}
-
-static void avg_c(pixel *dst, const ptrdiff_t dst_stride,
-                  const coef *tmp1, const coef *tmp2, const int w, int h)
-{
-    do {
-        for (int x = 0; x < w; x++)
-            dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + 16) >> 5);
-
-        tmp1 += w;
-        tmp2 += w;
-        dst += PXSTRIDE(dst_stride);
-    } while (--h);
-}
-
-static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
-                    const coef *tmp1, const coef *tmp2, const int w, int h,
-                    const int weight)
-{
-    do {
-        for (int x = 0; x < w; x++)
-            dst[x] = iclip_pixel((tmp1[x] * weight +
-                                  tmp2[x] * (16 - weight) + 128) >> 8);
-
-        tmp1 += w;
-        tmp2 += w;
-        dst += PXSTRIDE(dst_stride);
-    } while (--h);
-}
-
-static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
-                   const coef *tmp1, const coef *tmp2, const int w, int h,
-                   const uint8_t *mask)
-{
-    do {
-        for (int x = 0; x < w; x++)
-            dst[x] = iclip_pixel((tmp1[x] * mask[x] +
-                                  tmp2[x] * (64 - mask[x]) + 512) >> 10);
-
-        tmp1 += w;
-        tmp2 += w;
-        mask += w;
-        dst += PXSTRIDE(dst_stride);
-    } while (--h);
-}
-
-static void blend_c(pixel *dst, const ptrdiff_t dst_stride,
-                    const pixel *tmp, const ptrdiff_t tmp_stride,
-                    const int w, const int h,
-                    const uint8_t *mask, const ptrdiff_t m_stride)
-{
-    for (int y = 0; y < h; y++) {
-        for (int x = 0; x < w; x++) {
-#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
-            dst[x] = blend_px(dst[x], tmp[x], mask[m_stride == 1 ? 0 : x]);
-        }
-        dst += PXSTRIDE(dst_stride);
-        tmp += PXSTRIDE(tmp_stride);
-        mask += m_stride;
-    }
-}
-
-static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
-                     const coef *tmp1, const coef *tmp2, const int w, int h,
-                     uint8_t *mask, const int sign,
-                     const int ss_hor, const int ss_ver)
-{
-    // store mask at 2x2 resolution, i.e. store 2x1 sum for even rows,
-    // and then load this intermediate to calculate final value for odd rows
-    const int rnd = 8 << (BITDEPTH - 8);
-    do {
-        for (int x = 0; x < w; x++) {
-            const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + rnd) >> BITDEPTH), 64);
-            dst[x] = iclip_pixel((tmp1[x] * m +
-                                  tmp2[x] * (64 - m) + 512) >> 10);
-
-            if (ss_hor) {
-                x++;
-
-                const int n = imin(38 + ((abs(tmp1[x] - tmp2[x]) + rnd) >> BITDEPTH), 64);
-                dst[x] = iclip_pixel((tmp1[x] * n +
-                                      tmp2[x] * (64 - n) + 512) >> 10);
-
-                if (h & ss_ver) {
-                    mask[x >> 1] = (m + n + mask[x >> 1] + 2 - sign) >> 2;
-                } else if (ss_ver) {
-                    mask[x >> 1] = m + n;
-                } else {
-                    mask[x >> 1] = (m + n + 1 - sign) >> 1;
-                }
-            } else {
-                mask[x] = m;
-            }
-        }
-
-        tmp1 += w;
-        tmp2 += w;
-        dst += PXSTRIDE(dst_stride);
-        if (!ss_ver || (h & 1)) mask += w >> ss_hor;
-    } while (--h);
-}
-
-#define w_mask_fns(ssn, ss_hor, ss_ver) \
-static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \
-                             const coef *const tmp1, const coef *const tmp2, \
-                             const int w, const int h, uint8_t *mask, \
-                             const int sign) \
-{ \
-    w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, ss_hor, ss_ver); \
-}
-
-w_mask_fns(444, 0, 0);
-w_mask_fns(422, 1, 0);
-w_mask_fns(420, 1, 1);
-
-#undef w_mask_fns
-
-static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
-                              const pixel *src, const ptrdiff_t src_stride,
-                              const int16_t *const abcd, int mx, int my)
-{
-    coef mid[15 * 8], *mid_ptr = mid;
-
-    src -= 3 * PXSTRIDE(src_stride);
-    for (int y = 0; y < 15; y++, mx += abcd[1]) {
-        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
-            const int8_t *const filter =
-                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
-
-            mid_ptr[x] = FILTER_8TAP_RND(src, x, filter, 1, 3);
-        }
-        src += PXSTRIDE(src_stride);
-        mid_ptr += 8;
-    }
-
-    mid_ptr = &mid[3 * 8];
-    for (int y = 0; y < 8; y++, my += abcd[3]) {
-        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
-            const int8_t *const filter =
-                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
-
-            dst[x] = FILTER_8TAP_CLIP(mid_ptr, x, filter, 8, 11);
-        }
-        mid_ptr += 8;
-        dst += PXSTRIDE(dst_stride);
-    }
-}
-
-static void warp_affine_8x8t_c(coef *tmp, const ptrdiff_t tmp_stride,
-                               const pixel *src, const ptrdiff_t src_stride,
-                               const int16_t *const abcd, int mx, int my)
-{
-    coef mid[15 * 8], *mid_ptr = mid;
-
-    src -= 3 * PXSTRIDE(src_stride);
-    for (int y = 0; y < 15; y++, mx += abcd[1]) {
-        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
-            const int8_t *const filter =
-                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
-
-            mid_ptr[x] = FILTER_8TAP_RND(src, x, filter, 1, 3);
-        }
-        src += PXSTRIDE(src_stride);
-        mid_ptr += 8;
-    }
-
-    mid_ptr = &mid[3 * 8];
-    for (int y = 0; y < 8; y++, my += abcd[3]) {
-        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
-            const int8_t *const filter =
-                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
-
-            tmp[x] = FILTER_8TAP_RND(mid_ptr, x, filter, 8, 7);
-        }
-        mid_ptr += 8;
-        tmp += tmp_stride;
-    }
-}
-
-void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
-#define init_mc_fns(type, name) do { \
-    c->mc [type] = put_##name##_c; \
-    c->mct[type] = prep_##name##_c; \
-} while (0)
-
-    init_mc_fns(FILTER_2D_8TAP_REGULAR,        8tap_regular);
-    init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
-    init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp);
-    init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular);
-    init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth);
-    init_mc_fns(FILTER_2D_8TAP_SHARP,          8tap_sharp);
-    init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular);
-    init_mc_fns(FILTER_2D_8TAP_SMOOTH,         8tap_smooth);
-    init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp);
-    init_mc_fns(FILTER_2D_BILINEAR,            bilin);
-
-    c->avg      = avg_c;
-    c->w_avg    = w_avg_c;
-    c->mask     = mask_c;
-    c->blend    = blend_c;
-    c->w_mask[0] = w_mask_444_c;
-    c->w_mask[1] = w_mask_422_c;
-    c->w_mask[2] = w_mask_420_c;
-    c->warp8x8  = warp_affine_8x8_c;
-    c->warp8x8t = warp_affine_8x8t_c;
-
-#if HAVE_ASM
-#if ARCH_AARCH64 || ARCH_ARM
-    bitfn(dav1d_mc_dsp_init_arm)(c);
-#elif ARCH_X86
-    bitfn(dav1d_mc_dsp_init_x86)(c);
-#endif
-#endif
-}
--- /dev/null
+++ b/src/mc_tmpl.c
@@ -1,0 +1,542 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/mc.h"
+#include "src/tables.h"
+
+static NOINLINE void
+put_c(pixel *dst, const ptrdiff_t dst_stride,
+      const pixel *src, const ptrdiff_t src_stride, const int w, int h)
+{
+    do {
+        pixel_copy(dst, src, w);
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+static NOINLINE void
+prep_c(coef *tmp, const pixel *src, const ptrdiff_t src_stride,
+       const int w, int h)
+{
+    do {
+        for (int x = 0; x < w; x++)
+            tmp[x] = src[x] << 4;
+
+        tmp += w;
+        src += src_stride;
+    } while (--h);
+}
+
+#define FILTER_8TAP(src, x, F, stride) \
+    (F[0] * src[x + -3 * stride] + \
+     F[1] * src[x + -2 * stride] + \
+     F[2] * src[x + -1 * stride] + \
+     F[3] * src[x + +0 * stride] + \
+     F[4] * src[x + +1 * stride] + \
+     F[5] * src[x + +2 * stride] + \
+     F[6] * src[x + +3 * stride] + \
+     F[7] * src[x + +4 * stride])
+
+#define FILTER_8TAP_RND(src, x, F, stride, sh) \
+    ((FILTER_8TAP(src, x, F, stride) + ((1 << sh) >> 1)) >> sh)
+
+#define FILTER_8TAP_CLIP(src, x, F, stride, sh) \
+    iclip_pixel(FILTER_8TAP_RND(src, x, F, stride, sh))
+
+#define GET_FILTERS() \
+    const int8_t *const fh = !mx ? NULL : w > 4 ? \
+        dav1d_mc_subpel_filters[filter_type & 3][mx - 1] : \
+        dav1d_mc_subpel_filters[3 + (filter_type & 1)][mx - 1]; \
+    const int8_t *const fv = !my ? NULL : h > 4 ? \
+        dav1d_mc_subpel_filters[filter_type >> 2][my - 1] : \
+        dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][my - 1]; \
+
+static NOINLINE void
+put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
+           const pixel *src, ptrdiff_t src_stride,
+           const int w, int h, const int mx, const int my,
+           const int filter_type)
+{
+    GET_FILTERS();
+    dst_stride = PXSTRIDE(dst_stride);
+    src_stride = PXSTRIDE(src_stride);
+
+    if (fh) {
+        if (fv) {
+            int tmp_h = h + 7;
+            coef mid[128 * 135], *mid_ptr = mid;
+
+            src -= src_stride * 3;
+            do {
+                for (int x = 0; x < w; x++)
+                    mid_ptr[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
+
+                mid_ptr += 128;
+                src += src_stride;
+            } while (--tmp_h);
+
+            mid_ptr = mid + 128 * 3;
+            do {
+                for (int x = 0; x < w; x++)
+                    dst[x] = FILTER_8TAP_CLIP(mid_ptr, x, fv, 128, 10);
+
+                mid_ptr += 128;
+                dst += dst_stride;
+            } while (--h);
+        } else {
+            do {
+                for (int x = 0; x < w; x++) {
+                    const int px = FILTER_8TAP_RND(src, x, fh, 1, 2);
+                    dst[x] = iclip_pixel((px + 8) >> 4);
+                }
+
+                dst += dst_stride;
+                src += src_stride;
+            } while (--h);
+        }
+    } else if (fv) {
+        do {
+            for (int x = 0; x < w; x++)
+                dst[x] = FILTER_8TAP_CLIP(src, x, fv, src_stride, 6);
+
+            dst += dst_stride;
+            src += src_stride;
+        } while (--h);
+    } else
+        put_c(dst, dst_stride, src, src_stride, w, h);
+}
+
+static NOINLINE void
+prep_8tap_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,
+            const int w, int h, const int mx, const int my,
+            const int filter_type)
+{
+    GET_FILTERS();
+    src_stride = PXSTRIDE(src_stride);
+
+    if (fh) {
+        if (fv) {
+            int tmp_h = h + 7;
+            coef mid[128 * 135], *mid_ptr = mid;
+
+            src -= src_stride * 3;
+            do {
+                for (int x = 0; x < w; x++)
+                    mid_ptr[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
+
+                mid_ptr += 128;
+                src += src_stride;
+            } while (--tmp_h);
+
+            mid_ptr = mid + 128 * 3;
+            do {
+                for (int x = 0; x < w; x++)
+                    tmp[x] = FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6);
+
+                mid_ptr += 128;
+                tmp += w;
+            } while (--h);
+        } else {
+            do {
+                for (int x = 0; x < w; x++)
+                    tmp[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
+
+                tmp += w;
+                src += src_stride;
+            } while (--h);
+        }
+    } else if (fv) {
+        do {
+            for (int x = 0; x < w; x++)
+                tmp[x] = FILTER_8TAP_RND(src, x, fv, src_stride, 2);
+
+            tmp += w;
+            src += src_stride;
+        } while (--h);
+    } else
+        prep_c(tmp, src, src_stride, w, h);
+}
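One way to sanity-check the shift amounts in put_8tap_c and prep_8tap_c: assuming the 8-tap subpel filters sum to 64 (unity gain), the horizontal pass with sh=2 keeps a 16x intermediate scale; the vertical pass with sh=10 removes it again for put, while sh=6 keeps it for prep, matching the src[x] << 4 in prep_c. A minimal check with a hypothetical identity filter (an assumption for illustration, not one of the dav1d tables):

#include <assert.h>

// Hypothetical "identity" 8-tap filter: all weight on the centre tap,
// summing to 64 like the real subpel filters are assumed to here.
static const int unit_filter[8] = { 0, 0, 0, 64, 0, 0, 0, 0 };

// Same structure and rounding as FILTER_8TAP_RND, on plain ints.
static int filter8(const int *const src, const int *const f,
                   const int stride, const int sh)
{
    int sum = 0;
    for (int k = 0; k < 8; k++)
        sum += f[k] * src[(k - 3) * stride];
    return (sum + ((1 << sh) >> 1)) >> sh;
}

int main(void)
{
    int flat[16];
    for (int i = 0; i < 16; i++) flat[i] = 200;  // flat 8-bit region
    const int mid = filter8(&flat[8], unit_filter, 1, 2);
    assert(mid == 200 * 16);                     // horizontal pass keeps a 16x scale
    // A second (vertical) pass over such 16x values with sh=10 returns pixel
    // scale (put path); with sh=6 it keeps the 16x scale used by prep_c.
    return 0;
}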
+
+#define filter_fns(type, type_h, type_v) \
+static void put_8tap_##type##_c(pixel *const dst, \
+                                const ptrdiff_t dst_stride, \
+                                const pixel *const src, \
+                                const ptrdiff_t src_stride, \
+                                const int w, const int h, \
+                                const int mx, const int my) \
+{ \
+    put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \
+               type_h | (type_v << 2)); \
+} \
+static void prep_8tap_##type##_c(coef *const tmp, \
+                                 const pixel *const src, \
+                                 const ptrdiff_t src_stride, \
+                                 const int w, const int h, \
+                                 const int mx, const int my) \
+{ \
+    prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \
+                type_h | (type_v << 2)); \
+}
+
+filter_fns(regular,        FILTER_8TAP_REGULAR, FILTER_8TAP_REGULAR)
+filter_fns(regular_sharp,  FILTER_8TAP_REGULAR, FILTER_8TAP_SHARP)
+filter_fns(regular_smooth, FILTER_8TAP_REGULAR, FILTER_8TAP_SMOOTH)
+filter_fns(smooth,         FILTER_8TAP_SMOOTH,  FILTER_8TAP_SMOOTH)
+filter_fns(smooth_regular, FILTER_8TAP_SMOOTH,  FILTER_8TAP_REGULAR)
+filter_fns(smooth_sharp,   FILTER_8TAP_SMOOTH,  FILTER_8TAP_SHARP)
+filter_fns(sharp,          FILTER_8TAP_SHARP,   FILTER_8TAP_SHARP)
+filter_fns(sharp_regular,  FILTER_8TAP_SHARP,   FILTER_8TAP_REGULAR)
+filter_fns(sharp_smooth,   FILTER_8TAP_SHARP,   FILTER_8TAP_SMOOTH)
+
+#define FILTER_BILIN(src, x, mxy, stride) \
+    (16 * src[x] + (mxy * (src[x + stride] - src[x])))
+
+#define FILTER_BILIN_RND(src, x, mxy, stride, sh) \
+    ((FILTER_BILIN(src, x, mxy, stride) + ((1 << sh) >> 1)) >> sh)
+
+#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
+    iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))
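FILTER_BILIN is plain linear interpolation with a 1/16 fractional step: 16 * a + mxy * (b - a). A worked example with hypothetical values:

// a = src[x] = 10, b = src[x + stride] = 26, mxy = 4 (i.e. 4/16 of the way):
//   FILTER_BILIN            = 16*10 + 4*(26 - 10) = 160 + 64 = 224
//   FILTER_BILIN_RND, sh=4  = (224 + 8) >> 4      = 14
// which matches 10 + (26 - 10) * 4/16 = 14 exactly.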
+
+static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride,
+                        const pixel *src, ptrdiff_t src_stride,
+                        const int w, int h, const int mx, const int my)
+{
+    dst_stride = PXSTRIDE(dst_stride);
+    src_stride = PXSTRIDE(src_stride);
+
+    if (mx) {
+        if (my) {
+            coef mid[128 * 129], *mid_ptr = mid;
+            int tmp_h = h + 1;
+
+            do {
+                for (int x = 0; x < w; x++)
+                    mid_ptr[x] = FILTER_BILIN(src, x, mx, 1);
+
+                mid_ptr += 128;
+                src += src_stride;
+            } while (--tmp_h);
+
+            mid_ptr = mid;
+            do {
+                for (int x = 0; x < w; x++)
+                    dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my, 128, 8);
+
+                mid_ptr += 128;
+                dst += dst_stride;
+            } while (--h);
+        } else {
+            do {
+                for (int x = 0; x < w; x++)
+                    dst[x] = FILTER_BILIN_CLIP(src, x, mx, 1, 4);
+
+                dst += dst_stride;
+                src += src_stride;
+            } while (--h);
+        }
+    } else if (my) {
+        do {
+            for (int x = 0; x < w; x++)
+                dst[x] = FILTER_BILIN_CLIP(src, x, my, src_stride, 4);
+
+            dst += dst_stride;
+            src += src_stride;
+        } while (--h);
+    } else
+        put_c(dst, dst_stride, src, src_stride, w, h);
+}
+
+static void prep_bilin_c(coef *tmp,
+                         const pixel *src, ptrdiff_t src_stride,
+                         const int w, int h, const int mx, const int my)
+{
+    src_stride = PXSTRIDE(src_stride);
+
+    if (mx) {
+        if (my) {
+            coef mid[128 * 129], *mid_ptr = mid;
+            int tmp_h = h + 1;
+
+            do {
+                for (int x = 0; x < w; x++)
+                    mid_ptr[x] = FILTER_BILIN(src, x, mx, 1);
+
+                mid_ptr += 128;
+                src += src_stride;
+            } while (--tmp_h);
+
+            mid_ptr = mid;
+            do {
+                for (int x = 0; x < w; x++)
+                    tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 128, 4);
+
+                mid_ptr += 128;
+                tmp += w;
+            } while (--h);
+        } else {
+            do {
+                for (int x = 0; x < w; x++)
+                    tmp[x] = FILTER_BILIN(src, x, mx, 1);
+
+                tmp += w;
+                src += src_stride;
+            } while (--h);
+        }
+    } else if (my) {
+        do {
+            for (int x = 0; x < w; x++)
+                tmp[x] = FILTER_BILIN(src, x, my, src_stride);
+
+            tmp += w;
+            src += src_stride;
+        } while (--h);
+    } else
+        prep_c(tmp, src, src_stride, w, h);
+}
+
+static void avg_c(pixel *dst, const ptrdiff_t dst_stride,
+                  const coef *tmp1, const coef *tmp2, const int w, int h)
+{
+    do {
+        for (int x = 0; x < w; x++)
+            dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + 16) >> 5);
+
+        tmp1 += w;
+        tmp2 += w;
+        dst += PXSTRIDE(dst_stride);
+    } while (--h);
+}
+
+static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
+                    const coef *tmp1, const coef *tmp2, const int w, int h,
+                    const int weight)
+{
+    do {
+        for (int x = 0; x < w; x++)
+            dst[x] = iclip_pixel((tmp1[x] * weight +
+                                  tmp2[x] * (16 - weight) + 128) >> 8);
+
+        tmp1 += w;
+        tmp2 += w;
+        dst += PXSTRIDE(dst_stride);
+    } while (--h);
+}
+
+static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
+                   const coef *tmp1, const coef *tmp2, const int w, int h,
+                   const uint8_t *mask)
+{
+    do {
+        for (int x = 0; x < w; x++)
+            dst[x] = iclip_pixel((tmp1[x] * mask[x] +
+                                  tmp2[x] * (64 - mask[x]) + 512) >> 10);
+
+        tmp1 += w;
+        tmp2 += w;
+        mask += w;
+        dst += PXSTRIDE(dst_stride);
+    } while (--h);
+}
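+
+/* Informal note on the rounding constants above: the prep/mct intermediates
+ * appear to carry 4 extra fractional bits, so avg_c halves the doubled sum
+ * with (+16) >> 5, w_avg_c uses sixteenths as weights with (+128) >> 8, and
+ * mask_c uses sixty-fourths with (+512) >> 10. In each case the shift removes
+ * the weight range plus those 4 bits, rounding to nearest before the clip to
+ * the pixel range. */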
+
+static void blend_c(pixel *dst, const ptrdiff_t dst_stride,
+                    const pixel *tmp, const ptrdiff_t tmp_stride,
+                    const int w, const int h,
+                    const uint8_t *mask, const ptrdiff_t m_stride)
+{
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
+            dst[x] = blend_px(dst[x], tmp[x], mask[m_stride == 1 ? 0 : x]);
+        }
+        dst += PXSTRIDE(dst_stride);
+        tmp += PXSTRIDE(tmp_stride);
+        mask += m_stride;
+    }
+}
+
+static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
+                     const coef *tmp1, const coef *tmp2, const int w, int h,
+                     uint8_t *mask, const int sign,
+                     const int ss_hor, const int ss_ver)
+{
+    // store mask at 2x2 resolution, i.e. store 2x1 sum for even rows,
+    // and then load this intermediate to calculate final value for odd rows
+    const int rnd = 8 << (BITDEPTH - 8);
+    do {
+        for (int x = 0; x < w; x++) {
+            const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + rnd) >> BITDEPTH), 64);
+            dst[x] = iclip_pixel((tmp1[x] * m +
+                                  tmp2[x] * (64 - m) + 512) >> 10);
+
+            if (ss_hor) {
+                x++;
+
+                const int n = imin(38 + ((abs(tmp1[x] - tmp2[x]) + rnd) >> BITDEPTH), 64);
+                dst[x] = iclip_pixel((tmp1[x] * n +
+                                      tmp2[x] * (64 - n) + 512) >> 10);
+
+                if (h & ss_ver) {
+                    mask[x >> 1] = (m + n + mask[x >> 1] + 2 - sign) >> 2;
+                } else if (ss_ver) {
+                    mask[x >> 1] = m + n;
+                } else {
+                    mask[x >> 1] = (m + n + 1 - sign) >> 1;
+                }
+            } else {
+                mask[x] = m;
+            }
+        }
+
+        tmp1 += w;
+        tmp2 += w;
+        dst += PXSTRIDE(dst_stride);
+        if (!ss_ver || (h & 1)) mask += w >> ss_hor;
+    } while (--h);
+}
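+
+/* Informal note: w_mask_c derives the per-pixel blend weight m from the
+ * difference of the two intermediate predictions (clamped to the range
+ * 38..64) and also emits that mask downsampled by the chroma subsampling.
+ * With ss_hor the inner loop handles horizontal pixel pairs (m and n); with
+ * ss_ver the first row of each vertical pair (h still even, for the even
+ * block heights used here) stores the partial sum m + n, and the second row
+ * (h odd) folds it in with (+2 - sign) >> 2, only then advancing mask. */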
+
+#define w_mask_fns(ssn, ss_hor, ss_ver) \
+static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \
+                             const coef *const tmp1, const coef *const tmp2, \
+                             const int w, const int h, uint8_t *mask, \
+                             const int sign) \
+{ \
+    w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, ss_hor, ss_ver); \
+}
+
+w_mask_fns(444, 0, 0);
+w_mask_fns(422, 1, 0);
+w_mask_fns(420, 1, 1);
+
+#undef w_mask_fns
+
+static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
+                              const pixel *src, const ptrdiff_t src_stride,
+                              const int16_t *const abcd, int mx, int my)
+{
+    coef mid[15 * 8], *mid_ptr = mid;
+
+    src -= 3 * PXSTRIDE(src_stride);
+    for (int y = 0; y < 15; y++, mx += abcd[1]) {
+        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
+            const int8_t *const filter =
+                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
+
+            mid_ptr[x] = FILTER_8TAP_RND(src, x, filter, 1, 3);
+        }
+        src += PXSTRIDE(src_stride);
+        mid_ptr += 8;
+    }
+
+    mid_ptr = &mid[3 * 8];
+    for (int y = 0; y < 8; y++, my += abcd[3]) {
+        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
+            const int8_t *const filter =
+                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
+
+            dst[x] = FILTER_8TAP_CLIP(mid_ptr, x, filter, 8, 11);
+        }
+        mid_ptr += 8;
+        dst += PXSTRIDE(dst_stride);
+    }
+}
+
+static void warp_affine_8x8t_c(coef *tmp, const ptrdiff_t tmp_stride,
+                               const pixel *src, const ptrdiff_t src_stride,
+                               const int16_t *const abcd, int mx, int my)
+{
+    coef mid[15 * 8], *mid_ptr = mid;
+
+    src -= 3 * PXSTRIDE(src_stride);
+    for (int y = 0; y < 15; y++, mx += abcd[1]) {
+        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
+            const int8_t *const filter =
+                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
+
+            mid_ptr[x] = FILTER_8TAP_RND(src, x, filter, 1, 3);
+        }
+        src += PXSTRIDE(src_stride);
+        mid_ptr += 8;
+    }
+
+    mid_ptr = &mid[3 * 8];
+    for (int y = 0; y < 8; y++, my += abcd[3]) {
+        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
+            const int8_t *const filter =
+                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
+
+            tmp[x] = FILTER_8TAP_RND(mid_ptr, x, filter, 8, 7);
+        }
+        mid_ptr += 8;
+        tmp += tmp_stride;
+    }
+}
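+
+/* Informal note: both warp kernels produce one 8x8 block. mx/my and the
+ * per-pixel abcd increments are in the warp matrix's high-precision subpel
+ * units; each sample picks its 8-tap filter as
+ * dav1d_mc_warp_filter[64 + ((t + 512) >> 10)], which appears to round the
+ * position to 1/64th-pel precision. 15 intermediate rows (7 extra for the
+ * vertical taps) feed the 8 output rows; the *_t variant keeps the higher-
+ * precision intermediate (shift 7) instead of clipping to pixels (shift 11). */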
+
+void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
+#define init_mc_fns(type, name) do { \
+    c->mc [type] = put_##name##_c; \
+    c->mct[type] = prep_##name##_c; \
+} while (0)
+
+    init_mc_fns(FILTER_2D_8TAP_REGULAR,        8tap_regular);
+    init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
+    init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp);
+    init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular);
+    init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth);
+    init_mc_fns(FILTER_2D_8TAP_SHARP,          8tap_sharp);
+    init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular);
+    init_mc_fns(FILTER_2D_8TAP_SMOOTH,         8tap_smooth);
+    init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp);
+    init_mc_fns(FILTER_2D_BILINEAR,            bilin);
+
+    c->avg      = avg_c;
+    c->w_avg    = w_avg_c;
+    c->mask     = mask_c;
+    c->blend    = blend_c;
+    c->w_mask[0] = w_mask_444_c;
+    c->w_mask[1] = w_mask_422_c;
+    c->w_mask[2] = w_mask_420_c;
+    c->warp8x8  = warp_affine_8x8_c;
+    c->warp8x8t = warp_affine_8x8t_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_mc_dsp_init_arm)(c);
+#elif ARCH_X86
+    bitfn(dav1d_mc_dsp_init_x86)(c);
+#endif
+#endif
+}
--- a/src/meson.build
+++ b/src/meson.build
@@ -52,17 +52,17 @@
 # These files are compiled for each bitdepth with
 # `BITDEPTH` defined to the currently built bitdepth.
 libdav1d_tmpl_sources = files(
-    'ipred.c',
-    'itx.c',
-    'ipred_prepare.c',
-    'lf_apply.c',
-    'loopfilter.c',
-    'mc.c',
-    'cdef_apply.c',
-    'cdef.c',
-    'lr_apply.c',
-    'looprestoration.c',
-    'recon.c'
+    'ipred_tmpl.c',
+    'itx_tmpl.c',
+    'ipred_prepare_tmpl.c',
+    'lf_apply_tmpl.c',
+    'loopfilter_tmpl.c',
+    'mc_tmpl.c',
+    'cdef_apply_tmpl.c',
+    'cdef_tmpl.c',
+    'lr_apply_tmpl.c',
+    'looprestoration_tmpl.c',
+    'recon_tmpl.c'
 )
 
 # libdav1d entrypoint source files
--- a/src/recon.c
+++ /dev/null
@@ -1,1518 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <string.h>
-#include <stdio.h>
-
-#include "common/attributes.h"
-#include "common/bitdepth.h"
-#include "common/dump.h"
-#include "common/intops.h"
-#include "common/mem.h"
-
-#include "src/cdef_apply.h"
-#include "src/ipred_prepare.h"
-#include "src/lf_apply.h"
-#include "src/lr_apply.h"
-#include "src/recon.h"
-#include "src/scan.h"
-#include "src/tables.h"
-#include "src/wedge.h"
-
-static unsigned read_golomb(MsacContext *const msac) {
-    int len = 0;
-    unsigned val = 1;
-
-    while (!msac_decode_bool(msac, 128 << 7) && len < 32) len++;
-    while (len--) val = (val << 1) | msac_decode_bool(msac, 128 << 7);
-
-    return val - 1;
-}
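-
-/* Informal note: this is an Exp-Golomb-style reader built on the raw-bit
- * probability (128 << 7): count leading zero bits into len, then read len
- * more bits below the implicit leading 1, and return the value minus one.
- * For example, two leading zeros followed by the bits 1 and 0 give
- * val = 0b110 = 6, so the function returns 5. */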
-
-static int decode_coefs(Dav1dTileContext *const t,
-                        uint8_t *const a, uint8_t *const l,
-                        const enum RectTxfmSize tx, const enum BlockSize bs,
-                        const Av1Block *const b, const int intra,
-                        const int plane, coef *cf,
-                        enum TxfmType *const txtp, uint8_t *res_ctx)
-{
-    Dav1dTileState *const ts = t->ts;
-    const int chroma = !!plane;
-    const Dav1dFrameContext *const f = t->f;
-    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
-    const int dbg = DEBUG_BLOCK_INFO && plane && 0;
-
-    if (dbg) printf("Start: r=%d\n", ts->msac.rng);
-
-    // does this block have any non-zero coefficients
-    const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.p.layout);
-    const int all_skip =
-        msac_decode_bool_adapt(&ts->msac, ts->cdf.coef.skip[t_dim->ctx][sctx]);
-    if (dbg)
-    printf("Post-non-zero[%d][%d][%d]: r=%d\n",
-           t_dim->ctx, sctx, all_skip, ts->msac.rng);
-    if (all_skip) {
-        *res_ctx = 0x40;
-        *txtp = f->frame_hdr.segmentation.lossless[b->seg_id] ? WHT_WHT :
-                                                                DCT_DCT;
-        return -1;
-    }
-
-    // transform type (chroma: derived, luma: explicitly coded)
-    if (chroma) {
-        if (intra) {
-            *txtp = get_uv_intra_txtp(b->uv_mode, tx, &f->frame_hdr, b->seg_id);
-        } else {
-            const enum TxfmType y_txtp = *txtp;
-            *txtp = get_uv_inter_txtp(t_dim, y_txtp, &f->frame_hdr, b->seg_id);
-        }
-    } else {
-        const enum TxfmTypeSet set = get_ext_txtp_set(tx, !intra,
-                                                      &f->frame_hdr, b->seg_id);
-        const unsigned set_cnt = dav1d_tx_type_count[set];
-        unsigned idx;
-        if (set_cnt == 1) {
-            idx = 0;
-        } else {
-            const int set_idx = dav1d_tx_type_set_index[!intra][set];
-            const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
-                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
-            uint16_t *const txtp_cdf = intra ?
-                       ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
-                       ts->cdf.m.txtp_inter[set_idx][t_dim->min];
-            idx = msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt);
-            if (dbg)
-            printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
-                   set, set_idx, tx, t_dim->min, b->intra ? (int)y_mode_nofilt : -1,
-                   idx, dav1d_tx_types_per_set[set][idx], ts->msac.rng);
-        }
-        *txtp = dav1d_tx_types_per_set[set][idx];
-    }
-
-    // find end-of-block (eob)
-    int eob_bin;
-    const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32);
-    const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
-    const int is_1d = tx_class != TX_CLASS_2D;
-    switch (tx2dszctx) {
-#define case_sz(sz, bin) \
-    case sz: { \
-        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
-        eob_bin = msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \
-        break; \
-    }
-    case_sz(0,   16);
-    case_sz(1,   32);
-    case_sz(2,   64);
-    case_sz(3,  128);
-    case_sz(4,  256);
-    case_sz(5,  512);
-    case_sz(6, 1024);
-#undef case_sz
-    }
-    if (dbg)
-    printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
-           16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
-    int eob;
-    if (eob_bin > 1) {
-        eob = 1 << (eob_bin - 1);
-        uint16_t *const eob_hi_bit_cdf =
-            ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
-        const int eob_hi_bit = msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
-        if (dbg)
-        printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
-               t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
-        unsigned mask = eob >> 1;
-        if (eob_hi_bit) eob |= mask;
-        for (mask >>= 1; mask; mask >>= 1) {
-            const int eob_bit = msac_decode_bool(&ts->msac, 128 << 7);
-            if (eob_bit) eob |= mask;
-        }
-        if (dbg)
-        printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
-    } else {
-        eob = eob_bin;
-    }
-
-    // base tokens
-    uint16_t (*const br_cdf)[5] =
-        ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
-    const int16_t *const scan = dav1d_scans[tx][tx_class];
-    uint8_t levels[36 * 36];
-    ptrdiff_t stride = 4 * (imin(t_dim->h, 8) + 1);
-    memset(levels, 0, stride * 4 * (imin(t_dim->w, 8) + 1));
-    const int shift = 2 + imin(t_dim->lh, 3), mask = 4 * imin(t_dim->h, 8) - 1;
-    unsigned cul_level = 0;
-    for (int i = eob, is_last = 1; i >= 0; i--, is_last = 0) {
-        const int rc = scan[i], x = rc >> shift, y = rc & mask;
-
-        // lo tok
-        const int ctx = get_coef_nz_ctx(levels, i, rc, is_last, tx, tx_class);
-        uint16_t *const lo_cdf = is_last ?
-            ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] :
-            ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
-        int tok = msac_decode_symbol_adapt(&ts->msac, lo_cdf,
-                                           4 - is_last) + is_last;
-        if (dbg)
-        printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
-               t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
-        if (!tok) continue;
-
-        // hi tok
-        if (tok == 3) {
-            const int br_ctx = get_br_ctx(levels, rc, tx, tx_class);
-            do {
-                const int tok_br =
-                    msac_decode_symbol_adapt(&ts->msac, br_cdf[br_ctx], 4);
-                if (dbg)
-                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
-                       imin(t_dim->ctx, 3), chroma, br_ctx,
-                       i, rc, tok_br, tok, ts->msac.rng);
-                tok += tok_br;
-                if (tok_br < 3) break;
-            } while (tok < 15);
-        }
-
-        levels[x * stride + y] = cf[rc] = tok;
-    }
-
-    // residual and sign
-    int dc_sign = 1;
-    const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
-    const uint8_t *const qm_tbl = f->qm[is_1d || *txtp == IDTX][tx][plane];
-    const int dq_shift = imax(0, t_dim->ctx - 2);
-    for (int i = 0; i <= eob; i++) {
-        const int rc = scan[i];
-        int tok = cf[rc];
-        if (!tok) continue;
-        int dq;
-
-        // sign
-        int sign;
-        if (i == 0) {
-            const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l);
-            uint16_t *const dc_sign_cdf =
-                ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
-            sign = msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
-            if (dbg)
-            printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
-                   chroma, dc_sign_ctx, sign, ts->msac.rng);
-            dc_sign = sign ? 0 : 2;
-            dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5;
-        } else {
-            sign = msac_decode_bool(&ts->msac, 128 << 7);
-            if (dbg)
-            printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng);
-            dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5;
-        }
-
-        // residual
-        if (tok == 15) {
-            tok += read_golomb(&ts->msac);
-            if (dbg)
-            printf("Post-residual[%d=%d=%d->%d]: r=%d\n",
-                   i, rc, tok - 15, tok, ts->msac.rng);
-        }
-
-        // dequant
-        cul_level += tok;
-        tok *= dq;
-        tok >>= dq_shift;
-        cf[rc] = sign ? -tok : tok;
-    }
-
-    // context
-    *res_ctx = imin(cul_level, 63) | (dc_sign << 6);
-
-    return eob;
-}
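-
-/* Informal summary of decode_coefs(): decode the all-skip flag (returning -1
- * when the block has no coefficients), derive or decode the transform type,
- * decode the end-of-block position (eob_bin class plus extra bits), then walk
- * the scan order from eob down to 0 decoding base tokens (0..3) and, for
- * tok == 3, bounded increments up to 15. A second forward pass decodes signs,
- * reads a Golomb-coded remainder for saturated tokens, dequantizes with the
- * qm-scaled dq tables, and writes the context byte
- * imin(cul_level, 63) | (dc_sign << 6) back for the caller. */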
-
-static void read_coef_tree(Dav1dTileContext *const t,
-                           const enum BlockSize bs, const Av1Block *const b,
-                           const enum RectTxfmSize ytx, const int depth,
-                           const uint16_t *const tx_split,
-                           const int x_off, const int y_off, pixel *dst)
-{
-    const Dav1dFrameContext *const f = t->f;
-    Dav1dTileState *const ts = t->ts;
-    const Dav1dDSPContext *const dsp = f->dsp;
-    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
-    const int txw = t_dim->w, txh = t_dim->h;
-
-    if (depth < 2 && tx_split[depth] & (1 << (y_off * 4 + x_off))) {
-        const enum RectTxfmSize sub = t_dim->sub;
-        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
-        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
-
-        read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
-                       x_off * 2 + 0, y_off * 2 + 0, dst);
-        t->bx += txsw;
-        if (txw >= txh && t->bx < f->bw)
-            read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
-                           y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
-        t->bx -= txsw;
-        t->by += txsh;
-        if (txh >= txw && t->by < f->bh) {
-            if (dst)
-                dst += 4 * txsh * PXSTRIDE(f->cur.p.stride[0]);
-            read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
-                           x_off * 2 + 0, y_off * 2 + 1, dst);
-            t->bx += txsw;
-            if (txw >= txh && t->bx < f->bw)
-                read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
-                               y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
-            t->bx -= txsw;
-        }
-        t->by -= txsh;
-    } else {
-        const int bx4 = t->bx & 31, by4 = t->by & 31;
-        enum TxfmType txtp;
-        uint8_t cf_ctx;
-        int eob;
-        coef *cf;
-        struct CodedBlockInfo *cbi;
-
-        if (f->frame_thread.pass) {
-            cf = ts->frame_thread.cf;
-            ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
-            cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
-        } else {
-            cf = t->cf;
-        }
-        if (f->frame_thread.pass != 2) {
-            eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
-                               ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
-            if (DEBUG_BLOCK_INFO)
-                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
-                       ytx, txtp, eob, ts->msac.rng);
-            memset(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
-            memset(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
-            for (int y = 0; y < txh; y++)
-                memset(&t->txtp_map[(by4 + y) * 32 + bx4], txtp, txw);
-            if (f->frame_thread.pass == 1) {
-                cbi->eob[0] = eob;
-                cbi->txtp[0] = txtp;
-            }
-        } else {
-            eob = cbi->eob[0];
-            txtp = cbi->txtp[0];
-        }
-        if (!(f->frame_thread.pass & 1)) {
-            assert(dst);
-            if (eob >= 0) {
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                    coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
-                dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.p.stride[0], cf, eob);
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                    hex_dump(dst, f->cur.p.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
-            }
-        }
-    }
-}
-
-void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
-                                    const enum BlockSize bs, const Av1Block *const b)
-{
-    const Dav1dFrameContext *const f = t->f;
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int bx4 = t->bx & 31, by4 = t->by & 31;
-    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
-    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
-    const int bw4 = b_dim[0], bh4 = b_dim[1];
-    const int cbw4 = (bw4 + 1) >> ss_hor, cbh4 = (bh4 + 1) >> ss_ver;
-    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
-                           (bw4 > ss_hor || t->bx & 1) &&
-                           (bh4 > ss_ver || t->by & 1);
-
-    if (b->skip) {
-        memset(&t->a->lcoef[bx4], 0x40, bw4);
-        memset(&t->l.lcoef[by4], 0x40, bh4);
-        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
-            memset(&t->a->ccoef[pl][cbx4], 0x40, cbw4);
-            memset(&t->l.ccoef[pl][cby4], 0x40, cbh4);
-        }
-        return;
-    }
-
-    Dav1dTileState *const ts = t->ts;
-    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
-    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
-    assert(f->frame_thread.pass == 1);
-    assert(!b->skip);
-    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
-    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
-
-    for (int init_y = 0; init_y < h4; init_y += 16) {
-        for (int init_x = 0; init_x < w4; init_x += 16) {
-            const int sub_h4 = imin(h4, 16 + init_y);
-            const int sub_w4 = imin(w4, init_x + 16);
-            int y_off = !!init_y, y, x;
-            for (y = init_y, t->by += init_y; y < sub_h4;
-                 y += t_dim->h, t->by += t_dim->h, y_off++)
-            {
-                struct CodedBlockInfo *const cbi =
-                    &f->frame_thread.cbi[t->by * f->b4_stride];
-                int x_off = !!init_x;
-                for (x = init_x, t->bx += init_x; x < sub_w4;
-                     x += t_dim->w, t->bx += t_dim->w, x_off++)
-                {
-                    if (!b->intra) {
-                        read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
-                                       x_off, y_off, NULL);
-                    } else {
-                        uint8_t cf_ctx = 0x40;
-                        enum TxfmType txtp;
-                        const int eob = cbi[t->bx].eob[0] =
-                            decode_coefs(t, &t->a->lcoef[bx4 + x],
-                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
-                                         0, ts->frame_thread.cf, &txtp, &cf_ctx);
-                        if (DEBUG_BLOCK_INFO)
-                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
-                                   b->tx, txtp, eob, ts->msac.rng);
-                        cbi[t->bx].txtp[0] = txtp;
-                        ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
-                        memset(&t->a->lcoef[bx4 + x], cf_ctx,
-                               imin(t_dim->w, f->bw - t->bx));
-                        memset(&t->l.lcoef[by4 + y], cf_ctx,
-                               imin(t_dim->h, f->bh - t->by));
-                    }
-                }
-                t->bx -= x;
-            }
-            t->by -= y;
-
-            if (!has_chroma) continue;
-
-            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
-            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
-            for (int pl = 0; pl < 2; pl++) {
-                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
-                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
-                {
-                    struct CodedBlockInfo *const cbi =
-                        &f->frame_thread.cbi[t->by * f->b4_stride];
-                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
-                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
-                    {
-                        uint8_t cf_ctx = 0x40;
-                        enum TxfmType txtp;
-                        if (!b->intra)
-                            txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
-                                                bx4 + (x << ss_hor)];
-                        const int eob = cbi[t->bx].eob[1 + pl] =
-                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
-                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
-                                         b, b->intra, 1 + pl, ts->frame_thread.cf,
-                                         &txtp, &cf_ctx);
-                        if (DEBUG_BLOCK_INFO)
-                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
-                                   "txtp=%d,eob=%d]: r=%d\n",
-                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
-                        cbi[t->bx].txtp[1 + pl] = txtp;
-                        ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
-                        memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
-                               imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                        memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
-                               imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
-                    }
-                    t->bx -= x << ss_hor;
-                }
-                t->by -= y << ss_ver;
-            }
-        }
-    }
-}
-
-static void emu_edge(pixel *dst, const ptrdiff_t dst_stride,
-                     const pixel *ref, const ptrdiff_t ref_stride,
-                     const int bw, const int bh,
-                     const int iw, const int ih,
-                     const int x, const int y)
-{
-    // find offset in reference of visible block to copy
-    ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + iclip(x, 0, iw - 1);
-
-    // number of pixels to extend (left, right, top, bottom)
-    const int left_ext = iclip(-x, 0, bw - 1);
-    const int right_ext = iclip(x + bw - iw, 0, bw - 1);
-    assert(left_ext + right_ext < bw);
-    const int top_ext = iclip(-y, 0, bh - 1);
-    const int bottom_ext = iclip(y + bh - ih, 0, bh - 1);
-    assert(top_ext + bottom_ext < bh);
-
-    // copy visible portion first
-    pixel *blk = dst + top_ext * PXSTRIDE(dst_stride);
-    const int center_w = bw - left_ext - right_ext;
-    const int center_h = bh - top_ext - bottom_ext;
-    for (int y = 0; y < center_h; y++) {
-        pixel_copy(blk + left_ext, ref, center_w);
-        // extend left edge for this line
-        if (left_ext)
-            pixel_set(blk, blk[left_ext], left_ext);
-        // extend right edge for this line
-        if (right_ext)
-            pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1],
-                      right_ext);
-        ref += PXSTRIDE(ref_stride);
-        blk += PXSTRIDE(dst_stride);
-    }
-
-    // copy top
-    blk = dst + top_ext * PXSTRIDE(dst_stride);
-    for (int y = 0; y < top_ext; y++) {
-        pixel_copy(dst, blk, bw);
-        dst += PXSTRIDE(dst_stride);
-    }
-
-    // copy bottom
-    dst += center_h * PXSTRIDE(dst_stride);
-    for (int y = 0; y < bottom_ext; y++) {
-        pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw);
-        dst += PXSTRIDE(dst_stride);
-    }
-}
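-
-/* Informal note: emu_edge() materialises an out-of-frame reference block by
- * clamping the source position into the frame, copying the visible centre,
- * replicating the leftmost/rightmost visible pixel across the left/right
- * extensions on each line, and finally duplicating the first/last written
- * row into the top/bottom extensions. */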
-
-static void mc(Dav1dTileContext *const t,
-               pixel *const dst8, coef *const dst16, const ptrdiff_t dst_stride,
-               const int bw4, const int bh4,
-               const int bx, const int by, const int pl,
-               const mv mv, const Dav1dThreadPicture *const refp,
-               const enum Filter2d filter_2d)
-{
-    assert((dst8 != NULL) ^ (dst16 != NULL));
-    const Dav1dFrameContext *const f = t->f;
-    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
-    const int mvx = mv.x, mvy = mv.y;
-    const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
-    const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
-    const int dy = by * v_mul + (mvy >> (3 + ss_ver));
-    ptrdiff_t ref_stride = refp->p.stride[!!pl];
-    const pixel *ref;
-    int w, h;
-
-    if (refp != &f->cur) { // i.e. not for intrabc
-        dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4,
-                                  PLANE_TYPE_Y + !!pl);
-        w = (f->cur.p.p.w + ss_hor) >> ss_hor;
-        h = (f->cur.p.p.h + ss_ver) >> ss_ver;
-    } else {
-        w = f->bw * 4 >> ss_hor;
-        h = f->bh * 4 >> ss_ver;
-    }
-    if (dx < !!mx * 3 || dy < !!my * 3 ||
-        dx + bw4 * h_mul + !!mx * 4 > w ||
-        dy + bh4 * v_mul + !!my * 4 > h)
-    {
-        emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl], ref_stride,
-                 bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7, w, h,
-                 dx - !!mx * 3, dy - !!my * 3);
-        ref = &t->emu_edge[160 * !!my * 3 + !!mx * 3];
-        ref_stride = 160 * sizeof(pixel);
-    } else {
-        ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
-    }
-
-    if (dst8 != NULL) {
-        f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
-                                 bh4 * v_mul, mx << !ss_hor, my << !ss_ver);
-    } else {
-        f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
-                                  bh4 * v_mul, mx << !ss_hor, my << !ss_ver);
-    }
-}
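-
-/* Informal note: mc() splits the motion vector into an integer position
- * (dx, dy) and a subpel fraction (mx, my), 1/8-pel for luma and 1/16-pel
- * along subsampled chroma axes. If the 8-tap support (3 pixels before and 4
- * after a subpel position) would read outside the reference, the block is
- * first padded into t->emu_edge via emu_edge(); otherwise the reference is
- * addressed directly. dst8 selects the put path (mc.mc), dst16 the
- * higher-precision prep path (mc.mct). */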
-
-static void obmc(Dav1dTileContext *const t,
-                 pixel *const dst, const ptrdiff_t dst_stride,
-                 const uint8_t *const b_dim, const int pl,
-                 const int bx4, const int by4, const int w4, const int h4)
-{
-    assert(!(t->bx & 1) && !(t->by & 1));
-    const Dav1dFrameContext *const f = t->f;
-    const refmvs *const r = &f->mvs[t->by * f->b4_stride + t->bx];
-    pixel *const lap = t->scratch.lap;
-    static const uint8_t obmc_mask_2[2] = { 19,  0 };
-    static const uint8_t obmc_mask_4[4] = { 25, 14,  5,  0 };
-    static const uint8_t obmc_mask_8[8] = { 28, 22, 16, 11,  7,  3,  0,  0 };
-    static const uint8_t obmc_mask_16[16] = { 30, 27, 24, 21, 18, 15, 12, 10,
-                                               8,  6,  4,  3,  0,  0,  0,  0 };
-    static const uint8_t obmc_mask_32[32] = { 31, 29, 28, 26, 24, 23, 21, 20,
-                                              19, 17, 16, 14, 13, 12, 11,  9,
-                                               8,  7,  6,  5,  4,  4,  3,  2,
-                                               0,  0,  0,  0,  0,  0,  0,  0 };
-    static const uint8_t *const obmc_masks[] = {
-        obmc_mask_2, obmc_mask_4, obmc_mask_8, obmc_mask_16, obmc_mask_32
-    };
-    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
-
-    if (t->by > t->ts->tiling.row_start &&
-        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
-    {
-        for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
-            // only odd blocks are considered for overlap handling, hence +1
-            const refmvs *const a_r = &r[x - f->b4_stride + 1];
-            const uint8_t *const a_b_dim =
-                dav1d_block_dimensions[sbtype_to_bs[a_r->sb_type]];
-
-            if (a_r->ref[0] > 0) {
-                mc(t, lap, NULL, 128 * sizeof(pixel),
-                   iclip(a_b_dim[0], 2, b_dim[0]), imin(b_dim[1], 16) >> 1,
-                   t->bx + x, t->by, pl, a_r->mv[0],
-                   &f->refp[a_r->ref[0] - 1],
-                   dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
-                f->dsp->mc.blend(&dst[x * h_mul], dst_stride,
-                                 lap, 128 * sizeof(pixel),
-                                 h_mul * iclip(a_b_dim[0], 2, b_dim[0]),
-                                 v_mul * imin(b_dim[1], 16) >> 1,
-                                 obmc_masks[imin(b_dim[3], 4) - ss_ver], 1);
-                i++;
-            }
-            x += imax(a_b_dim[0], 2);
-        }
-    }
-
-    if (t->bx > t->ts->tiling.col_start)
-        for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
-            // only odd blocks are considered for overlap handling, hence +1
-            const refmvs *const l_r = &r[(y + 1) * f->b4_stride - 1];
-            const uint8_t *const l_b_dim =
-                dav1d_block_dimensions[sbtype_to_bs[l_r->sb_type]];
-
-            if (l_r->ref[0] > 0) {
-                mc(t, lap, NULL, 32 * sizeof(pixel),
-                   imin(b_dim[0], 16) >> 1,
-                   iclip(l_b_dim[1], 2, b_dim[1]),
-                   t->bx, t->by + y, pl, l_r->mv[0],
-                   &f->refp[l_r->ref[0] - 1],
-                   dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
-                f->dsp->mc.blend(&dst[y * v_mul * PXSTRIDE(dst_stride)], dst_stride,
-                                 lap, 32 * sizeof(pixel),
-                                 h_mul * imin(b_dim[0], 16) >> 1,
-                                 v_mul * iclip(l_b_dim[1], 2, b_dim[1]),
-                                 obmc_masks[imin(b_dim[2], 4) - ss_hor], 0);
-                i++;
-            }
-            y += imax(l_b_dim[1], 2);
-        }
-}
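-
-/* Informal note: obmc() re-predicts thin strips along the top and left block
- * edges from up to four above and four left inter neighbours into
- * t->scratch.lap, then blends them over the current prediction with the fixed
- * obmc_masks ramps (mask chosen from the log2 block size; the ramp runs
- * vertically for above neighbours and horizontally for left ones). */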
-
-static void warp_affine(Dav1dTileContext *const t,
-                        pixel *dst8, coef *dst16, const ptrdiff_t dstride,
-                        const uint8_t *const b_dim, const int pl,
-                        const Dav1dThreadPicture *const refp,
-                        const WarpedMotionParams *const wmp)
-{
-    assert((dst8 != NULL) ^ (dst16 != NULL));
-    const Dav1dFrameContext *const f = t->f;
-    const Dav1dDSPContext *const dsp = f->dsp;
-    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
-    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
-    const int32_t *const mat = wmp->matrix;
-    const int width = (f->cur.p.p.w + ss_hor) >> ss_hor;
-    const int height = (f->cur.p.p.h + ss_ver) >> ss_ver;
-
-    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
-        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
-            // calculate transformation relative to center of 8x8 block in
-            // luma pixel units
-            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
-            const int src_y = t->by * 4 + ((y + 4) << ss_ver);
-            const int mvx = (mat[2] * src_x + mat[3] * src_y + mat[0]) >> ss_hor;
-            const int mvy = (mat[4] * src_x + mat[5] * src_y + mat[1]) >> ss_ver;
-
-            const int dx = (mvx >> 16) - 4;
-            const int mx = ((mvx & 0xffff) - wmp->alpha * 4 -
-                                             wmp->beta  * 7) & ~0x3f;
-            const int dy = (mvy >> 16) - 4;
-            const int my = ((mvy & 0xffff) - wmp->gamma * 4 -
-                                             wmp->delta * 4) & ~0x3f;
-
-            const pixel *ref_ptr;
-            ptrdiff_t ref_stride = refp->p.stride[!!pl];
-
-            dav1d_thread_picture_wait(refp, dy + 4 + 8,
-                                      PLANE_TYPE_Y + !!pl);
-            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
-                emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl],
-                         ref_stride, 15, 15, width, height, dx - 3, dy - 3);
-                ref_ptr = &t->emu_edge[160 * 3 + 3];
-                ref_stride = 160 * sizeof(pixel);
-            } else {
-                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
-            }
-            if (dst16 != NULL)
-                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
-                                 wmp->abcd, mx, my);
-            else
-                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
-                                wmp->abcd, mx, my);
-        }
-        if (dst8) dst8  += 8 * PXSTRIDE(dstride);
-        else      dst16 += 8 * dstride;
-    }
-}
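-
-/* Informal note: warp_affine() walks the block in 8x8 tiles; for each tile it
- * evaluates the affine model at the tile centre, splits the result into an
- * integer source position (dx, dy) and a fractional part (mx, my) snapped to
- * multiples of 64, pads through emu_edge() when the 15x15 source footprint
- * leaves the frame, and hands the per-pixel alpha/beta/gamma/delta deltas to
- * dsp->mc.warp8x8{,t} via wmp->abcd. */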
-
-void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize bs,
-                                 const enum EdgeFlags intra_edge_flags,
-                                 const Av1Block *const b)
-{
-    Dav1dTileState *const ts = t->ts;
-    const Dav1dFrameContext *const f = t->f;
-    const Dav1dDSPContext *const dsp = f->dsp;
-    const int bx4 = t->bx & 31, by4 = t->by & 31;
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
-    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
-    const int bw4 = b_dim[0], bh4 = b_dim[1];
-    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
-    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
-    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
-                           (bw4 > ss_hor || t->bx & 1) &&
-                           (bh4 > ss_ver || t->by & 1);
-    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
-    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
-
-    // coefficient coding
-    ALIGN_STK_32(pixel, edge_buf, 257,);
-    pixel *const edge = edge_buf + 128;
-    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
-
-    for (int init_y = 0; init_y < h4; init_y += 16) {
-        for (int init_x = 0; init_x < w4; init_x += 16) {
-            if (b->pal_sz[0]) {
-                pixel *dst = ((pixel *) f->cur.p.data[0]) +
-                             4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
-                const uint8_t *pal_idx;
-                if (f->frame_thread.pass) {
-                    pal_idx = ts->frame_thread.pal_idx;
-                    ts->frame_thread.pal_idx += bw4 * bh4 * 16;
-                } else {
-                    pal_idx = t->scratch.pal_idx;
-                }
-                const uint16_t *const pal = f->frame_thread.pass ?
-                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
-                                        ((t->bx >> 1) + (t->by & 1))][0] : t->pal[0];
-                f->dsp->ipred.pal_pred(dst, f->cur.p.stride[0], pal,
-                                       pal_idx, bw4 * 4, bh4 * 4);
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                    hex_dump(dst, PXSTRIDE(f->cur.p.stride[0]),
-                             bw4 * 4, bh4 * 4, "y-pal-pred");
-            }
-
-            const int sm_fl = sm_flag(t->a, bx4) | sm_flag(&t->l, by4);
-            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
-                              intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
-            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
-                              intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
-            int y, x;
-            const int sub_h4 = imin(h4, 16 + init_y);
-            const int sub_w4 = imin(w4, init_x + 16);
-            for (y = init_y, t->by += init_y; y < sub_h4;
-                 y += t_dim->h, t->by += t_dim->h)
-            {
-                pixel *dst = ((pixel *) f->cur.p.data[0]) +
-                               4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) +
-                                    t->bx + init_x);
-                for (x = init_x, t->bx += init_x; x < sub_w4;
-                     x += t_dim->w, t->bx += t_dim->w)
-                {
-                    if (b->pal_sz[0]) goto skip_y_pred;
-
-                    int angle = b->y_angle;
-                    const enum EdgeFlags edge_flags =
-                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
-                             0 : EDGE_I444_TOP_HAS_RIGHT) |
-                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
-                             0 : EDGE_I444_LEFT_HAS_BOTTOM);
-                    const pixel *top_sb_edge = NULL;
-                    if (!(t->by & (f->sb_step - 1))) {
-                        top_sb_edge = f->ipred_edge[0];
-                        const int sby = t->by >> f->sb_shift;
-                        top_sb_edge += f->sb128w * 128 * (sby - 1);
-                    }
-                    const enum IntraPredMode m =
-                        bytefn(dav1d_prepare_intra_edges)(t->bx,
-                                                          t->bx > ts->tiling.col_start,
-                                                          t->by,
-                                                          t->by > ts->tiling.row_start,
-                                                          ts->tiling.col_end,
-                                                          ts->tiling.row_end,
-                                                          edge_flags, dst,
-                                                          f->cur.p.stride[0], top_sb_edge,
-                                                          b->y_mode, &angle,
-                                                          t_dim->w, t_dim->h, edge);
-                    dsp->ipred.intra_pred[m](dst, f->cur.p.stride[0], edge,
-                                             t_dim->w * 4, t_dim->h * 4,
-                                             angle | sm_fl);
-
-                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
-                                 t_dim->h * 4, 2, "l");
-                        hex_dump(edge, 0, 1, 1, "tl");
-                        hex_dump(edge + 1, t_dim->w * 4,
-                                 t_dim->w * 4, 2, "t");
-                        hex_dump(dst, f->cur.p.stride[0],
-                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
-                    }
-
-                skip_y_pred: {}
-                    if (!b->skip) {
-                        coef *cf;
-                        int eob;
-                        enum TxfmType txtp;
-                        if (f->frame_thread.pass) {
-                            cf = ts->frame_thread.cf;
-                            ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
-                            const struct CodedBlockInfo *const cbi =
-                                &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
-                            eob = cbi->eob[0];
-                            txtp = cbi->txtp[0];
-                        } else {
-                            uint8_t cf_ctx;
-                            cf = t->cf;
-                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
-                                               &t->l.lcoef[by4 + y], b->tx, bs,
-                                               b, 1, 0, cf, &txtp, &cf_ctx);
-                            if (DEBUG_BLOCK_INFO)
-                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
-                                       b->tx, txtp, eob, ts->msac.rng);
-                            memset(&t->a->lcoef[bx4 + x], cf_ctx,
-                                   imin(t_dim->w, f->bw - t->bx));
-                            memset(&t->l.lcoef[by4 + y], cf_ctx,
-                                   imin(t_dim->h, f->bh - t->by));
-                        }
-                        if (eob >= 0) {
-                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                coef_dump(cf, imin(t_dim->h, 8) * 4,
-                                          imin(t_dim->w, 8) * 4, 3, "dq");
-                            dsp->itx.itxfm_add[b->tx]
-                                              [txtp](dst,
-                                                     f->cur.p.stride[0],
-                                                     cf, eob);
-                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                hex_dump(dst, f->cur.p.stride[0],
-                                         t_dim->w * 4, t_dim->h * 4, "recon");
-                        }
-                    } else if (!f->frame_thread.pass) {
-                        memset(&t->a->lcoef[bx4 + x], 0x40, t_dim->w);
-                        memset(&t->l.lcoef[by4 + y], 0x40, t_dim->h);
-                    }
-                    dst += 4 * t_dim->w;
-                }
-                t->bx -= x;
-            }
-            t->by -= y;
-
-            if (!has_chroma) continue;
-
-            const ptrdiff_t stride = f->cur.p.stride[1];
-
-            if (b->uv_mode == CFL_PRED) {
-                assert(!init_x && !init_y);
-
-                int16_t *const ac = t->scratch.ac;
-                pixel *y_src = ((pixel *) f->cur.p.data[0]) + 4 * (t->bx & ~ss_hor) +
-                                 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.p.stride[0]);
-                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
-                                              (t->by >> ss_ver) * PXSTRIDE(stride));
-                pixel *const uv_dst[2] = { ((pixel *) f->cur.p.data[1]) + uv_off,
-                                           ((pixel *) f->cur.p.data[2]) + uv_off };
-
-                const int furthest_r =
-                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
-                const int furthest_b =
-                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
-                dsp->ipred.cfl_ac[f->cur.p.p.layout - 1]
-                                 [b->uvtx](ac, y_src, f->cur.p.stride[0],
-                                           cbw4 - (furthest_r >> ss_hor),
-                                           cbh4 - (furthest_b >> ss_ver));
-                for (int pl = 0; pl < 2; pl++) {
-                    if (!b->cfl_alpha[pl]) continue;
-                    int angle = 0;
-                    const pixel *top_sb_edge = NULL;
-                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
-                        top_sb_edge = f->ipred_edge[pl + 1];
-                        const int sby = t->by >> f->sb_shift;
-                        top_sb_edge += f->sb128w * 128 * (sby - 1);
-                    }
-                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
-                    const int xstart = ts->tiling.col_start >> ss_hor;
-                    const int ystart = ts->tiling.row_start >> ss_ver;
-                    const enum IntraPredMode m =
-                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
-                                                          ypos, ypos > ystart,
-                                                          ts->tiling.col_end >> ss_hor,
-                                                          ts->tiling.row_end >> ss_ver,
-                                                          0, uv_dst[pl], stride,
-                                                          top_sb_edge, DC_PRED, &angle,
-                                                          uv_t_dim->w,
-                                                          uv_t_dim->h, edge);
-                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
-                                           uv_t_dim->w * 4,
-                                           uv_t_dim->h * 4,
-                                           ac, b->cfl_alpha[pl]);
-                }
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
-                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
-                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
-                }
-            } else if (b->pal_sz[1]) {
-                ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
-                                           (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
-                const uint8_t *pal_idx;
-                if (f->frame_thread.pass) {
-                    pal_idx = ts->frame_thread.pal_idx;
-                    ts->frame_thread.pal_idx += cbw4 * cbh4 * 16;
-                } else {
-                    pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
-                }
-                const uint16_t *const pal_u = f->frame_thread.pass ?
-                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
-                                        ((t->bx >> 1) + (t->by & 1))][1] : t->pal[1];
-                f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[1]) + uv_dstoff,
-                                       f->cur.p.stride[1], pal_u,
-                                       pal_idx, cbw4 * 4, cbh4 * 4);
-                const uint16_t *const pal_v = f->frame_thread.pass ?
-                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
-                                        ((t->bx >> 1) + (t->by & 1))][2] : t->pal[2];
-                f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[2]) + uv_dstoff,
-                                       f->cur.p.stride[1], pal_v,
-                                       pal_idx, cbw4 * 4, cbh4 * 4);
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-                    hex_dump(((pixel *) f->cur.p.data[1]) + uv_dstoff,
-                             PXSTRIDE(f->cur.p.stride[1]),
-                             cbw4 * 4, cbh4 * 4, "u-pal-pred");
-                    hex_dump(((pixel *) f->cur.p.data[2]) + uv_dstoff,
-                             PXSTRIDE(f->cur.p.stride[1]),
-                             cbw4 * 4, cbh4 * 4, "v-pal-pred");
-                }
-            }
-
-            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
-                                 sm_uv_flag(&t->l, cby4);
-            const int uv_sb_has_tr =
-                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
-                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.p.layout - 1));
-            const int uv_sb_has_bl =
-                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
-                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.p.layout - 1));
-            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
-            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
-            for (int pl = 0; pl < 2; pl++) {
-                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
-                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
-                {
-                    pixel *dst = ((pixel *) f->cur.p.data[1 + pl]) +
-                                   4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
-                                        ((t->bx + init_x) >> ss_hor));
-                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
-                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
-                    {
-                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
-                            b->pal_sz[1])
-                        {
-                            goto skip_uv_pred;
-                        }
-
-                        int angle = b->uv_angle;
-                        // this probably looks weird because we're using
-                        // luma flags in a chroma loop, but that's because
-                        // prepare_intra_edges() expects luma flags as input
-                        const enum EdgeFlags edge_flags =
-                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
-                              (x + uv_t_dim->w >= sub_cw4)) ?
-                                 0 : EDGE_I444_TOP_HAS_RIGHT) |
-                            ((x > (init_x >> ss_hor) ||
-                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
-                                 0 : EDGE_I444_LEFT_HAS_BOTTOM);
-                        const pixel *top_sb_edge = NULL;
-                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
-                            top_sb_edge = f->ipred_edge[1 + pl];
-                            const int sby = t->by >> f->sb_shift;
-                            top_sb_edge += f->sb128w * 128 * (sby - 1);
-                        }
-                        const enum IntraPredMode uv_mode =
-                             b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
-                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
-                        const int xstart = ts->tiling.col_start >> ss_hor;
-                        const int ystart = ts->tiling.row_start >> ss_ver;
-                        const enum IntraPredMode m =
-                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
-                                                              ypos, ypos > ystart,
-                                                              ts->tiling.col_end >> ss_hor,
-                                                              ts->tiling.row_end >> ss_ver,
-                                                              edge_flags, dst, stride,
-                                                              top_sb_edge, uv_mode,
-                                                              &angle, uv_t_dim->w,
-                                                              uv_t_dim->h, edge);
-                        dsp->ipred.intra_pred[m](dst, stride, edge,
-                                                 uv_t_dim->w * 4,
-                                                 uv_t_dim->h * 4,
-                                                 angle | sm_uv_fl);
-                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
-                                     uv_t_dim->h * 4, 2, "l");
-                            hex_dump(edge, 0, 1, 1, "tl");
-                            hex_dump(edge + 1, uv_t_dim->w * 4,
-                                     uv_t_dim->w * 4, 2, "t");
-                            hex_dump(dst, stride, uv_t_dim->w * 4,
-                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
-                        }
-
-                    skip_uv_pred: {}
-                        if (!b->skip) {
-                            enum TxfmType txtp;
-                            int eob;
-                            coef *cf;
-                            if (f->frame_thread.pass) {
-                                cf = ts->frame_thread.cf;
-                                ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
-                                const struct CodedBlockInfo *const cbi =
-                                    &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
-                                eob = cbi->eob[pl + 1];
-                                txtp = cbi->txtp[pl + 1];
-                            } else {
-                                uint8_t cf_ctx;
-                                cf = t->cf;
-                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
-                                                   &t->l.ccoef[pl][cby4 + y],
-                                                   b->uvtx, bs, b, 1, 1 + pl, cf,
-                                                   &txtp, &cf_ctx);
-                                if (DEBUG_BLOCK_INFO)
-                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"
-                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
-                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
-                                memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
-                                       imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                                memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
-                                       imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
-                            }
-                            if (eob >= 0) {
-                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                    coef_dump(cf, uv_t_dim->h * 4,
-                                              uv_t_dim->w * 4, 3, "dq");
-                                dsp->itx.itxfm_add[b->uvtx]
-                                                  [txtp](dst, stride,
-                                                         cf, eob);
-                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                    hex_dump(dst, stride, uv_t_dim->w * 4,
-                                             uv_t_dim->h * 4, "recon");
-                            }
-                        } else if (!f->frame_thread.pass) {
-                            memset(&t->a->ccoef[pl][cbx4 + x], 0x40, uv_t_dim->w);
-                            memset(&t->l.ccoef[pl][cby4 + y], 0x40, uv_t_dim->h);
-                        }
-                        dst += uv_t_dim->w * 4;
-                    }
-                    t->bx -= x << ss_hor;
-                }
-                t->by -= y << ss_ver;
-            }
-        }
-    }
-}
-
-void bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs,
-                                 const Av1Block *const b)
-{
-    Dav1dTileState *const ts = t->ts;
-    const Dav1dFrameContext *const f = t->f;
-    const Dav1dDSPContext *const dsp = f->dsp;
-    const int bx4 = t->bx & 31, by4 = t->by & 31;
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
-    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
-    const int bw4 = b_dim[0], bh4 = b_dim[1];
-    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
-    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
-                           (bw4 > ss_hor || t->bx & 1) &&
-                           (bh4 > ss_ver || t->by & 1);
-    const int chr_layout_idx = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
-                               DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.p.layout;
-
-    // prediction
-    const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
-    pixel *dst = ((pixel *) f->cur.p.data[0]) +
-        4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
-    const ptrdiff_t uvdstoff =
-        4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
-    if (!(f->frame_hdr.frame_type & 1)) {
-        // intrabc
-        mc(t, dst, NULL, f->cur.p.stride[0],
-           bw4, bh4, t->bx, t->by, 0, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
-        if (has_chroma) for (int pl = 1; pl < 3; pl++)
-            mc(t, ((pixel *) f->cur.p.data[pl]) + uvdstoff, NULL, f->cur.p.stride[1],
-               bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
-               t->bx & ~ss_hor, t->by & ~ss_ver,
-               pl, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
-    } else if (b->comp_type == COMP_INTER_NONE) {
-        const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
-        const enum Filter2d filter_2d = b->filter2d;
-
-        if (imin(bw4, bh4) > 1 && !f->frame_hdr.force_integer_mv &&
-            ((b->inter_mode == GLOBALMV &&
-              f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
-             (b->motion_mode == MM_WARP &&
-              t->warpmv.type > WM_TYPE_TRANSLATION)))
-        {
-            warp_affine(t, dst, NULL, f->cur.p.stride[0], b_dim, 0, refp,
-                        b->motion_mode == MM_WARP ? &t->warpmv :
-                            &f->frame_hdr.gmv[b->ref[0]]);
-        } else {
-            mc(t, dst, NULL, f->cur.p.stride[0],
-               bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, filter_2d);
-            if (b->motion_mode == MM_OBMC)
-                obmc(t, dst, f->cur.p.stride[0], b_dim, 0, bx4, by4, w4, h4);
-        }
-        if (b->interintra_type) {
-            ALIGN_STK_32(pixel, tl_edge_buf, 65,);
-            pixel *const tl_edge = tl_edge_buf + 32;
-            enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
-                                   SMOOTH_PRED : b->interintra_mode;
-            pixel *const tmp = t->scratch.interintra;
-            int angle = 0;
-            const pixel *top_sb_edge = NULL;
-            if (!(t->by & (f->sb_step - 1))) {
-                top_sb_edge = f->ipred_edge[0];
-                const int sby = t->by >> f->sb_shift;
-                top_sb_edge += f->sb128w * 128 * (sby - 1);
-            }
-            m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
-                                                  t->by, t->by > ts->tiling.row_start,
-                                                  ts->tiling.col_end, ts->tiling.row_end,
-                                                  0, dst, f->cur.p.stride[0], top_sb_edge,
-                                                  m, &angle, bw4, bh4, tl_edge);
-            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
-                                     tl_edge, bw4 * 4, bh4 * 4, 0);
-            const uint8_t *const ii_mask =
-                b->interintra_type == INTER_INTRA_BLEND ?
-                     dav1d_ii_masks[bs][0][b->interintra_mode] :
-                     dav1d_wedge_masks[bs][0][0][b->wedge_idx];
-            dsp->mc.blend(dst, f->cur.p.stride[0], tmp, bw4 * 4 * sizeof(pixel),
-                          bw4 * 4, bh4 * 4, ii_mask, bw4 * 4);
-        }
-
-        if (!has_chroma) goto skip_inter_chroma_pred;
-
-        // sub8x8 derivation
-        int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
-        refmvs *r;
-        if (is_sub8x8) {
-            assert(ss_hor == 1);
-            r = &f->mvs[t->by * f->b4_stride + t->bx];
-            if (bw4 == 1) is_sub8x8 &= r[-1].ref[0] > 0;
-            if (bh4 == ss_ver) is_sub8x8 &= r[-f->b4_stride].ref[0] > 0;
-            if (bw4 == 1 && bh4 == ss_ver)
-                is_sub8x8 &= r[-(1 + f->b4_stride)].ref[0] > 0;
-        }
-
-        // chroma prediction
-        if (is_sub8x8) {
-            assert(ss_hor == 1);
-            int h_off = 0, v_off = 0;
-            if (bw4 == 1 && bh4 == ss_ver) {
-                for (int pl = 0; pl < 2; pl++)
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
-                       NULL, f->cur.p.stride[1],
-                       bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
-                       r[-(f->b4_stride + 1)].mv[0],
-                       &f->refp[r[-(f->b4_stride + 1)].ref[0] - 1],
-                       f->frame_thread.pass != 2 ? t->tl_4x4_filter :
-                           f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
-                v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
-                h_off = 2;
-            }
-            if (bw4 == 1) {
-                const enum Filter2d left_filter_2d =
-                    dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
-                for (int pl = 0; pl < 2; pl++)
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + v_off, NULL,
-                       f->cur.p.stride[1], bw4, bh4, t->bx - 1,
-                       t->by, 1 + pl, r[-1].mv[0], &f->refp[r[-1].ref[0] - 1],
-                       f->frame_thread.pass != 2 ? left_filter_2d :
-                           f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
-                h_off = 2;
-            }
-            if (bh4 == ss_ver) {
-                const enum Filter2d top_filter_2d =
-                    dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
-                for (int pl = 0; pl < 2; pl++)
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off, NULL,
-                       f->cur.p.stride[1], bw4, bh4, t->bx, t->by - 1,
-                       1 + pl, r[-f->b4_stride].mv[0],
-                       &f->refp[r[-f->b4_stride].ref[0] - 1],
-                       f->frame_thread.pass != 2 ? top_filter_2d :
-                           f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
-                v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
-            }
-            for (int pl = 0; pl < 2; pl++)
-                mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.p.stride[1],
-                   bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0], refp, filter_2d);
-        } else {
-            if (imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
-                ((b->inter_mode == GLOBALMV &&
-                  f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
-                 (b->motion_mode == MM_WARP &&
-                  t->warpmv.type > WM_TYPE_TRANSLATION)))
-            {
-                for (int pl = 0; pl < 2; pl++)
-                    warp_affine(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, NULL,
-                                f->cur.p.stride[1], b_dim, 1 + pl, refp,
-                                b->motion_mode == MM_WARP ? &t->warpmv :
-                                    &f->frame_hdr.gmv[b->ref[0]]);
-            } else {
-                for (int pl = 0; pl < 2; pl++) {
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
-                       NULL, f->cur.p.stride[1],
-                       bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
-                       t->bx & ~ss_hor, t->by & ~ss_ver,
-                       1 + pl, b->mv[0], refp, filter_2d);
-                    if (b->motion_mode == MM_OBMC)
-                        obmc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
-                             f->cur.p.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
-                }
-            }
-            if (b->interintra_type) {
-                // FIXME for 8x32 with 4:2:2 subsampling, this probably does
-                // the wrong thing since it will select 4x16, not 4x32, as a
-                // transform size...
-                const uint8_t *const ii_mask =
-                    b->interintra_type == INTER_INTRA_BLEND ?
-                         dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] :
-                         dav1d_wedge_masks[bs][chr_layout_idx][0][b->wedge_idx];
-
-                for (int pl = 0; pl < 2; pl++) {
-                    pixel *const tmp = t->scratch.interintra;
-                    pixel tl_edge_px[65], *const tl_edge = &tl_edge_px[32];
-                    enum IntraPredMode m =
-                        b->interintra_mode == II_SMOOTH_PRED ?
-                        SMOOTH_PRED : b->interintra_mode;
-                    int angle = 0;
-                    pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
-                    const pixel *top_sb_edge = NULL;
-                    if (!(t->by & (f->sb_step - 1))) {
-                        top_sb_edge = f->ipred_edge[pl + 1];
-                        const int sby = t->by >> f->sb_shift;
-                        top_sb_edge += f->sb128w * 128 * (sby - 1);
-                    }
-                    m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
-                                                          (t->bx >> ss_hor) >
-                                                              (ts->tiling.col_start >> ss_hor),
-                                                          t->by >> ss_ver,
-                                                          (t->by >> ss_ver) >
-                                                              (ts->tiling.row_start >> ss_ver),
-                                                          ts->tiling.col_end >> ss_hor,
-                                                          ts->tiling.row_end >> ss_ver,
-                                                          0, uvdst, f->cur.p.stride[1],
-                                                          top_sb_edge, m,
-                                                          &angle, cbw4, cbh4, tl_edge);
-                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
-                                             tl_edge, cbw4 * 4, cbh4 * 4, 0);
-                    dsp->mc.blend(uvdst, f->cur.p.stride[1], tmp, cbw4 * 4 * sizeof(pixel),
-                                  cbw4 * 4, cbh4 * 4, ii_mask, cbw4 * 4);
-                }
-            }
-        }
-
-    skip_inter_chroma_pred: {}
-        t->tl_4x4_filter = filter_2d;
-    } else {
-        const enum Filter2d filter_2d = b->filter2d;
-        // Maximum super block size is 128x128
-        coef (*tmp)[128 * 128] = (coef (*)[128 * 128]) t->scratch.compinter;
-        int jnt_weight;
-        uint8_t *const seg_mask = t->scratch_seg_mask;
-        const uint8_t *mask;
-
-        for (int i = 0; i < 2; i++) {
-            const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
-
-            if (b->inter_mode == GLOBALMV_GLOBALMV && !f->frame_hdr.force_integer_mv &&
-                f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
-            {
-                warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
-                            &f->frame_hdr.gmv[b->ref[i]]);
-            } else {
-                mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
-                   b->mv[i], refp, filter_2d);
-            }
-        }
-        switch (b->comp_type) {
-        case COMP_INTER_AVG:
-            dsp->mc.avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
-                        bw4 * 4, bh4 * 4);
-            break;
-        case COMP_INTER_WEIGHTED_AVG:
-            jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
-            dsp->mc.w_avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
-                          bw4 * 4, bh4 * 4, jnt_weight);
-            break;
-        case COMP_INTER_SEG:
-            dsp->mc.w_mask[chr_layout_idx](dst, f->cur.p.stride[0],
-                                           tmp[b->mask_sign], tmp[!b->mask_sign],
-                                           bw4 * 4, bh4 * 4, seg_mask, b->mask_sign);
-            mask = seg_mask;
-            break;
-        case COMP_INTER_WEDGE:
-            mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx];
-            dsp->mc.mask(dst, f->cur.p.stride[0],
-                         tmp[b->mask_sign], tmp[!b->mask_sign],
-                         bw4 * 4, bh4 * 4, mask);
-            if (has_chroma)
-                mask = dav1d_wedge_masks[bs][chr_layout_idx][b->mask_sign][b->wedge_idx];
-            break;
-        }
-
-        // chroma
-        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
-            for (int i = 0; i < 2; i++) {
-                const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
-                if (b->inter_mode == GLOBALMV_GLOBALMV &&
-                    imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
-                    f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
-                {
-                    warp_affine(t, NULL, tmp[i], bw4 * 2, b_dim, 1 + pl,
-                                refp, &f->frame_hdr.gmv[b->ref[i]]);
-                } else {
-                    mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
-                       1 + pl, b->mv[i], refp, filter_2d);
-                }
-            }
-            pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
-            switch (b->comp_type) {
-            case COMP_INTER_AVG:
-                dsp->mc.avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
-                            bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver);
-                break;
-            case COMP_INTER_WEIGHTED_AVG:
-                dsp->mc.w_avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
-                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight);
-                break;
-            case COMP_INTER_WEDGE:
-            case COMP_INTER_SEG:
-                dsp->mc.mask(uvdst, f->cur.p.stride[1],
-                             tmp[b->mask_sign], tmp[!b->mask_sign],
-                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask);
-                break;
-            }
-        }
-    }
-
-    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-        hex_dump(dst, f->cur.p.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
-        if (has_chroma) {
-            hex_dump(&((pixel *) f->cur.p.data[1])[uvdstoff], f->cur.p.stride[1],
-                     cbw4 * 4, cbh4 * 4, "u-pred");
-            hex_dump(&((pixel *) f->cur.p.data[2])[uvdstoff], f->cur.p.stride[1],
-                     cbw4 * 4, cbh4 * 4, "v-pred");
-        }
-    }
-
-    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
-
-    if (b->skip) {
-        // reset coef contexts
-        memset(&t->a->lcoef[bx4], 0x40, w4);
-        memset(&t->l.lcoef[by4], 0x40, h4);
-        if (has_chroma) {
-            memset(&t->a->ccoef[0][cbx4], 0x40, cw4);
-            memset(&t->l.ccoef[0][cby4], 0x40, ch4);
-            memset(&t->a->ccoef[1][cbx4], 0x40, cw4);
-            memset(&t->l.ccoef[1][cby4], 0x40, ch4);
-        }
-        return;
-    }
-
-    const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
-    const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
-
-    for (int init_y = 0; init_y < bh4; init_y += 16) {
-        for (int init_x = 0; init_x < bw4; init_x += 16) {
-            // coefficient coding & inverse transforms
-            int y_off = !!init_y, y;
-            dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * init_y;
-            for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
-                 y += ytx->h, y_off++)
-            {
-                int x, x_off = !!init_x;
-                for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
-                     x += ytx->w, x_off++)
-                {
-                    read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
-                                   x_off, y_off, &dst[x * 4]);
-                    t->bx += ytx->w;
-                }
-                dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * ytx->h;
-                t->bx -= x;
-                t->by += ytx->h;
-            }
-            dst -= PXSTRIDE(f->cur.p.stride[0]) * 4 * y;
-            t->by -= y;
-
-            // chroma coefs and inverse transform
-            if (has_chroma) for (int pl = 0; pl < 2; pl++) {
-                pixel *uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff +
-                    (PXSTRIDE(f->cur.p.stride[1]) * init_y * 4 >> ss_ver);
-                for (y = init_y >> ss_ver, t->by += init_y;
-                     y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
-                {
-                    int x;
-                    for (x = init_x >> ss_hor, t->bx += init_x;
-                         x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
-                    {
-                        coef *cf;
-                        int eob;
-                        enum TxfmType txtp;
-                        if (f->frame_thread.pass) {
-                            cf = ts->frame_thread.cf;
-                            ts->frame_thread.cf += uvtx->w * uvtx->h * 16;
-                            const struct CodedBlockInfo *const cbi =
-                                &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
-                            eob = cbi->eob[1 + pl];
-                            txtp = cbi->txtp[1 + pl];
-                        } else {
-                            uint8_t cf_ctx;
-                            cf = t->cf;
-                            txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
-                                                bx4 + (x << ss_hor)];
-                            eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
-                                               &t->l.ccoef[pl][cby4 + y],
-                                               b->uvtx, bs, b, 0, 1 + pl,
-                                               cf, &txtp, &cf_ctx);
-                            if (DEBUG_BLOCK_INFO)
-                                printf("Post-uv-cf-blk[pl=%d,tx=%d,"
-                                       "txtp=%d,eob=%d]: r=%d\n",
-                                       pl, b->uvtx, txtp, eob, ts->msac.rng);
-                            memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
-                                   imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                            memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
-                                   imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver));
-                        }
-                        if (eob >= 0) {
-                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
-                            dsp->itx.itxfm_add[b->uvtx]
-                                              [txtp](&uvdst[4 * x],
-                                                     f->cur.p.stride[1],
-                                                     cf, eob);
-                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                hex_dump(&uvdst[4 * x], f->cur.p.stride[1],
-                                         uvtx->w * 4, uvtx->h * 4, "recon");
-                        }
-                        t->bx += uvtx->w << ss_hor;
-                    }
-                    uvdst += PXSTRIDE(f->cur.p.stride[1]) * 4 * uvtx->h;
-                    t->bx -= x << ss_hor;
-                    t->by += uvtx->h << ss_ver;
-                }
-                t->by -= y << ss_ver;
-            }
-        }
-    }
-}
-
-void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int sbsz = f->sb_step, sbh = f->sbh;
-
-    if (f->frame_hdr.loopfilter.level_y[0] ||
-        f->frame_hdr.loopfilter.level_y[1])
-    {
-        int start_of_tile_row = 0;
-        if (f->frame_hdr.tiling.row_start_sb[f->lf.tile_row] == sby)
-            start_of_tile_row = f->lf.tile_row++;
-        bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby,
-                                       start_of_tile_row);
-    }
-
-    if (f->seq_hdr.restoration) {
-        // Store loop filtered pixels required by loop restoration
-        bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
-    }
-    if (f->seq_hdr.cdef) {
-        if (sby) {
-            pixel *p_up[3] = {
-                f->lf.p[0] - 8 * PXSTRIDE(f->cur.p.stride[0]),
-                f->lf.p[1] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver),
-                f->lf.p[2] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver),
-            };
-            bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr,
-                                    sby * sbsz - 2, sby * sbsz);
-        }
-        const int n_blks = sbsz - 2 * (sby + 1 < sbh);
-        bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
-                                imin(sby * sbsz + n_blks, f->bh));
-    }
-    if (f->seq_hdr.restoration) {
-        bytefn(dav1d_lr_sbrow)(f, f->lf.p, sby);
-    }
-
-    f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[0]);
-    f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
-    f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
-    f->lf.prev_mask_ptr = f->lf.mask_ptr;
-    if ((sby & 1) || f->seq_hdr.sb128) {
-        f->lf.mask_ptr += f->sb128w;
-    }
-}
-
-void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) {
-    const Dav1dFrameContext *const f = t->f;
-    Dav1dTileState *const ts = t->ts;
-    const int sby = t->by >> f->sb_shift;
-    const int sby_off = f->sb128w * 128 * sby;
-    const int x_off = ts->tiling.col_start;
-
-    const pixel *const y =
-        ((const pixel *) f->cur.p.data[0]) + x_off * 4 +
-                    ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.p.stride[0]);
-    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
-               4 * (ts->tiling.col_end - x_off));
-
-    if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
-        const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-        const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-
-        const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
-            (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.p.stride[1]);
-        for (int pl = 1; pl <= 2; pl++)
-            pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
-                       &((const pixel *) f->cur.p.data[pl])[uv_off],
-                       4 * (ts->tiling.col_end - x_off) >> ss_hor);
-    }
-}
--- /dev/null
+++ b/src/recon_tmpl.c
@@ -1,0 +1,1518 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+#include <stdio.h>
+
+#include "common/attributes.h"
+#include "common/bitdepth.h"
+#include "common/dump.h"
+#include "common/intops.h"
+#include "common/mem.h"
+
+#include "src/cdef_apply.h"
+#include "src/ipred_prepare.h"
+#include "src/lf_apply.h"
+#include "src/lr_apply.h"
+#include "src/recon.h"
+#include "src/scan.h"
+#include "src/tables.h"
+#include "src/wedge.h"
+
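+// Read an unsigned Exp-Golomb-coded value from equiprobable bits of the
+// range coder; used by decode_coefs() below for coefficient magnitudes
+// above the largest explicitly signalled token.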
+static unsigned read_golomb(MsacContext *const msac) {
+    int len = 0;
+    unsigned val = 1;
+
+    while (!msac_decode_bool(msac, 128 << 7) && len < 32) len++;
+    while (len--) val = (val << 1) | msac_decode_bool(msac, 128 << 7);
+
+    return val - 1;
+}
+
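+// Decode all coefficients of one transform block: skip flag, transform
+// type, end-of-block position, token magnitudes, signs and dequantization.
+// Returns the end-of-block index, or -1 if the block is entirely zero;
+// *txtp receives the transform type and *res_ctx the coefficient context
+// that the callers propagate to the neighbouring blocks.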
+static int decode_coefs(Dav1dTileContext *const t,
+                        uint8_t *const a, uint8_t *const l,
+                        const enum RectTxfmSize tx, const enum BlockSize bs,
+                        const Av1Block *const b, const int intra,
+                        const int plane, coef *cf,
+                        enum TxfmType *const txtp, uint8_t *res_ctx)
+{
+    Dav1dTileState *const ts = t->ts;
+    const int chroma = !!plane;
+    const Dav1dFrameContext *const f = t->f;
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
+    const int dbg = DEBUG_BLOCK_INFO && plane && 0;
+
+    if (dbg) printf("Start: r=%d\n", ts->msac.rng);
+
+    // does this block have any non-zero coefficients?
+    const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.p.layout);
+    const int all_skip =
+        msac_decode_bool_adapt(&ts->msac, ts->cdf.coef.skip[t_dim->ctx][sctx]);
+    if (dbg)
+    printf("Post-non-zero[%d][%d][%d]: r=%d\n",
+           t_dim->ctx, sctx, all_skip, ts->msac.rng);
+    if (all_skip) {
+        *res_ctx = 0x40;
+        *txtp = f->frame_hdr.segmentation.lossless[b->seg_id] ? WHT_WHT :
+                                                                DCT_DCT;
+        return -1;
+    }
+
+    // transform type (chroma: derived, luma: explicitly coded)
+    if (chroma) {
+        if (intra) {
+            *txtp = get_uv_intra_txtp(b->uv_mode, tx, &f->frame_hdr, b->seg_id);
+        } else {
+            const enum TxfmType y_txtp = *txtp;
+            *txtp = get_uv_inter_txtp(t_dim, y_txtp, &f->frame_hdr, b->seg_id);
+        }
+    } else {
+        const enum TxfmTypeSet set = get_ext_txtp_set(tx, !intra,
+                                                      &f->frame_hdr, b->seg_id);
+        const unsigned set_cnt = dav1d_tx_type_count[set];
+        unsigned idx;
+        if (set_cnt == 1) {
+            idx = 0;
+        } else {
+            const int set_idx = dav1d_tx_type_set_index[!intra][set];
+            const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
+                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
+            uint16_t *const txtp_cdf = intra ?
+                       ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
+                       ts->cdf.m.txtp_inter[set_idx][t_dim->min];
+            idx = msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt);
+            if (dbg)
+            printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
+                   set, set_idx, tx, t_dim->min, b->intra ? (int)y_mode_nofilt : -1,
+                   idx, dav1d_tx_types_per_set[set][idx], ts->msac.rng);
+        }
+        *txtp = dav1d_tx_types_per_set[set][idx];
+    }
+
+    // find end-of-block (eob)
+    int eob_bin;
+    const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32);
+    const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
+    const int is_1d = tx_class != TX_CLASS_2D;
+    switch (tx2dszctx) {
+#define case_sz(sz, bin) \
+    case sz: { \
+        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
+        eob_bin = msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \
+        break; \
+    }
+    case_sz(0,   16);
+    case_sz(1,   32);
+    case_sz(2,   64);
+    case_sz(3,  128);
+    case_sz(4,  256);
+    case_sz(5,  512);
+    case_sz(6, 1024);
+#undef case_sz
+    }
+    if (dbg)
+    printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
+           16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
+    int eob;
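+    // eob_bin selects a power-of-two bucket for the end-of-block position;
+    // for buckets above 1, the exact offset inside the bucket is refined
+    // below, first with one adaptive bit, then with raw bits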
+    if (eob_bin > 1) {
+        eob = 1 << (eob_bin - 1);
+        uint16_t *const eob_hi_bit_cdf =
+            ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
+        const int eob_hi_bit = msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
+        if (dbg)
+        printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
+               t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
+        unsigned mask = eob >> 1;
+        if (eob_hi_bit) eob |= mask;
+        for (mask >>= 1; mask; mask >>= 1) {
+            const int eob_bit = msac_decode_bool(&ts->msac, 128 << 7);
+            if (eob_bit) eob |= mask;
+        }
+        if (dbg)
+        printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
+    } else {
+        eob = eob_bin;
+    }
+
+    // base tokens
+    uint16_t (*const br_cdf)[5] =
+        ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
+    const int16_t *const scan = dav1d_scans[tx][tx_class];
+    uint8_t levels[36 * 36];
+    ptrdiff_t stride = 4 * (imin(t_dim->h, 8) + 1);
+    memset(levels, 0, stride * 4 * (imin(t_dim->w, 8) + 1));
+    const int shift = 2 + imin(t_dim->lh, 3), mask = 4 * imin(t_dim->h, 8) - 1;
+    unsigned cul_level = 0;
+    for (int i = eob, is_last = 1; i >= 0; i--, is_last = 0) {
+        const int rc = scan[i], x = rc >> shift, y = rc & mask;
+
+        // lo tok
+        const int ctx = get_coef_nz_ctx(levels, i, rc, is_last, tx, tx_class);
+        uint16_t *const lo_cdf = is_last ?
+            ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] :
+            ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
+        int tok = msac_decode_symbol_adapt(&ts->msac, lo_cdf,
+                                           4 - is_last) + is_last;
+        if (dbg)
+        printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
+               t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
+        if (!tok) continue;
+
+        // hi tok
+        if (tok == 3) {
+            const int br_ctx = get_br_ctx(levels, rc, tx, tx_class);
+            do {
+                const int tok_br =
+                    msac_decode_symbol_adapt(&ts->msac, br_cdf[br_ctx], 4);
+                if (dbg)
+                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
+                       imin(t_dim->ctx, 3), chroma, br_ctx,
+                       i, rc, tok_br, tok, ts->msac.rng);
+                tok += tok_br;
+                if (tok_br < 3) break;
+            } while (tok < 15);
+        }
+
+        levels[x * stride + y] = cf[rc] = tok;
+    }
+
+    // residual and sign
+    int dc_sign = 1;
+    const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
+    const uint8_t *const qm_tbl = f->qm[is_1d || *txtp == IDTX][tx][plane];
+    const int dq_shift = imax(0, t_dim->ctx - 2);
+    for (int i = 0; i <= eob; i++) {
+        const int rc = scan[i];
+        int tok = cf[rc];
+        if (!tok) continue;
+        int dq;
+
+        // sign
+        int sign;
+        if (i == 0) {
+            const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l);
+            uint16_t *const dc_sign_cdf =
+                ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
+            sign = msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
+            if (dbg)
+            printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
+                   chroma, dc_sign_ctx, sign, ts->msac.rng);
+            dc_sign = sign ? 0 : 2;
+            dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5;
+        } else {
+            sign = msac_decode_bool(&ts->msac, 128 << 7);
+            if (dbg)
+            printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng);
+            dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5;
+        }
+
+        // residual
+        if (tok == 15) {
+            tok += read_golomb(&ts->msac);
+            if (dbg)
+            printf("Post-residual[%d=%d=%d->%d]: r=%d\n",
+                   i, rc, tok - 15, tok, ts->msac.rng);
+        }
+
+        // dequant
+        cul_level += tok;
+        tok *= dq;
+        tok >>= dq_shift;
+        cf[rc] = sign ? -tok : tok;
+    }
+
+    // context
+    *res_ctx = imin(cul_level, 63) | (dc_sign << 6);
+
+    return eob;
+}
+
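+// Recursively walk the transform split tree of an inter block. Inner nodes
+// recurse into their sub-transforms; leaf nodes decode the coefficients (or
+// reload them in the frame-thread reconstruction pass) and, when a
+// destination is given, add the inverse transform to dst.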
+static void read_coef_tree(Dav1dTileContext *const t,
+                           const enum BlockSize bs, const Av1Block *const b,
+                           const enum RectTxfmSize ytx, const int depth,
+                           const uint16_t *const tx_split,
+                           const int x_off, const int y_off, pixel *dst)
+{
+    const Dav1dFrameContext *const f = t->f;
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
+    const int txw = t_dim->w, txh = t_dim->h;
+
+    if (depth < 2 && tx_split[depth] & (1 << (y_off * 4 + x_off))) {
+        const enum RectTxfmSize sub = t_dim->sub;
+        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
+        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
+
+        read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
+                       x_off * 2 + 0, y_off * 2 + 0, dst);
+        t->bx += txsw;
+        if (txw >= txh && t->bx < f->bw)
+            read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
+                           y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
+        t->bx -= txsw;
+        t->by += txsh;
+        if (txh >= txw && t->by < f->bh) {
+            if (dst)
+                dst += 4 * txsh * PXSTRIDE(f->cur.p.stride[0]);
+            read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
+                           x_off * 2 + 0, y_off * 2 + 1, dst);
+            t->bx += txsw;
+            if (txw >= txh && t->bx < f->bw)
+                read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
+                               y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
+            t->bx -= txsw;
+        }
+        t->by -= txsh;
+    } else {
+        const int bx4 = t->bx & 31, by4 = t->by & 31;
+        enum TxfmType txtp;
+        uint8_t cf_ctx;
+        int eob;
+        coef *cf;
+        struct CodedBlockInfo *cbi;
+
+        if (f->frame_thread.pass) {
+            cf = ts->frame_thread.cf;
+            ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+            cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+        } else {
+            cf = t->cf;
+        }
+        if (f->frame_thread.pass != 2) {
+            eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
+                               ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+                       ytx, txtp, eob, ts->msac.rng);
+            memset(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
+            memset(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
+            for (int y = 0; y < txh; y++)
+                memset(&t->txtp_map[(by4 + y) * 32 + bx4], txtp, txw);
+            if (f->frame_thread.pass == 1) {
+                cbi->eob[0] = eob;
+                cbi->txtp[0] = txtp;
+            }
+        } else {
+            eob = cbi->eob[0];
+            txtp = cbi->txtp[0];
+        }
+        if (!(f->frame_thread.pass & 1)) {
+            assert(dst);
+            if (eob >= 0) {
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                    coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
+                dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.p.stride[0], cf, eob);
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                    hex_dump(dst, f->cur.p.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
+            }
+        }
+    }
+}
+
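+// Frame-threading pass 1: decode and store all luma and chroma coefficients
+// of one block so that the reconstruction pass can later apply the inverse
+// transforms without touching the entropy decoder.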
+void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
+                                    const enum BlockSize bs, const Av1Block *const b)
+{
+    const Dav1dFrameContext *const f = t->f;
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int bx4 = t->bx & 31, by4 = t->by & 31;
+    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = b_dim[0], bh4 = b_dim[1];
+    const int cbw4 = (bw4 + 1) >> ss_hor, cbh4 = (bh4 + 1) >> ss_ver;
+    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+                           (bw4 > ss_hor || t->bx & 1) &&
+                           (bh4 > ss_ver || t->by & 1);
+
+    if (b->skip) {
+        memset(&t->a->lcoef[bx4], 0x40, bw4);
+        memset(&t->l.lcoef[by4], 0x40, bh4);
+        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+            memset(&t->a->ccoef[pl][cbx4], 0x40, cbw4);
+            memset(&t->l.ccoef[pl][cby4], 0x40, cbh4);
+        }
+        return;
+    }
+
+    Dav1dTileState *const ts = t->ts;
+    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+    assert(f->frame_thread.pass == 1);
+    assert(!b->skip);
+    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
+
+    for (int init_y = 0; init_y < h4; init_y += 16) {
+        for (int init_x = 0; init_x < w4; init_x += 16) {
+            const int sub_h4 = imin(h4, 16 + init_y);
+            const int sub_w4 = imin(w4, init_x + 16);
+            int y_off = !!init_y, y, x;
+            for (y = init_y, t->by += init_y; y < sub_h4;
+                 y += t_dim->h, t->by += t_dim->h, y_off++)
+            {
+                struct CodedBlockInfo *const cbi =
+                    &f->frame_thread.cbi[t->by * f->b4_stride];
+                int x_off = !!init_x;
+                for (x = init_x, t->bx += init_x; x < sub_w4;
+                     x += t_dim->w, t->bx += t_dim->w, x_off++)
+                {
+                    if (!b->intra) {
+                        read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
+                                       x_off, y_off, NULL);
+                    } else {
+                        uint8_t cf_ctx = 0x40;
+                        enum TxfmType txtp;
+                        const int eob = cbi[t->bx].eob[0] =
+                            decode_coefs(t, &t->a->lcoef[bx4 + x],
+                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
+                                         0, ts->frame_thread.cf, &txtp, &cf_ctx);
+                        if (DEBUG_BLOCK_INFO)
+                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+                                   b->tx, txtp, eob, ts->msac.rng);
+                        cbi[t->bx].txtp[0] = txtp;
+                        ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+                        memset(&t->a->lcoef[bx4 + x], cf_ctx,
+                               imin(t_dim->w, f->bw - t->bx));
+                        memset(&t->l.lcoef[by4 + y], cf_ctx,
+                               imin(t_dim->h, f->bh - t->by));
+                    }
+                }
+                t->bx -= x;
+            }
+            t->by -= y;
+
+            if (!has_chroma) continue;
+
+            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
+            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
+            for (int pl = 0; pl < 2; pl++) {
+                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
+                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
+                {
+                    struct CodedBlockInfo *const cbi =
+                        &f->frame_thread.cbi[t->by * f->b4_stride];
+                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
+                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
+                    {
+                        uint8_t cf_ctx = 0x40;
+                        enum TxfmType txtp;
+                        if (!b->intra)
+                            txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
+                                                bx4 + (x << ss_hor)];
+                        const int eob = cbi[t->bx].eob[1 + pl] =
+                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
+                                         b, b->intra, 1 + pl, ts->frame_thread.cf,
+                                         &txtp, &cf_ctx);
+                        if (DEBUG_BLOCK_INFO)
+                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+                                   "txtp=%d,eob=%d]: r=%d\n",
+                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
+                        cbi[t->bx].txtp[1 + pl] = txtp;
+                        ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
+                        memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
+                               imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
+                        memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
+                               imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
+                    }
+                    t->bx -= x << ss_hor;
+                }
+                t->by -= y << ss_ver;
+            }
+        }
+    }
+}
+
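+// Edge emulation: copy the visible part of a reference block whose source
+// area extends past the picture borders and replicate the closest edge
+// pixels into the out-of-picture regions.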
+static void emu_edge(pixel *dst, const ptrdiff_t dst_stride,
+                     const pixel *ref, const ptrdiff_t ref_stride,
+                     const int bw, const int bh,
+                     const int iw, const int ih,
+                     const int x, const int y)
+{
+    // find offset in reference of visible block to copy
+    ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + iclip(x, 0, iw - 1);
+
+    // number of pixels to extend (left, right, top, bottom)
+    const int left_ext = iclip(-x, 0, bw - 1);
+    const int right_ext = iclip(x + bw - iw, 0, bw - 1);
+    assert(left_ext + right_ext < bw);
+    const int top_ext = iclip(-y, 0, bh - 1);
+    const int bottom_ext = iclip(y + bh - ih, 0, bh - 1);
+    assert(top_ext + bottom_ext < bh);
+
+    // copy visible portion first
+    pixel *blk = dst + top_ext * PXSTRIDE(dst_stride);
+    const int center_w = bw - left_ext - right_ext;
+    const int center_h = bh - top_ext - bottom_ext;
+    for (int y = 0; y < center_h; y++) {
+        pixel_copy(blk + left_ext, ref, center_w);
+        // extend left edge for this line
+        if (left_ext)
+            pixel_set(blk, blk[left_ext], left_ext);
+        // extend right edge for this line
+        if (right_ext)
+            pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1],
+                      right_ext);
+        ref += PXSTRIDE(ref_stride);
+        blk += PXSTRIDE(dst_stride);
+    }
+
+    // copy top
+    blk = dst + top_ext * PXSTRIDE(dst_stride);
+    for (int y = 0; y < top_ext; y++) {
+        pixel_copy(dst, blk, bw);
+        dst += PXSTRIDE(dst_stride);
+    }
+
+    // copy bottom
+    dst += center_h * PXSTRIDE(dst_stride);
+    for (int y = 0; y < bottom_ext; y++) {
+        pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw);
+        dst += PXSTRIDE(dst_stride);
+    }
+}
+
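+// Single-reference motion compensation for one block and plane. Waits for
+// the required reference rows (except for intra block copy), falls back to
+// emu_edge() when the filter taps reach outside the picture, then runs
+// either the mc (pixel output) or mct (intermediate 16-bit output) kernel.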
+static void mc(Dav1dTileContext *const t,
+               pixel *const dst8, coef *const dst16, const ptrdiff_t dst_stride,
+               const int bw4, const int bh4,
+               const int bx, const int by, const int pl,
+               const mv mv, const Dav1dThreadPicture *const refp,
+               const enum Filter2d filter_2d)
+{
+    assert((dst8 != NULL) ^ (dst16 != NULL));
+    const Dav1dFrameContext *const f = t->f;
+    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+    const int mvx = mv.x, mvy = mv.y;
+    const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
+    const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
+    const int dy = by * v_mul + (mvy >> (3 + ss_ver));
+    ptrdiff_t ref_stride = refp->p.stride[!!pl];
+    const pixel *ref;
+    int w, h;
+
+    if (refp != &f->cur) { // i.e. not for intrabc
+        dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4,
+                                  PLANE_TYPE_Y + !!pl);
+        w = (f->cur.p.p.w + ss_hor) >> ss_hor;
+        h = (f->cur.p.p.h + ss_ver) >> ss_ver;
+    } else {
+        w = f->bw * 4 >> ss_hor;
+        h = f->bh * 4 >> ss_ver;
+    }
+    if (dx < !!mx * 3 || dy < !!my * 3 ||
+        dx + bw4 * h_mul + !!mx * 4 > w ||
+        dy + bh4 * v_mul + !!my * 4 > h)
+    {
+        emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl], ref_stride,
+                 bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7, w, h,
+                 dx - !!mx * 3, dy - !!my * 3);
+        ref = &t->emu_edge[160 * !!my * 3 + !!mx * 3];
+        ref_stride = 160 * sizeof(pixel);
+    } else {
+        ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
+    }
+
+    if (dst8 != NULL) {
+        f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
+                                 bh4 * v_mul, mx << !ss_hor, my << !ss_ver);
+    } else {
+        f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
+                                  bh4 * v_mul, mx << !ss_hor, my << !ss_ver);
+    }
+}
+
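+// Overlapped block motion compensation: blend predictions made from the
+// motion vectors of the above and left neighbours into the top and left
+// edges of the current block, using the fixed obmc_mask_* tables.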
+static void obmc(Dav1dTileContext *const t,
+                 pixel *const dst, const ptrdiff_t dst_stride,
+                 const uint8_t *const b_dim, const int pl,
+                 const int bx4, const int by4, const int w4, const int h4)
+{
+    assert(!(t->bx & 1) && !(t->by & 1));
+    const Dav1dFrameContext *const f = t->f;
+    const refmvs *const r = &f->mvs[t->by * f->b4_stride + t->bx];
+    pixel *const lap = t->scratch.lap;
+    static const uint8_t obmc_mask_2[2] = { 19,  0 };
+    static const uint8_t obmc_mask_4[4] = { 25, 14,  5,  0 };
+    static const uint8_t obmc_mask_8[8] = { 28, 22, 16, 11,  7,  3,  0,  0 };
+    static const uint8_t obmc_mask_16[16] = { 30, 27, 24, 21, 18, 15, 12, 10,
+                                               8,  6,  4,  3,  0,  0,  0,  0 };
+    static const uint8_t obmc_mask_32[32] = { 31, 29, 28, 26, 24, 23, 21, 20,
+                                              19, 17, 16, 14, 13, 12, 11,  9,
+                                               8,  7,  6,  5,  4,  4,  3,  2,
+                                               0,  0,  0,  0,  0,  0,  0,  0 };
+    static const uint8_t *const obmc_masks[] = {
+        obmc_mask_2, obmc_mask_4, obmc_mask_8, obmc_mask_16, obmc_mask_32
+    };
+    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+
+    if (t->by > t->ts->tiling.row_start &&
+        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
+    {
+        for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
+            // only odd blocks are considered for overlap handling, hence +1
+            const refmvs *const a_r = &r[x - f->b4_stride + 1];
+            const uint8_t *const a_b_dim =
+                dav1d_block_dimensions[sbtype_to_bs[a_r->sb_type]];
+
+            if (a_r->ref[0] > 0) {
+                mc(t, lap, NULL, 128 * sizeof(pixel),
+                   iclip(a_b_dim[0], 2, b_dim[0]), imin(b_dim[1], 16) >> 1,
+                   t->bx + x, t->by, pl, a_r->mv[0],
+                   &f->refp[a_r->ref[0] - 1],
+                   dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
+                f->dsp->mc.blend(&dst[x * h_mul], dst_stride,
+                                 lap, 128 * sizeof(pixel),
+                                 h_mul * iclip(a_b_dim[0], 2, b_dim[0]),
+                                 v_mul * imin(b_dim[1], 16) >> 1,
+                                 obmc_masks[imin(b_dim[3], 4) - ss_ver], 1);
+                i++;
+            }
+            x += imax(a_b_dim[0], 2);
+        }
+    }
+
+    if (t->bx > t->ts->tiling.col_start)
+        for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
+            // only odd blocks are considered for overlap handling, hence +1
+            const refmvs *const l_r = &r[(y + 1) * f->b4_stride - 1];
+            const uint8_t *const l_b_dim =
+                dav1d_block_dimensions[sbtype_to_bs[l_r->sb_type]];
+
+            if (l_r->ref[0] > 0) {
+                mc(t, lap, NULL, 32 * sizeof(pixel),
+                   imin(b_dim[0], 16) >> 1,
+                   iclip(l_b_dim[1], 2, b_dim[1]),
+                   t->bx, t->by + y, pl, l_r->mv[0],
+                   &f->refp[l_r->ref[0] - 1],
+                   dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
+                f->dsp->mc.blend(&dst[y * v_mul * PXSTRIDE(dst_stride)], dst_stride,
+                                 lap, 32 * sizeof(pixel),
+                                 h_mul * imin(b_dim[0], 16) >> 1,
+                                 v_mul * iclip(l_b_dim[1], 2, b_dim[1]),
+                                 obmc_masks[imin(b_dim[2], 4) - ss_hor], 0);
+                i++;
+            }
+            y += imax(l_b_dim[1], 2);
+        }
+}
+
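+// Warped (affine) motion compensation, processed in 8x8 output blocks: the
+// affine matrix is evaluated at the centre of each block, and the
+// warp8x8/warp8x8t DSP kernels handle the per-pixel stepping via wmp->abcd.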
+static void warp_affine(Dav1dTileContext *const t,
+                        pixel *dst8, coef *dst16, const ptrdiff_t dstride,
+                        const uint8_t *const b_dim, const int pl,
+                        const Dav1dThreadPicture *const refp,
+                        const WarpedMotionParams *const wmp)
+{
+    assert((dst8 != NULL) ^ (dst16 != NULL));
+    const Dav1dFrameContext *const f = t->f;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
+    const int32_t *const mat = wmp->matrix;
+    const int width = (f->cur.p.p.w + ss_hor) >> ss_hor;
+    const int height = (f->cur.p.p.h + ss_ver) >> ss_ver;
+
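+    // the warp DSP kernels produce 8x8 output blocks, so split the block
+    // into 8x8 pieces, each with its own source position and fractional
+    // offsets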
+    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
+        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
+            // calculate transformation relative to center of 8x8 block in
+            // luma pixel units
+            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
+            const int src_y = t->by * 4 + ((y + 4) << ss_ver);
+            const int mvx = (mat[2] * src_x + mat[3] * src_y + mat[0]) >> ss_hor;
+            const int mvy = (mat[4] * src_x + mat[5] * src_y + mat[1]) >> ss_ver;
+
+            const int dx = (mvx >> 16) - 4;
+            const int mx = ((mvx & 0xffff) - wmp->alpha * 4 -
+                                             wmp->beta  * 7) & ~0x3f;
+            const int dy = (mvy >> 16) - 4;
+            const int my = ((mvy & 0xffff) - wmp->gamma * 4 -
+                                             wmp->delta * 4) & ~0x3f;
+
+            const pixel *ref_ptr;
+            ptrdiff_t ref_stride = refp->p.stride[!!pl];
+
+            dav1d_thread_picture_wait(refp, dy + 4 + 8,
+                                      PLANE_TYPE_Y + !!pl);
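+            // the 8-tap warp filter reads a 15x15 source area (3 pixels
+            // above/left and 4 below/right of the 8x8 block); fall back to
+            // an edge-extended copy when that area crosses the frame borders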
+            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
+                emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl],
+                         ref_stride, 15, 15, width, height, dx - 3, dy - 3);
+                ref_ptr = &t->emu_edge[160 * 3 + 3];
+                ref_stride = 160 * sizeof(pixel);
+            } else {
+                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
+            }
+            if (dst16 != NULL)
+                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
+                                 wmp->abcd, mx, my);
+            else
+                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
+                                wmp->abcd, mx, my);
+        }
+        if (dst8) dst8  += 8 * PXSTRIDE(dstride);
+        else      dst16 += 8 * dstride;
+    }
+}
+
+void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize bs,
+                                 const enum EdgeFlags intra_edge_flags,
+                                 const Av1Block *const b)
+{
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dFrameContext *const f = t->f;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const int bx4 = t->bx & 31, by4 = t->by & 31;
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = b_dim[0], bh4 = b_dim[1];
+    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+                           (bw4 > ss_hor || t->bx & 1) &&
+                           (bh4 > ss_ver || t->by & 1);
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
+    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
+
+    // coefficient coding
+    ALIGN_STK_32(pixel, edge_buf, 257,);
+    pixel *const edge = edge_buf + 128;
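+    // prepare_intra_edges() fills edge[] around this pointer: left
+    // neighbours at negative offsets, the top-left sample at edge[0] and
+    // top neighbours at positive offsets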
+    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
+
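+    // reconstruct in chunks of at most 64x64 pixels (16 4x4 units); each
+    // chunk is fully predicted and reconstructed before the next one starts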
+    for (int init_y = 0; init_y < h4; init_y += 16) {
+        for (int init_x = 0; init_x < w4; init_x += 16) {
+            if (b->pal_sz[0]) {
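+                // palette mode: expand the per-pixel palette indices (one
+                // byte each) through the block's colour palette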
+                pixel *dst = ((pixel *) f->cur.p.data[0]) +
+                             4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
+                const uint8_t *pal_idx;
+                if (f->frame_thread.pass) {
+                    pal_idx = ts->frame_thread.pal_idx;
+                    ts->frame_thread.pal_idx += bw4 * bh4 * 16;
+                } else {
+                    pal_idx = t->scratch.pal_idx;
+                }
+                const uint16_t *const pal = f->frame_thread.pass ?
+                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+                                        ((t->bx >> 1) + (t->by & 1))][0] : t->pal[0];
+                f->dsp->ipred.pal_pred(dst, f->cur.p.stride[0], pal,
+                                       pal_idx, bw4 * 4, bh4 * 4);
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                    hex_dump(dst, PXSTRIDE(f->cur.p.stride[0]),
+                             bw4 * 4, bh4 * 4, "y-pal-pred");
+            }
+
+            const int sm_fl = sm_flag(t->a, bx4) | sm_flag(&t->l, by4);
+            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
+                              intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
+            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
+                              intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
+            int y, x;
+            const int sub_h4 = imin(h4, 16 + init_y);
+            const int sub_w4 = imin(w4, init_x + 16);
+            for (y = init_y, t->by += init_y; y < sub_h4;
+                 y += t_dim->h, t->by += t_dim->h)
+            {
+                pixel *dst = ((pixel *) f->cur.p.data[0]) +
+                               4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) +
+                                    t->bx + init_x);
+                for (x = init_x, t->bx += init_x; x < sub_w4;
+                     x += t_dim->w, t->bx += t_dim->w)
+                {
+                    if (b->pal_sz[0]) goto skip_y_pred;
+
+                    int angle = b->y_angle;
+                    const enum EdgeFlags edge_flags =
+                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
+                             0 : EDGE_I444_TOP_HAS_RIGHT) |
+                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
+                             0 : EDGE_I444_LEFT_HAS_BOTTOM);
+                    const pixel *top_sb_edge = NULL;
+                    if (!(t->by & (f->sb_step - 1))) {
+                        top_sb_edge = f->ipred_edge[0];
+                        const int sby = t->by >> f->sb_shift;
+                        top_sb_edge += f->sb128w * 128 * (sby - 1);
+                    }
+                    const enum IntraPredMode m =
+                        bytefn(dav1d_prepare_intra_edges)(t->bx,
+                                                          t->bx > ts->tiling.col_start,
+                                                          t->by,
+                                                          t->by > ts->tiling.row_start,
+                                                          ts->tiling.col_end,
+                                                          ts->tiling.row_end,
+                                                          edge_flags, dst,
+                                                          f->cur.p.stride[0], top_sb_edge,
+                                                          b->y_mode, &angle,
+                                                          t_dim->w, t_dim->h, edge);
+                    dsp->ipred.intra_pred[m](dst, f->cur.p.stride[0], edge,
+                                             t_dim->w * 4, t_dim->h * 4,
+                                             angle | sm_fl);
+
+                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
+                                 t_dim->h * 4, 2, "l");
+                        hex_dump(edge, 0, 1, 1, "tl");
+                        hex_dump(edge + 1, t_dim->w * 4,
+                                 t_dim->w * 4, 2, "t");
+                        hex_dump(dst, f->cur.p.stride[0],
+                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
+                    }
+
+                skip_y_pred: {}
+                    if (!b->skip) {
+                        coef *cf;
+                        int eob;
+                        enum TxfmType txtp;
+                        if (f->frame_thread.pass) {
+                            cf = ts->frame_thread.cf;
+                            ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+                            const struct CodedBlockInfo *const cbi =
+                                &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+                            eob = cbi->eob[0];
+                            txtp = cbi->txtp[0];
+                        } else {
+                            uint8_t cf_ctx;
+                            cf = t->cf;
+                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
+                                               &t->l.lcoef[by4 + y], b->tx, bs,
+                                               b, 1, 0, cf, &txtp, &cf_ctx);
+                            if (DEBUG_BLOCK_INFO)
+                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+                                       b->tx, txtp, eob, ts->msac.rng);
+                            memset(&t->a->lcoef[bx4 + x], cf_ctx,
+                                   imin(t_dim->w, f->bw - t->bx));
+                            memset(&t->l.lcoef[by4 + y], cf_ctx,
+                                   imin(t_dim->h, f->bh - t->by));
+                        }
+                        if (eob >= 0) {
+                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                coef_dump(cf, imin(t_dim->h, 8) * 4,
+                                          imin(t_dim->w, 8) * 4, 3, "dq");
+                            dsp->itx.itxfm_add[b->tx]
+                                              [txtp](dst,
+                                                     f->cur.p.stride[0],
+                                                     cf, eob);
+                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                hex_dump(dst, f->cur.p.stride[0],
+                                         t_dim->w * 4, t_dim->h * 4, "recon");
+                        }
+                    } else if (!f->frame_thread.pass) {
+                        memset(&t->a->lcoef[bx4 + x], 0x40, t_dim->w);
+                        memset(&t->l.lcoef[by4 + y], 0x40, t_dim->h);
+                    }
+                    dst += 4 * t_dim->w;
+                }
+                t->bx -= x;
+            }
+            t->by -= y;
+
+            if (!has_chroma) continue;
+
+            const ptrdiff_t stride = f->cur.p.stride[1];
+
+            if (b->uv_mode == CFL_PRED) {
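+                // chroma-from-luma: the prediction is a DC prediction plus
+                // the signalled per-plane alpha times the "AC" of the
+                // downsampled co-located luma (luma minus its average)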
+                assert(!init_x && !init_y);
+
+                int16_t *const ac = t->scratch.ac;
+                pixel *y_src = ((pixel *) f->cur.p.data[0]) + 4 * (t->bx & ~ss_hor) +
+                                 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.p.stride[0]);
+                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
+                                              (t->by >> ss_ver) * PXSTRIDE(stride));
+                pixel *const uv_dst[2] = { ((pixel *) f->cur.p.data[1]) + uv_off,
+                                           ((pixel *) f->cur.p.data[2]) + uv_off };
+
+                const int furthest_r =
+                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
+                const int furthest_b =
+                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
+                dsp->ipred.cfl_ac[f->cur.p.p.layout - 1]
+                                 [b->uvtx](ac, y_src, f->cur.p.stride[0],
+                                           cbw4 - (furthest_r >> ss_hor),
+                                           cbh4 - (furthest_b >> ss_ver));
+                for (int pl = 0; pl < 2; pl++) {
+                    if (!b->cfl_alpha[pl]) continue;
+                    int angle = 0;
+                    const pixel *top_sb_edge = NULL;
+                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
+                        top_sb_edge = f->ipred_edge[pl + 1];
+                        const int sby = t->by >> f->sb_shift;
+                        top_sb_edge += f->sb128w * 128 * (sby - 1);
+                    }
+                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
+                    const int xstart = ts->tiling.col_start >> ss_hor;
+                    const int ystart = ts->tiling.row_start >> ss_ver;
+                    const enum IntraPredMode m =
+                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
+                                                          ypos, ypos > ystart,
+                                                          ts->tiling.col_end >> ss_hor,
+                                                          ts->tiling.row_end >> ss_ver,
+                                                          0, uv_dst[pl], stride,
+                                                          top_sb_edge, DC_PRED, &angle,
+                                                          uv_t_dim->w,
+                                                          uv_t_dim->h, edge);
+                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
+                                           uv_t_dim->w * 4,
+                                           uv_t_dim->h * 4,
+                                           ac, b->cfl_alpha[pl]);
+                }
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
+                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
+                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
+                }
+            } else if (b->pal_sz[1]) {
+                ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
+                                           (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
+                const uint8_t *pal_idx;
+                if (f->frame_thread.pass) {
+                    pal_idx = ts->frame_thread.pal_idx;
+                    ts->frame_thread.pal_idx += cbw4 * cbh4 * 16;
+                } else {
+                    pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
+                }
+                const uint16_t *const pal_u = f->frame_thread.pass ?
+                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+                                        ((t->bx >> 1) + (t->by & 1))][1] : t->pal[1];
+                f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[1]) + uv_dstoff,
+                                       f->cur.p.stride[1], pal_u,
+                                       pal_idx, cbw4 * 4, cbh4 * 4);
+                const uint16_t *const pal_v = f->frame_thread.pass ?
+                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+                                        ((t->bx >> 1) + (t->by & 1))][2] : t->pal[2];
+                f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[2]) + uv_dstoff,
+                                       f->cur.p.stride[1], pal_v,
+                                       pal_idx, cbw4 * 4, cbh4 * 4);
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+                    hex_dump(((pixel *) f->cur.p.data[1]) + uv_dstoff,
+                             PXSTRIDE(f->cur.p.stride[1]),
+                             cbw4 * 4, cbh4 * 4, "u-pal-pred");
+                    hex_dump(((pixel *) f->cur.p.data[2]) + uv_dstoff,
+                             PXSTRIDE(f->cur.p.stride[1]),
+                             cbw4 * 4, cbh4 * 4, "v-pal-pred");
+                }
+            }
+
+            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
+                                 sm_uv_flag(&t->l, cby4);
+            const int uv_sb_has_tr =
+                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
+                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.p.layout - 1));
+            const int uv_sb_has_bl =
+                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
+                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.p.layout - 1));
+            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
+            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
+            for (int pl = 0; pl < 2; pl++) {
+                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
+                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
+                {
+                    pixel *dst = ((pixel *) f->cur.p.data[1 + pl]) +
+                                   4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
+                                        ((t->bx + init_x) >> ss_hor));
+                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
+                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
+                    {
+                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
+                            b->pal_sz[1])
+                        {
+                            goto skip_uv_pred;
+                        }
+
+                        int angle = b->uv_angle;
+                        // this probably looks weird because we're using
+                        // luma flags in a chroma loop, but that's because
+                        // prepare_intra_edges() expects luma flags as input
+                        const enum EdgeFlags edge_flags =
+                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
+                              (x + uv_t_dim->w >= sub_cw4)) ?
+                                 0 : EDGE_I444_TOP_HAS_RIGHT) |
+                            ((x > (init_x >> ss_hor) ||
+                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
+                                 0 : EDGE_I444_LEFT_HAS_BOTTOM);
+                        const pixel *top_sb_edge = NULL;
+                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
+                            top_sb_edge = f->ipred_edge[1 + pl];
+                            const int sby = t->by >> f->sb_shift;
+                            top_sb_edge += f->sb128w * 128 * (sby - 1);
+                        }
+                        const enum IntraPredMode uv_mode =
+                             b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
+                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
+                        const int xstart = ts->tiling.col_start >> ss_hor;
+                        const int ystart = ts->tiling.row_start >> ss_ver;
+                        const enum IntraPredMode m =
+                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
+                                                              ypos, ypos > ystart,
+                                                              ts->tiling.col_end >> ss_hor,
+                                                              ts->tiling.row_end >> ss_ver,
+                                                              edge_flags, dst, stride,
+                                                              top_sb_edge, uv_mode,
+                                                              &angle, uv_t_dim->w,
+                                                              uv_t_dim->h, edge);
+                        dsp->ipred.intra_pred[m](dst, stride, edge,
+                                                 uv_t_dim->w * 4,
+                                                 uv_t_dim->h * 4,
+                                                 angle | sm_uv_fl);
+                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
+                                     uv_t_dim->h * 4, 2, "l");
+                            hex_dump(edge, 0, 1, 1, "tl");
+                            hex_dump(edge + 1, uv_t_dim->w * 4,
+                                     uv_t_dim->w * 4, 2, "t");
+                            hex_dump(dst, stride, uv_t_dim->w * 4,
+                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
+                        }
+
+                    skip_uv_pred: {}
+                        if (!b->skip) {
+                            enum TxfmType txtp;
+                            int eob;
+                            coef *cf;
+                            if (f->frame_thread.pass) {
+                                cf = ts->frame_thread.cf;
+                                ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
+                                const struct CodedBlockInfo *const cbi =
+                                    &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+                                eob = cbi->eob[pl + 1];
+                                txtp = cbi->txtp[pl + 1];
+                            } else {
+                                uint8_t cf_ctx;
+                                cf = t->cf;
+                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+                                                   &t->l.ccoef[pl][cby4 + y],
+                                                   b->uvtx, bs, b, 1, 1 + pl, cf,
+                                                   &txtp, &cf_ctx);
+                                if (DEBUG_BLOCK_INFO)
+                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
+                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
+                                memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
+                                       imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
+                                memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
+                                       imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
+                            }
+                            if (eob >= 0) {
+                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                    coef_dump(cf, uv_t_dim->h * 4,
+                                              uv_t_dim->w * 4, 3, "dq");
+                                dsp->itx.itxfm_add[b->uvtx]
+                                                  [txtp](dst, stride,
+                                                         cf, eob);
+                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                    hex_dump(dst, stride, uv_t_dim->w * 4,
+                                             uv_t_dim->h * 4, "recon");
+                            }
+                        } else if (!f->frame_thread.pass) {
+                            memset(&t->a->ccoef[pl][cbx4 + x], 0x40, uv_t_dim->w);
+                            memset(&t->l.ccoef[pl][cby4 + y], 0x40, uv_t_dim->h);
+                        }
+                        dst += uv_t_dim->w * 4;
+                    }
+                    t->bx -= x << ss_hor;
+                }
+                t->by -= y << ss_ver;
+            }
+        }
+    }
+}
+
+void bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs,
+                                 const Av1Block *const b)
+{
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dFrameContext *const f = t->f;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const int bx4 = t->bx & 31, by4 = t->by & 31;
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = b_dim[0], bh4 = b_dim[1];
+    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+                           (bw4 > ss_hor || t->bx & 1) &&
+                           (bh4 > ss_ver || t->by & 1);
+    const int chr_layout_idx = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
+                               DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.p.layout;
+
+    // prediction
+    const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
+    pixel *dst = ((pixel *) f->cur.p.data[0]) +
+        4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
+    const ptrdiff_t uvdstoff =
+        4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
+    if (!(f->frame_hdr.frame_type & 1)) {
+        // intra block copy: key/intra-only frame, so the block is predicted
+        // from already-reconstructed pixels of the current frame
+        mc(t, dst, NULL, f->cur.p.stride[0],
+           bw4, bh4, t->bx, t->by, 0, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
+        if (has_chroma) for (int pl = 1; pl < 3; pl++)
+            mc(t, ((pixel *) f->cur.p.data[pl]) + uvdstoff, NULL, f->cur.p.stride[1],
+               bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
+               t->bx & ~ss_hor, t->by & ~ss_ver,
+               pl, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
+    } else if (b->comp_type == COMP_INTER_NONE) {
+        const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
+        const enum Filter2d filter_2d = b->filter2d;
+
+        if (imin(bw4, bh4) > 1 && !f->frame_hdr.force_integer_mv &&
+            ((b->inter_mode == GLOBALMV &&
+              f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
+             (b->motion_mode == MM_WARP &&
+              t->warpmv.type > WM_TYPE_TRANSLATION)))
+        {
+            warp_affine(t, dst, NULL, f->cur.p.stride[0], b_dim, 0, refp,
+                        b->motion_mode == MM_WARP ? &t->warpmv :
+                            &f->frame_hdr.gmv[b->ref[0]]);
+        } else {
+            mc(t, dst, NULL, f->cur.p.stride[0],
+               bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, filter_2d);
+            if (b->motion_mode == MM_OBMC)
+                obmc(t, dst, f->cur.p.stride[0], b_dim, 0, bx4, by4, w4, h4);
+        }
+        if (b->interintra_type) {
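+            // inter-intra: build an intra prediction of the same block and
+            // blend it into the inter prediction with a smooth or wedge mask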
+            ALIGN_STK_32(pixel, tl_edge_buf, 65,);
+            pixel *const tl_edge = tl_edge_buf + 32;
+            enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
+                                   SMOOTH_PRED : b->interintra_mode;
+            pixel *const tmp = t->scratch.interintra;
+            int angle = 0;
+            const pixel *top_sb_edge = NULL;
+            if (!(t->by & (f->sb_step - 1))) {
+                top_sb_edge = f->ipred_edge[0];
+                const int sby = t->by >> f->sb_shift;
+                top_sb_edge += f->sb128w * 128 * (sby - 1);
+            }
+            m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
+                                                  t->by, t->by > ts->tiling.row_start,
+                                                  ts->tiling.col_end, ts->tiling.row_end,
+                                                  0, dst, f->cur.p.stride[0], top_sb_edge,
+                                                  m, &angle, bw4, bh4, tl_edge);
+            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
+                                     tl_edge, bw4 * 4, bh4 * 4, 0);
+            const uint8_t *const ii_mask =
+                b->interintra_type == INTER_INTRA_BLEND ?
+                     dav1d_ii_masks[bs][0][b->interintra_mode] :
+                     dav1d_wedge_masks[bs][0][0][b->wedge_idx];
+            dsp->mc.blend(dst, f->cur.p.stride[0], tmp, bw4 * 4 * sizeof(pixel),
+                          bw4 * 4, bh4 * 4, ii_mask, bw4 * 4);
+        }
+
+        if (!has_chroma) goto skip_inter_chroma_pred;
+
+        // sub8x8 derivation
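+        // with chroma subsampling, a 4-wide/4-tall luma block shares its
+        // chroma area with neighbouring luma blocks; if those neighbours are
+        // inter as well, their motion vectors predict parts of the chroma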
+        int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
+        refmvs *r;
+        if (is_sub8x8) {
+            assert(ss_hor == 1);
+            r = &f->mvs[t->by * f->b4_stride + t->bx];
+            if (bw4 == 1) is_sub8x8 &= r[-1].ref[0] > 0;
+            if (bh4 == ss_ver) is_sub8x8 &= r[-f->b4_stride].ref[0] > 0;
+            if (bw4 == 1 && bh4 == ss_ver)
+                is_sub8x8 &= r[-(1 + f->b4_stride)].ref[0] > 0;
+        }
+
+        // chroma prediction
+        if (is_sub8x8) {
+            assert(ss_hor == 1);
+            int h_off = 0, v_off = 0;
+            if (bw4 == 1 && bh4 == ss_ver) {
+                for (int pl = 0; pl < 2; pl++)
+                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
+                       NULL, f->cur.p.stride[1],
+                       bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
+                       r[-(f->b4_stride + 1)].mv[0],
+                       &f->refp[r[-(f->b4_stride + 1)].ref[0] - 1],
+                       f->frame_thread.pass != 2 ? t->tl_4x4_filter :
+                           f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
+                v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
+                h_off = 2;
+            }
+            if (bw4 == 1) {
+                const enum Filter2d left_filter_2d =
+                    dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
+                for (int pl = 0; pl < 2; pl++)
+                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + v_off, NULL,
+                       f->cur.p.stride[1], bw4, bh4, t->bx - 1,
+                       t->by, 1 + pl, r[-1].mv[0], &f->refp[r[-1].ref[0] - 1],
+                       f->frame_thread.pass != 2 ? left_filter_2d :
+                           f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
+                h_off = 2;
+            }
+            if (bh4 == ss_ver) {
+                const enum Filter2d top_filter_2d =
+                    dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
+                for (int pl = 0; pl < 2; pl++)
+                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off, NULL,
+                       f->cur.p.stride[1], bw4, bh4, t->bx, t->by - 1,
+                       1 + pl, r[-f->b4_stride].mv[0],
+                       &f->refp[r[-f->b4_stride].ref[0] - 1],
+                       f->frame_thread.pass != 2 ? top_filter_2d :
+                           f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
+                v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
+            }
+            for (int pl = 0; pl < 2; pl++)
+                mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.p.stride[1],
+                   bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0], refp, filter_2d);
+        } else {
+            if (imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
+                ((b->inter_mode == GLOBALMV &&
+                  f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
+                 (b->motion_mode == MM_WARP &&
+                  t->warpmv.type > WM_TYPE_TRANSLATION)))
+            {
+                for (int pl = 0; pl < 2; pl++)
+                    warp_affine(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, NULL,
+                                f->cur.p.stride[1], b_dim, 1 + pl, refp,
+                                b->motion_mode == MM_WARP ? &t->warpmv :
+                                    &f->frame_hdr.gmv[b->ref[0]]);
+            } else {
+                for (int pl = 0; pl < 2; pl++) {
+                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
+                       NULL, f->cur.p.stride[1],
+                       bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
+                       t->bx & ~ss_hor, t->by & ~ss_ver,
+                       1 + pl, b->mv[0], refp, filter_2d);
+                    if (b->motion_mode == MM_OBMC)
+                        obmc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
+                             f->cur.p.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
+                }
+            }
+            if (b->interintra_type) {
+                // FIXME for 8x32 with 4:2:2 subsampling, this probably does
+                // the wrong thing since it will select 4x16, not 4x32, as a
+                // transform size...
+                const uint8_t *const ii_mask =
+                    b->interintra_type == INTER_INTRA_BLEND ?
+                         dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] :
+                         dav1d_wedge_masks[bs][chr_layout_idx][0][b->wedge_idx];
+
+                for (int pl = 0; pl < 2; pl++) {
+                    pixel *const tmp = t->scratch.interintra;
+                    pixel tl_edge_px[65], *const tl_edge = &tl_edge_px[32];
+                    enum IntraPredMode m =
+                        b->interintra_mode == II_SMOOTH_PRED ?
+                        SMOOTH_PRED : b->interintra_mode;
+                    int angle = 0;
+                    pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
+                    const pixel *top_sb_edge = NULL;
+                    if (!(t->by & (f->sb_step - 1))) {
+                        top_sb_edge = f->ipred_edge[pl + 1];
+                        const int sby = t->by >> f->sb_shift;
+                        top_sb_edge += f->sb128w * 128 * (sby - 1);
+                    }
+                    m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
+                                                          (t->bx >> ss_hor) >
+                                                              (ts->tiling.col_start >> ss_hor),
+                                                          t->by >> ss_ver,
+                                                          (t->by >> ss_ver) >
+                                                              (ts->tiling.row_start >> ss_ver),
+                                                          ts->tiling.col_end >> ss_hor,
+                                                          ts->tiling.row_end >> ss_ver,
+                                                          0, uvdst, f->cur.p.stride[1],
+                                                          top_sb_edge, m,
+                                                          &angle, cbw4, cbh4, tl_edge);
+                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
+                                             tl_edge, cbw4 * 4, cbh4 * 4, 0);
+                    dsp->mc.blend(uvdst, f->cur.p.stride[1], tmp, cbw4 * 4 * sizeof(pixel),
+                                  cbw4 * 4, cbh4 * 4, ii_mask, cbw4 * 4);
+                }
+            }
+        }
+
+    skip_inter_chroma_pred: {}
+        t->tl_4x4_filter = filter_2d;
+    } else {
+        const enum Filter2d filter_2d = b->filter2d;
+        // Maximum super block size is 128x128
+        coef (*tmp)[128 * 128] = (coef (*)[128 * 128]) t->scratch.compinter;
+        int jnt_weight;
+        uint8_t *const seg_mask = t->scratch.seg_mask;
+        const uint8_t *mask;
+
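+        // compound prediction: predict both references into intermediate-
+        // precision buffers, then combine them below according to the
+        // compound type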
+        for (int i = 0; i < 2; i++) {
+            const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
+
+            if (b->inter_mode == GLOBALMV_GLOBALMV && !f->frame_hdr.force_integer_mv &&
+                f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
+            {
+                warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
+                            &f->frame_hdr.gmv[b->ref[i]]);
+            } else {
+                mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
+                   b->mv[i], refp, filter_2d);
+            }
+        }
+        switch (b->comp_type) {
+        case COMP_INTER_AVG:
+            dsp->mc.avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
+                        bw4 * 4, bh4 * 4);
+            break;
+        case COMP_INTER_WEIGHTED_AVG:
+            jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
+            dsp->mc.w_avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
+                          bw4 * 4, bh4 * 4, jnt_weight);
+            break;
+        case COMP_INTER_SEG:
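+            // difference-weighted compound: w_mask derives the blend mask
+            // from the two predictions, blends them, and stores the mask in
+            // seg_mask for reuse by the chroma blend below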
+            dsp->mc.w_mask[chr_layout_idx](dst, f->cur.p.stride[0],
+                                           tmp[b->mask_sign], tmp[!b->mask_sign],
+                                           bw4 * 4, bh4 * 4, seg_mask, b->mask_sign);
+            mask = seg_mask;
+            break;
+        case COMP_INTER_WEDGE:
+            mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx];
+            dsp->mc.mask(dst, f->cur.p.stride[0],
+                         tmp[b->mask_sign], tmp[!b->mask_sign],
+                         bw4 * 4, bh4 * 4, mask);
+            if (has_chroma)
+                mask = dav1d_wedge_masks[bs][chr_layout_idx][b->mask_sign][b->wedge_idx];
+            break;
+        }
+
+        // chroma
+        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+            for (int i = 0; i < 2; i++) {
+                const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
+                if (b->inter_mode == GLOBALMV_GLOBALMV &&
+                    imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
+                    f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
+                {
+                    warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor, b_dim, 1 + pl,
+                                refp, &f->frame_hdr.gmv[b->ref[i]]);
+                } else {
+                    mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
+                       1 + pl, b->mv[i], refp, filter_2d);
+                }
+            }
+            pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
+            switch (b->comp_type) {
+            case COMP_INTER_AVG:
+                dsp->mc.avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
+                            bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver);
+                break;
+            case COMP_INTER_WEIGHTED_AVG:
+                dsp->mc.w_avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
+                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight);
+                break;
+            case COMP_INTER_WEDGE:
+            case COMP_INTER_SEG:
+                dsp->mc.mask(uvdst, f->cur.p.stride[1],
+                             tmp[b->mask_sign], tmp[!b->mask_sign],
+                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask);
+                break;
+            }
+        }
+    }
+
+    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+        hex_dump(dst, f->cur.p.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
+        if (has_chroma) {
+            hex_dump(&((pixel *) f->cur.p.data[1])[uvdstoff], f->cur.p.stride[1],
+                     cbw4 * 4, cbh4 * 4, "u-pred");
+            hex_dump(&((pixel *) f->cur.p.data[2])[uvdstoff], f->cur.p.stride[1],
+                     cbw4 * 4, cbh4 * 4, "v-pred");
+        }
+    }
+
+    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+
+    if (b->skip) {
+        // reset coef contexts
+        memset(&t->a->lcoef[bx4], 0x40, w4);
+        memset(&t->l.lcoef[by4], 0x40, h4);
+        if (has_chroma) {
+            memset(&t->a->ccoef[0][cbx4], 0x40, cw4);
+            memset(&t->l.ccoef[0][cby4], 0x40, ch4);
+            memset(&t->a->ccoef[1][cbx4], 0x40, cw4);
+            memset(&t->l.ccoef[1][cby4], 0x40, ch4);
+        }
+        return;
+    }
+
+    const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
+    const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
+
+    for (int init_y = 0; init_y < bh4; init_y += 16) {
+        for (int init_x = 0; init_x < bw4; init_x += 16) {
+            // coefficient coding & inverse transforms
+            int y_off = !!init_y, y;
+            dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * init_y;
+            for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
+                 y += ytx->h, y_off++)
+            {
+                int x, x_off = !!init_x;
+                for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
+                     x += ytx->w, x_off++)
+                {
+                    read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
+                                   x_off, y_off, &dst[x * 4]);
+                    t->bx += ytx->w;
+                }
+                dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * ytx->h;
+                t->bx -= x;
+                t->by += ytx->h;
+            }
+            dst -= PXSTRIDE(f->cur.p.stride[0]) * 4 * y;
+            t->by -= y;
+
+            // chroma coefs and inverse transform
+            if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+                pixel *uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff +
+                    (PXSTRIDE(f->cur.p.stride[1]) * init_y * 4 >> ss_ver);
+                for (y = init_y >> ss_ver, t->by += init_y;
+                     y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
+                {
+                    int x;
+                    for (x = init_x >> ss_hor, t->bx += init_x;
+                         x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
+                    {
+                        coef *cf;
+                        int eob;
+                        enum TxfmType txtp;
+                        if (f->frame_thread.pass) {
+                            cf = ts->frame_thread.cf;
+                            ts->frame_thread.cf += uvtx->w * uvtx->h * 16;
+                            const struct CodedBlockInfo *const cbi =
+                                &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+                            eob = cbi->eob[1 + pl];
+                            txtp = cbi->txtp[1 + pl];
+                        } else {
+                            uint8_t cf_ctx;
+                            cf = t->cf;
+                            txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
+                                                bx4 + (x << ss_hor)];
+                            eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+                                               &t->l.ccoef[pl][cby4 + y],
+                                               b->uvtx, bs, b, 0, 1 + pl,
+                                               cf, &txtp, &cf_ctx);
+                            if (DEBUG_BLOCK_INFO)
+                                printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+                                       "txtp=%d,eob=%d]: r=%d\n",
+                                       pl, b->uvtx, txtp, eob, ts->msac.rng);
+                            memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
+                                   imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor));
+                            memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
+                                   imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver));
+                        }
+                        if (eob >= 0) {
+                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
+                            dsp->itx.itxfm_add[b->uvtx]
+                                              [txtp](&uvdst[4 * x],
+                                                     f->cur.p.stride[1],
+                                                     cf, eob);
+                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                hex_dump(&uvdst[4 * x], f->cur.p.stride[1],
+                                         uvtx->w * 4, uvtx->h * 4, "recon");
+                        }
+                        t->bx += uvtx->w << ss_hor;
+                    }
+                    uvdst += PXSTRIDE(f->cur.p.stride[1]) * 4 * uvtx->h;
+                    t->bx -= x << ss_hor;
+                    t->by += uvtx->h << ss_ver;
+                }
+                t->by -= y << ss_ver;
+            }
+        }
+    }
+}
+
+void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int sbsz = f->sb_step, sbh = f->sbh;
+
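+    // post-filters run per superblock row: deblocking loop filter first,
+    // then CDEF, then loop restoration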
+    if (f->frame_hdr.loopfilter.level_y[0] ||
+        f->frame_hdr.loopfilter.level_y[1])
+    {
+        int start_of_tile_row = 0;
+        if (f->frame_hdr.tiling.row_start_sb[f->lf.tile_row] == sby)
+            start_of_tile_row = f->lf.tile_row++;
+        bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby,
+                                       start_of_tile_row);
+    }
+
+    if (f->seq_hdr.restoration) {
+        // Store loop filtered pixels required by loop restoration
+        bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
+    }
+    if (f->seq_hdr.cdef) {
+        if (sby) {
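+            // CDEF of the previous sbrow's bottom two block rows was
+            // deferred (see n_blks below) until the rows beneath them had
+            // been loop-filtered; process them now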
+            pixel *p_up[3] = {
+                f->lf.p[0] - 8 * PXSTRIDE(f->cur.p.stride[0]),
+                f->lf.p[1] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver),
+                f->lf.p[2] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver),
+            };
+            bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr,
+                                    sby * sbsz - 2, sby * sbsz);
+        }
+        const int n_blks = sbsz - 2 * (sby + 1 < sbh);
+        bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
+                                imin(sby * sbsz + n_blks, f->bh));
+    }
+    if (f->seq_hdr.restoration) {
+        bytefn(dav1d_lr_sbrow)(f, f->lf.p, sby);
+    }
+
+    f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[0]);
+    f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
+    f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
+    f->lf.prev_mask_ptr = f->lf.mask_ptr;
+    if ((sby & 1) || f->seq_hdr.sb128) {
+        f->lf.mask_ptr += f->sb128w;
+    }
+}
+
+void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) {
+    const Dav1dFrameContext *const f = t->f;
+    Dav1dTileState *const ts = t->ts;
+    const int sby = t->by >> f->sb_shift;
+    const int sby_off = f->sb128w * 128 * sby;
+    const int x_off = ts->tiling.col_start;
+
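+    // save the bottom pixel row of this superblock row; it becomes the
+    // top_sb_edge that intra prediction in the next superblock row uses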
+    const pixel *const y =
+        ((const pixel *) f->cur.p.data[0]) + x_off * 4 +
+                    ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.p.stride[0]);
+    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
+               4 * (ts->tiling.col_end - x_off));
+
+    if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+        const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+
+        const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
+            (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.p.stride[1]);
+        for (int pl = 1; pl <= 2; pl++)
+            pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
+                       &((const pixel *) f->cur.p.data[pl])[uv_off],
+                       4 * (ts->tiling.col_end - x_off) >> ss_hor);
+    }
+}