shithub: dav1d

Download patch

ref: aca57bf3db00c29e90605656f1015561d1d67c2d
parent: 89ea92ba12bab0bd06a12828480af6bf809e065f
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Thu Mar 19 05:31:25 EDT 2020

Rewrite refmvs.c

Approximately 12.1% faster while using 27.7% less memory on the first
1000 frames of Chimera 8bit/1080p on a Haswell system, single-threaded.

--- a/src/decode.c
+++ b/src/decode.c
@@ -223,9 +223,7 @@
                               const int have_left, const int have_top,
                               const int ref, uint64_t masks[2])
 {
-    const Dav1dFrameContext *const f = t->f;
-    const ptrdiff_t b4_stride = f->b4_stride;
-    const refmvs *const r = &f->mvs[t->by * b4_stride + t->bx];
+    /*const*/ refmvs_block *const *r = &t->rt.r[(t->by & 31) + 5];
     int count = 0;
     int have_topleft = have_top && have_left;
     int have_topright = imax(bw4, bh4) < 32 &&
@@ -232,11 +230,11 @@
                         have_top && t->bx + bw4 < t->ts->tiling.col_end &&
                         (intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT);
 
-#define bs(rp) dav1d_block_dimensions[dav1d_sbtype_to_bs[(rp)->sb_type]]
-#define matches(rp) ((rp)->ref[0] == ref + 1 && (rp)->ref[1] == -1)
+#define bs(rp) dav1d_block_dimensions[(rp)->bs]
+#define matches(rp) ((rp)->ref.ref[0] == ref + 1 && (rp)->ref.ref[1] == -1)
 
     if (have_top) {
-        const refmvs *r2 = &r[-b4_stride];
+        const refmvs_block *r2 = &r[-1][t->bx];
         if (matches(r2)) {
             masks[0] |= 1;
             count = 1;
@@ -260,32 +258,32 @@
         }
     }
     if (have_left) {
-        const refmvs *r2 = &r[-1];
-        if (matches(r2)) {
+        /*const*/ refmvs_block *const *r2 = r;
+        if (matches(&r2[0][t->bx - 1])) {
             masks[1] |= 1;
             if (++count >= 8) return;
         }
-        int lh4 = bs(r2)[1];
+        int lh4 = bs(&r2[0][t->bx - 1])[1];
         if (lh4 >= bh4) {
             if (t->by & (lh4 - 1)) have_topleft = 0;
         } else {
             unsigned mask = 1 << lh4;
             for (int y = lh4; y < h4; y += lh4) {
-                r2 += lh4 * b4_stride;
-                if (matches(r2)) {
+                r2 += lh4;
+                if (matches(&r2[0][t->bx - 1])) {
                     masks[1] |= mask;
                     if (++count >= 8) return;
                 }
-                lh4 = bs(r2)[1];
+                lh4 = bs(&r2[0][t->bx - 1])[1];
                 mask <<= lh4;
             }
         }
     }
-    if (have_topleft && matches(&r[-(1 + b4_stride)])) {
+    if (have_topleft && matches(&r[-1][t->bx - 1])) {
         masks[1] |= 1ULL << 32;
         if (++count >= 8) return;
     }
-    if (have_topright && matches(&r[bw4 - b4_stride])) {
+    if (have_topright && matches(&r[-1][t->bx + bw4])) {
         masks[0] |= 1ULL << 32;
     }
 #undef matches
@@ -293,13 +291,11 @@
 
 static void derive_warpmv(const Dav1dTileContext *const t,
                           const int bw4, const int bh4,
-                          const uint64_t masks[2], const struct mv mv,
+                          const uint64_t masks[2], const union mv mv,
                           Dav1dWarpedMotionParams *const wmp)
 {
     int pts[8][2 /* in, out */][2 /* x, y */], np = 0;
-    const Dav1dFrameContext *const f = t->f;
-    const ptrdiff_t b4_stride = f->b4_stride;
-    const refmvs *const r = &f->mvs[t->by * b4_stride + t->bx];
+    /*const*/ refmvs_block *const *r = &t->rt.r[(t->by & 31) + 5];
 
 #define add_sample(dx, dy, sx, sy, rp) do { \
     pts[np][0][0] = 16 * (2 * dx + sx * bs(rp)[0]) - 8; \
@@ -311,29 +307,29 @@
 
     // use masks[] to find the projectable motion vectors in the edges
     if ((unsigned) masks[0] == 1 && !(masks[1] >> 32)) {
-        const int off = t->bx & (bs(&r[-b4_stride])[0] - 1);
-        add_sample(-off, 0, 1, -1, &r[-b4_stride]);
+        const int off = t->bx & (bs(&r[-1][t->bx])[0] - 1);
+        add_sample(-off, 0, 1, -1, &r[-1][t->bx]);
     } else for (unsigned off = 0, xmask = (uint32_t) masks[0]; np < 8 && xmask;) { // top
         const int tz = ctz(xmask);
         off += tz;
         xmask >>= tz;
-        add_sample(off, 0, 1, -1, &r[off - b4_stride]);
+        add_sample(off, 0, 1, -1, &r[-1][t->bx + off]);
         xmask &= ~1;
     }
     if (np < 8 && masks[1] == 1) {
-        const int off = t->by & (bs(&r[-1])[1] - 1);
-        add_sample(0, -off, -1, 1, &r[-1 - off * b4_stride]);
+        const int off = t->by & (bs(&r[0][t->bx - 1])[1] - 1);
+        add_sample(0, -off, -1, 1, &r[-off][t->bx - 1]);
     } else for (unsigned off = 0, ymask = (uint32_t) masks[1]; np < 8 && ymask;) { // left
         const int tz = ctz(ymask);
         off += tz;
         ymask >>= tz;
-        add_sample(0, off, -1, 1, &r[off * b4_stride - 1]);
+        add_sample(0, off, -1, 1, &r[off][t->bx - 1]);
         ymask &= ~1;
     }
     if (np < 8 && masks[1] >> 32) // top/left
-        add_sample(0, 0, -1, -1, &r[-(1 + b4_stride)]);
+        add_sample(0, 0, -1, -1, &r[-1][t->bx - 1]);
     if (np < 8 && masks[0] >> 32) // top/right
-        add_sample(bw4, 0, 1, -1, &r[bw4 - b4_stride]);
+        add_sample(bw4, 0, 1, -1, &r[-1][t->bx + bw4]);
     assert(np > 0 && np <= 8);
 #undef bs
 
@@ -625,7 +621,7 @@
     const int bw4 = b_dim[0], bh4 = b_dim[1];
 
     // var-tx tree coding
-    b->tx_split[0] = b->tx_split[1] = 0;
+    uint16_t tx_split[2] = { 0 };
     b->max_ytx = dav1d_max_txfm_size_for_bs[bs][0];
     if (!b->skip && (f->frame_hdr->segmentation.lossless[b->seg_id] ||
                      b->max_ytx == TX_4X4))
@@ -653,7 +649,7 @@
         const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
         for (y = 0, y_off = 0; y < bh4; y += ytx->h, y_off++) {
             for (x = 0, x_off = 0; x < bw4; x += ytx->w, x_off++) {
-                read_tx_tree(t, b->max_ytx, 0, b->tx_split, x_off, y_off);
+                read_tx_tree(t, b->max_ytx, 0, tx_split, x_off, y_off);
                 // contexts are updated inside read_tx_tree()
                 t->bx += ytx->w;
             }
@@ -663,9 +659,12 @@
         t->by -= y;
         if (DEBUG_BLOCK_INFO)
             printf("Post-vartxtree[%x/%x]: r=%d\n",
-                   b->tx_split[0], b->tx_split[1], t->ts->msac.rng);
+                   tx_split[0], tx_split[1], t->ts->msac.rng);
         b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
     }
+    assert(!(tx_split[0] & ~0x33));
+    b->tx_split0 = tx_split[0];
+    b->tx_split1 = tx_split[1];
 }
 
 static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
@@ -729,6 +728,18 @@
             case_set(bh4, l., 1, by4);
             case_set(bw4, a->, 0, bx4);
 #undef set_ctx
+            if (f->frame_hdr->frame_type & 1) {
+                refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
+                for (int x = 0; x < bw4; x++) {
+                    r[x].ref.ref[0] = 0;
+                    r[x].bs = bs;
+                }
+                refmvs_block *const *rr = &t->rt.r[(t->by & 31) + 5];
+                for (int y = 0; y < bh4 - 1; y++) {
+                    rr[y][t->bx + bw4 - 1].ref.ref[0] = 0;
+                    rr[y][t->bx + bw4 - 1].bs = bs;
+                }
+            }
 
             if (has_chroma) {
 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
@@ -741,10 +752,34 @@
             if (f->frame_hdr->frame_type & 1 /* not intrabc */ &&
                 b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP)
             {
-                uint64_t mask[2] = { 0, 0 };
-                find_matching_ref(t, intra_edge_flags, bw4, bh4, w4, h4,
-                                  have_left, have_top, b->ref[0], mask);
-                derive_warpmv(t, bw4, bh4, mask, b->mv[0], &t->warpmv);
+                if (b->matrix[0] == SHRT_MIN) {
+                    t->warpmv.type = DAV1D_WM_TYPE_IDENTITY;
+                } else {
+                    t->warpmv.type = DAV1D_WM_TYPE_AFFINE;
+                    t->warpmv.matrix[2] = b->matrix[0] + 0x10000;
+                    t->warpmv.matrix[3] = b->matrix[1];
+                    t->warpmv.matrix[4] = b->matrix[2];
+                    t->warpmv.matrix[5] = b->matrix[3] + 0x10000;
+                    dav1d_set_affine_mv2d(bw4, bh4, b->mv2d, &t->warpmv,
+                                          t->bx, t->by);
+                    dav1d_get_shear_params(&t->warpmv);
+#define signabs(v) v < 0 ? '-' : ' ', abs(v)
+                    if (DEBUG_BLOCK_INFO)
+                        printf("[ %c%x %c%x %c%x\n  %c%x %c%x %c%x ]\n"
+                               "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x, mv=y:%d,x:%d\n",
+                               signabs(t->warpmv.matrix[0]),
+                               signabs(t->warpmv.matrix[1]),
+                               signabs(t->warpmv.matrix[2]),
+                               signabs(t->warpmv.matrix[3]),
+                               signabs(t->warpmv.matrix[4]),
+                               signabs(t->warpmv.matrix[5]),
+                               signabs(t->warpmv.alpha),
+                               signabs(t->warpmv.beta),
+                               signabs(t->warpmv.gamma),
+                               signabs(t->warpmv.delta),
+                               b->mv2d.y, b->mv2d.x);
+#undef signabs
+                }
             }
             if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
 
@@ -757,6 +792,21 @@
             case_set(bw4, a->, 0, bx4);
 #undef set_ctx
 
+            if (f->frame_hdr->frame_type & 1) {
+                refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
+                for (int x = 0; x < bw4; x++) {
+                    r[x].ref.ref[0] = b->ref[0] + 1;
+                    r[x].mv[0] = b->mv[0];
+                    r[x].bs = bs;
+                }
+                refmvs_block *const *rr = &t->rt.r[(t->by & 31) + 5];
+                for (int y = 0; y < bh4 - 1; y++) {
+                    rr[y][t->bx + bw4 - 1].ref.ref[0] = b->ref[0] + 1;
+                    rr[y][t->bx + bw4 - 1].mv[0] = b->mv[0];
+                    rr[y][t->bx + bw4 - 1].bs = bs;
+                }
+            }
+
             if (has_chroma) {
 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
                 rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
@@ -1245,24 +1295,20 @@
             }
         }
         if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
-            splat_intraref(f->mvs, f->b4_stride, t->by, t->bx, bs,
-                           y_mode_nofilt);
+            splat_intraref(&t->rt, t->by, t->bx, bs);
         }
     } else if (!(f->frame_hdr->frame_type & 1)) {
         // intra block copy
-        candidate_mv mvstack[8];
-        int n_mvs;
-        mv mvlist[2][2];
-        dav1d_find_ref_mvs(mvstack, &n_mvs, mvlist, NULL,
-                           (int[2]) { -1, -1 }, f->bw, f->bh,
-                           bs, bp, t->by, t->bx, ts->tiling.col_start,
-                           ts->tiling.col_end, ts->tiling.row_start,
-                           ts->tiling.row_end, f->libaom_cm);
+        refmvs_candidate mvstack[8];
+        int n_mvs, ctx;
+        dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+                          (union refmvs_refpair) { .ref = { 0, -1 }},
+                          bs, intra_edge_flags, t->by, t->bx);
 
-        if (mvlist[0][0].y | mvlist[0][0].x)
-            b->mv[0] = mvlist[0][0];
-        else if (mvlist[0][1].y | mvlist[0][1].x)
-            b->mv[0] = mvlist[0][1];
+        if (mvstack[0].mv[0].n)
+            b->mv[0] = mvstack[0].mv[0];
+        else if (mvstack[1].mv[0].n)
+            b->mv[0] = mvstack[1].mv[0];
         else {
             if (t->by - (16 << f->seq_hdr->sb128) < ts->tiling.row_start) {
                 b->mv[0].y = 0;
@@ -1273,7 +1319,7 @@
             }
         }
 
-        const struct mv ref = b->mv[0];
+        const union mv ref = b->mv[0];
         read_mv_residual(t, &b->mv[0], &ts->cdf.dmv, 0);
 
         // clip intrabc motion vector to decoded parts of current tile
@@ -1335,7 +1381,7 @@
         if (DEBUG_BLOCK_INFO)
             printf("Post-dmv[%d/%d,ref=%d/%d|%d/%d]: r=%d\n",
                    b->mv[0].y, b->mv[0].x, ref.y, ref.x,
-                   mvlist[0][0].y, mvlist[0][0].x, ts->msac.rng);
+                   mvstack[0].mv[0].y, mvstack[0].mv[0].x, ts->msac.rng);
         read_vartx_tree(t, b, bs, bx4, by4);
 
         // reconstruction
@@ -1346,7 +1392,7 @@
             if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
         }
 
-        splat_intrabc_mv(f->mvs, f->b4_stride, t->by, t->bx, bs, b->mv[0]);
+        splat_intrabc_mv(&t->rt, t->by, t->bx, bs, b->mv[0]);
 
 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
         rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
@@ -1395,17 +1441,15 @@
             b->drl_idx = NEAREST_DRL;
             has_subpel_filter = 0;
 
-            candidate_mv mvstack[8];
+            refmvs_candidate mvstack[8];
             int n_mvs, ctx;
-            mv mvlist[2][2];
-            dav1d_find_ref_mvs(mvstack, &n_mvs, mvlist, &ctx,
-                               (int[2]) { b->ref[0], b->ref[1] }, f->bw, f->bh,
-                               bs, bp, t->by, t->bx, ts->tiling.col_start,
-                               ts->tiling.col_end, ts->tiling.row_start,
-                               ts->tiling.row_end, f->libaom_cm);
+            dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+                              (union refmvs_refpair) { .ref = {
+                                    b->ref[0] + 1, b->ref[1] + 1 }},
+                              bs, intra_edge_flags, t->by, t->bx);
 
-            b->mv[0] = mvstack[0].this_mv;
-            b->mv[1] = mvstack[0].comp_mv;
+            b->mv[0] = mvstack[0].mv[0];
+            b->mv[1] = mvstack[0].mv[1];
             fix_mv_precision(f->frame_hdr, &b->mv[0]);
             fix_mv_precision(f->frame_hdr, &b->mv[1]);
             if (DEBUG_BLOCK_INFO)
@@ -1475,14 +1519,12 @@
                 printf("Post-refs[%d/%d]: r=%d\n",
                        b->ref[0], b->ref[1], ts->msac.rng);
 
-            candidate_mv mvstack[8];
+            refmvs_candidate mvstack[8];
             int n_mvs, ctx;
-            mv mvlist[2][2];
-            dav1d_find_ref_mvs(mvstack, &n_mvs, mvlist, &ctx,
-                               (int[2]) { b->ref[0], b->ref[1] }, f->bw, f->bh,
-                               bs, bp, t->by, t->bx, ts->tiling.col_start,
-                               ts->tiling.col_end, ts->tiling.row_start,
-                               ts->tiling.row_end, f->libaom_cm);
+            dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+                              (union refmvs_refpair) { .ref = {
+                                    b->ref[0] + 1, b->ref[1] + 1 }},
+                              bs, intra_edge_flags, t->by, t->bx);
 
             b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                 ts->cdf.m.comp_inter_mode[ctx],
@@ -1525,11 +1567,11 @@
             }
             assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
 
-#define assign_comp_mv(idx, pfx) \
+#define assign_comp_mv(idx) \
             switch (im[idx]) { \
             case NEARMV: \
             case NEARESTMV: \
-                b->mv[idx] = mvstack[b->drl_idx].pfx##_mv; \
+                b->mv[idx] = mvstack[b->drl_idx].mv[idx]; \
                 fix_mv_precision(f->frame_hdr, &b->mv[idx]); \
                 break; \
             case GLOBALMV: \
@@ -1540,7 +1582,7 @@
                 fix_mv_precision(f->frame_hdr, &b->mv[idx]); \
                 break; \
             case NEWMV: \
-                b->mv[idx] = mvstack[b->drl_idx].pfx##_mv; \
+                b->mv[idx] = mvstack[b->drl_idx].mv[idx]; \
                 read_mv_residual(t, &b->mv[idx], &ts->cdf.mv, \
                                  !f->frame_hdr->force_integer_mv); \
                 break; \
@@ -1547,8 +1589,8 @@
             }
             has_subpel_filter = imin(bw4, bh4) == 1 ||
                                 b->inter_mode != GLOBALMV_GLOBALMV;
-            assign_comp_mv(0, this);
-            assign_comp_mv(1, comp);
+            assign_comp_mv(0);
+            assign_comp_mv(1);
 #undef assign_comp_mv
             if (DEBUG_BLOCK_INFO)
                 printf("Post-residual_mv[1:y=%d,x=%d,2:y=%d,x=%d]: r=%d\n",
@@ -1653,14 +1695,11 @@
             }
             b->ref[1] = -1;
 
-            candidate_mv mvstack[8];
+            refmvs_candidate mvstack[8];
             int n_mvs, ctx;
-            mv mvlist[2][2];
-            dav1d_find_ref_mvs(mvstack, &n_mvs, mvlist, &ctx,
-                               (int[2]) { b->ref[0], -1 }, f->bw, f->bh, bs, bp,
-                               t->by, t->bx, ts->tiling.col_start,
-                               ts->tiling.col_end, ts->tiling.row_start,
-                               ts->tiling.row_end, f->libaom_cm);
+            dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+                              (union refmvs_refpair) { .ref = { b->ref[0] + 1, -1 }},
+                              bs, intra_edge_flags, t->by, t->bx);
 
             // mode parsing and mv derivation from ref_mvs
             if ((seg && (seg->skip || seg->globalmv)) ||
@@ -1700,12 +1739,9 @@
                         b->drl_idx = NEAREST_DRL;
                     }
                     assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
-                    if (b->drl_idx >= NEAR_DRL) {
-                        b->mv[0] = mvstack[b->drl_idx].this_mv;
-                    } else {
-                        b->mv[0] = mvlist[0][b->drl_idx];
+                    b->mv[0] = mvstack[b->drl_idx].mv[0];
+                    if (b->drl_idx < NEAR_DRL)
                         fix_mv_precision(f->frame_hdr, &b->mv[0]);
-                    }
                 }
 
                 if (DEBUG_BLOCK_INFO)
@@ -1728,10 +1764,10 @@
                 }
                 assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
                 if (n_mvs > 1) {
-                    b->mv[0] = mvstack[b->drl_idx].this_mv;
+                    b->mv[0] = mvstack[b->drl_idx].mv[0];
                 } else {
                     assert(!b->drl_idx);
-                    b->mv[0] = mvlist[0][0];
+                    b->mv[0] = mvstack[0].mv[0];
                     fix_mv_precision(f->frame_hdr, &b->mv[0]);
                 }
                 if (DEBUG_BLOCK_INFO)
@@ -1801,7 +1837,8 @@
 #define signabs(v) v < 0 ? '-' : ' ', abs(v)
                     if (DEBUG_BLOCK_INFO)
                         printf("[ %c%x %c%x %c%x\n  %c%x %c%x %c%x ]\n"
-                               "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x\n",
+                               "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x, "
+                               "mv=y:%d,x:%d\n",
                                signabs(t->warpmv.matrix[0]),
                                signabs(t->warpmv.matrix[1]),
                                signabs(t->warpmv.matrix[2]),
@@ -1811,8 +1848,19 @@
                                signabs(t->warpmv.alpha),
                                signabs(t->warpmv.beta),
                                signabs(t->warpmv.gamma),
-                               signabs(t->warpmv.delta));
+                               signabs(t->warpmv.delta),
+                               b->mv[0].y, b->mv[0].x);
 #undef signabs
+                    if (f->frame_thread.pass) {
+                        if (t->warpmv.type == DAV1D_WM_TYPE_AFFINE) {
+                            b->matrix[0] = t->warpmv.matrix[2] - 0x10000;
+                            b->matrix[1] = t->warpmv.matrix[3];
+                            b->matrix[2] = t->warpmv.matrix[4];
+                            b->matrix[3] = t->warpmv.matrix[5] - 0x10000;
+                        } else {
+                            b->matrix[0] = SHRT_MIN;
+                        }
+                    }
                 }
 
                 if (DEBUG_BLOCK_INFO)
@@ -1876,11 +1924,12 @@
                 b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
             const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
                 &ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
+            const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
             dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride, lf_lvls,
                                        t->bx, t->by, f->w4, f->h4, b->skip, bs,
                                        f->frame_hdr->segmentation.lossless[b->seg_id] ?
                                            (enum RectTxfmSize) TX_4X4 : b->max_ytx,
-                                       b->tx_split, b->uvtx, f->cur.p.layout,
+                                       tx_split, b->uvtx, f->cur.p.layout,
                                        &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
                                        has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
                                        has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
@@ -1888,13 +1937,11 @@
 
         // context updates
         if (is_comp) {
-            splat_tworef_mv(f->mvs, f->b4_stride, t->by, t->bx, bs,
-                            b->inter_mode, b->ref[0], b->ref[1],
-                            b->mv[0], b->mv[1]);
+            splat_tworef_mv(&t->rt, t->by, t->bx, bs, b->inter_mode,
+                            b->ref[0], b->ref[1], b->mv);
         } else {
-            splat_oneref_mv(f->mvs, f->b4_stride, t->by, t->bx, bs,
-                            b->inter_mode, b->ref[0], b->mv[0],
-                            b->interintra_type);
+            splat_oneref_mv(&t->rt, t->by, t->bx, bs, b->inter_mode,
+                            b->ref[0], b->mv[0], b->interintra_type);
         }
 
 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
@@ -2470,6 +2517,13 @@
     const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
     const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
 
+    if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
+        dav1d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start,
+                                     ts->tiling.col_end, ts->tiling.row_start,
+                                     ts->tiling.row_end, t->by >> f->sb_shift,
+                                     ts->tiling.row);
+    }
+
     reset_context(&t->l, !(f->frame_hdr->frame_type & 1), f->frame_thread.pass);
     if (f->frame_thread.pass == 2) {
         for (t->bx = ts->tiling.col_start,
@@ -2490,16 +2544,16 @@
     // error out on symbol decoder overread
     if (ts->msac.cnt < -15) return 1;
 
-    if (c->n_fc > 1 && f->frame_hdr->use_ref_frame_mvs) {
-        for (int n = 0; n < 7; n++)
+    if (f->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) {
+        if (c->n_fc > 1) for (int n = 0; n < 7; n++)
             if (dav1d_thread_picture_wait(&f->refp[n], 4 * (t->by + sb_step),
                                           PLANE_TYPE_BLOCK))
             {
                 return 1;
             }
-        dav1d_init_ref_mv_tile_row(f->libaom_cm,
-                                   ts->tiling.col_start, ts->tiling.col_end,
-                                   t->by, imin(t->by + sb_step, f->bh));
+        dav1d_refmvs_load_tmvs(&f->rf, ts->tiling.row,
+                               ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
+                               t->by >> 1, (t->by + sb_step) >> 1);
     }
     memset(t->pal_sz_uv[1], 0, sizeof(*t->pal_sz_uv));
     const int sb128y = t->by >> 5;
@@ -2581,6 +2635,12 @@
         }
     }
 
+    if (f->n_tc > 1 && f->frame_hdr->frame_type & 1) {
+        dav1d_refmvs_save_tmvs(&t->rt,
+                               ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
+                               t->by >> 1, (t->by + sb_step) >> 1);
+    }
+
     // backup pre-loopfilter pixels for intra prediction of the next sbrow
     if (f->frame_thread.pass != 1)
         f->bd_fn.backup_ipred_edge(t);
@@ -2894,20 +2954,10 @@
 
     // init ref mvs
     if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
-        f->mvs = f->mvs_ref->data;
-        const int order_hint_n_bits = f->seq_hdr->order_hint * f->seq_hdr->order_hint_n_bits;
-        const int ret = dav1d_init_ref_mv_common(f->libaom_cm, f->bw >> 1, f->bh >> 1,
-                                                 f->b4_stride, f->seq_hdr->sb128,
-                                                 f->mvs, f->ref_mvs,
-                                                 f->cur.frame_hdr->frame_offset,
-                                                 f->refpoc,
-                                                 f->refrefpoc, f->frame_hdr->gmv,
-                                                 f->frame_hdr->hp, f->frame_hdr->force_integer_mv,
-                                                 f->frame_hdr->use_ref_frame_mvs,
-                                                 order_hint_n_bits);
+        const int ret =
+            dav1d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr,
+                                    f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs, f->n_tc);
         if (ret < 0) goto error;
-        if (c->n_fc == 1 && f->frame_hdr->use_ref_frame_mvs)
-            dav1d_init_ref_mv_tile_row(f->libaom_cm, 0, f->bw, 0, f->bh);
     }
     retval = DAV1D_ERR(EINVAL);
 
@@ -3048,11 +3098,26 @@
                      sby < sbh_end; sby++)
                 {
                     t->by = sby << (4 + f->seq_hdr->sb128);
+                    const int by_end = (t->by + f->sb_step) >> 1;
+                    if (f->frame_thread.pass <= 1 && f->frame_hdr->use_ref_frame_mvs) {
+                        if (c->n_fc > 1) for (int n = 0; n < 7; n++)
+                            if (dav1d_thread_picture_wait(&f->refp[n],
+                                                          4 * (t->by + f->sb_step),
+                                                          PLANE_TYPE_BLOCK))
+                            {
+                                return 1;
+                            }
+                        dav1d_refmvs_load_tmvs(&f->rf, tile_row,
+                                               0, f->bw >> 1, t->by >> 1, by_end);
+                    }
                     for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
                         t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
 
                         if (dav1d_decode_tile_sbrow(t)) goto error;
                     }
+                    if (f->frame_thread.pass <= 1 && f->frame_hdr->frame_type & 1) {
+                        dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end);
+                    }
 
                     // loopfilter + cdef + restoration
                     if (f->frame_thread.pass != 1)
@@ -3401,7 +3466,7 @@
 
     // ref_mvs
     if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
-        f->mvs_ref = dav1d_ref_create(f->sb128h * 32 * f->b4_stride *
+        f->mvs_ref = dav1d_ref_create(f->sb128h * 16 * (f->b4_stride >> 1) *
                                       sizeof(*f->mvs));
         if (!f->mvs_ref) {
             res = DAV1D_ERR(ENOMEM);
--- a/src/env.h
+++ b/src/env.h
@@ -33,7 +33,7 @@
 #include <stdlib.h>
 
 #include "src/levels.h"
-#include "src/ref_mvs.h"
+#include "src/refmvs.h"
 #include "src/tables.h"
 
 typedef struct BlockContext {
@@ -428,7 +428,7 @@
     return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
 }
 
-static inline int get_drl_context(const candidate_mv *const ref_mv_stack,
+static inline int get_drl_context(const refmvs_candidate *const ref_mv_stack,
                                   const int ref_idx)
 {
     if (ref_mv_stack[ref_idx].weight >= 640)
--- a/src/internal.h
+++ b/src/internal.h
@@ -54,7 +54,7 @@
 #include "src/msac.h"
 #include "src/picture.h"
 #include "src/recon.h"
-#include "src/ref_mvs.h"
+#include "src/refmvs.h"
 #include "src/thread.h"
 
 typedef struct Dav1dDSPContext {
@@ -146,7 +146,7 @@
     Dav1dPicture cur; // during block coding / reconstruction
     Dav1dThreadPicture sr_cur; // after super-resolution upscaling
     Dav1dRef *mvs_ref;
-    refmvs *mvs, *ref_mvs[7];
+    refmvs_temporal_block *mvs, *ref_mvs[7];
     Dav1dRef *ref_mvs_ref[7];
     Dav1dRef *cur_segmap_ref, *prev_segmap_ref;
     uint8_t *cur_segmap;
@@ -187,7 +187,7 @@
     const uint8_t *qm[2 /* is_1d */][N_RECT_TX_SIZES][3 /* plane */];
     BlockContext *a;
     int a_sz /* w*tile_rows */;
-    AV1_COMMON *libaom_cm; // FIXME
+    refmvs_frame rf;
     uint8_t jnt_weights[7][7];
     int bitdepth_max;
 
@@ -290,6 +290,7 @@
     uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
     uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
     uint8_t txtp_map[32 * 32]; // inter-only
+    refmvs_tile rt;
     ALIGN(union, 64) {
         struct {
             union {
--- a/src/levels.h
+++ b/src/levels.h
@@ -243,8 +243,11 @@
     INTER_INTRA_WEDGE,
 };
 
-typedef struct mv {
-    int16_t y, x;
+typedef union mv {
+    struct {
+        int16_t y, x;
+    };
+    uint32_t n;
 } mv;
 
 enum MotionMode {
@@ -264,12 +267,20 @@
             int8_t y_angle, uv_angle, cfl_alpha[2];
         }; // intra
         struct {
+            union {
+                struct {
+                    union mv mv[2];
+                    uint8_t wedge_idx, mask_sign, interintra_mode;
+                };
+                struct {
+                    union mv mv2d;
+                    int16_t matrix[4];
+                };
+            };
+            uint8_t comp_type, inter_mode, motion_mode, drl_idx;
             int8_t ref[2];
-            uint8_t comp_type, wedge_idx, mask_sign, inter_mode, drl_idx;
-            uint8_t interintra_type, interintra_mode, motion_mode;
-            uint8_t max_ytx, filter2d;
-            uint16_t tx_split[2];
-            mv mv[2];
+            uint8_t max_ytx, filter2d, interintra_type, tx_split0;
+            uint16_t tx_split1;
         }; // inter
     };
 } Av1Block;
--- a/src/lib.c
+++ b/src/lib.c
@@ -191,8 +191,7 @@
                 t->tile_thread.td.inited = 1;
             }
         }
-        f->libaom_cm = dav1d_alloc_ref_mv_common();
-        if (!f->libaom_cm) goto error;
+        dav1d_refmvs_init(&f->rf);
         if (c->n_fc > 1) {
             if (pthread_mutex_init(&f->frame_thread.td.lock, NULL)) goto error;
             if (pthread_cond_init(&f->frame_thread.td.cond, NULL)) {
@@ -544,7 +543,7 @@
         free(f->lf.lr_mask);
         free(f->lf.level);
         free(f->lf.tx_lpf_right_edge[0]);
-        if (f->libaom_cm) dav1d_free_ref_mv_common(f->libaom_cm);
+        dav1d_refmvs_clear(&f->rf);
         dav1d_free_aligned(f->lf.cdef_line_buf);
         dav1d_free_aligned(f->lf.lr_lpf_line[0]);
     }
--- a/src/meson.build
+++ b/src/meson.build
@@ -43,7 +43,7 @@
     'picture.c',
     'qm.c',
     'ref.c',
-    'ref_mvs.c',
+    'refmvs.c',
     'scan.c',
     'tables.c',
     'warpmv.c',
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -774,6 +774,7 @@
     assert(!b->skip);
     const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
     const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
+    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
 
     for (int init_y = 0; init_y < h4; init_y += 16) {
         for (int init_x = 0; init_x < w4; init_x += 16) {
@@ -790,7 +791,7 @@
                      x += t_dim->w, t->bx += t_dim->w, x_off++)
                 {
                     if (!b->intra) {
-                        read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
+                        read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
                                        x_off, y_off, NULL);
                     } else {
                         uint8_t cf_ctx = 0x40;
@@ -998,7 +999,7 @@
 {
     assert(!(t->bx & 1) && !(t->by & 1));
     const Dav1dFrameContext *const f = t->f;
-    const refmvs *const r = &f->mvs[t->by * f->b4_stride + t->bx];
+    /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
     pixel *const lap = bitfn(t->scratch.lap);
     const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
@@ -1010,16 +1011,15 @@
     {
         for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
             // only odd blocks are considered for overlap handling, hence +1
-            const refmvs *const a_r = &r[x - f->b4_stride + 1];
-            const uint8_t *const a_b_dim =
-                dav1d_block_dimensions[dav1d_sbtype_to_bs[a_r->sb_type]];
+            const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
+            const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
 
-            if (a_r->ref[0] > 0) {
+            if (a_r->ref.ref[0] > 0) {
                 const int ow4 = iclip(a_b_dim[0], 2, b_dim[0]);
                 const int oh4 = imin(b_dim[1], 16) >> 1;
                 res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
                          t->bx + x, t->by, pl, a_r->mv[0],
-                         &f->refp[a_r->ref[0] - 1], a_r->ref[0] - 1,
+                         &f->refp[a_r->ref.ref[0] - 1], a_r->ref.ref[0] - 1,
                          dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
                 if (res) return res;
                 f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
@@ -1033,16 +1033,15 @@
     if (t->bx > t->ts->tiling.col_start)
         for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
             // only odd blocks are considered for overlap handling, hence +1
-            const refmvs *const l_r = &r[(y + 1) * f->b4_stride - 1];
-            const uint8_t *const l_b_dim =
-                dav1d_block_dimensions[dav1d_sbtype_to_bs[l_r->sb_type]];
+            const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
+            const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
 
-            if (l_r->ref[0] > 0) {
+            if (l_r->ref.ref[0] > 0) {
                 const int ow4 = imin(b_dim[0], 16) >> 1;
                 const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]);
                 res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
                          t->bx, t->by + y, pl, l_r->mv[0],
-                         &f->refp[l_r->ref[0] - 1], l_r->ref[0] - 1,
+                         &f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
                          dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
                 if (res) return res;
                 f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
@@ -1613,14 +1612,14 @@
 
         // sub8x8 derivation
         int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
-        refmvs *r;
+        refmvs_block *const *r;
         if (is_sub8x8) {
             assert(ss_hor == 1);
-            r = &f->mvs[t->by * f->b4_stride + t->bx];
-            if (bw4 == 1) is_sub8x8 &= r[-1].ref[0] > 0;
-            if (bh4 == ss_ver) is_sub8x8 &= r[-f->b4_stride].ref[0] > 0;
+            r = &t->rt.r[(t->by & 31) + 5];
+            if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
+            if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
             if (bw4 == 1 && bh4 == ss_ver)
-                is_sub8x8 &= r[-(1 + f->b4_stride)].ref[0] > 0;
+                is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
         }
 
         // chroma prediction
@@ -1632,9 +1631,9 @@
                     res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
                              NULL, f->cur.stride[1],
                              bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
-                             r[-(f->b4_stride + 1)].mv[0],
-                             &f->refp[r[-(f->b4_stride + 1)].ref[0] - 1],
-                             r[-(f->b4_stride + 1)].ref[0] - 1,
+                             r[-1][t->bx - 1].mv[0],
+                             &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
+                             r[-1][t->bx - 1].ref.ref[0] - 1,
                              f->frame_thread.pass != 2 ? t->tl_4x4_filter :
                                  f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
                     if (res) return res;
@@ -1648,8 +1647,9 @@
                 for (int pl = 0; pl < 2; pl++) {
                     res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
                              f->cur.stride[1], bw4, bh4, t->bx - 1,
-                             t->by, 1 + pl, r[-1].mv[0], &f->refp[r[-1].ref[0] - 1],
-                             r[-1].ref[0] - 1,
+                             t->by, 1 + pl, r[0][t->bx - 1].mv[0],
+                             &f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
+                             r[0][t->bx - 1].ref.ref[0] - 1,
                              f->frame_thread.pass != 2 ? left_filter_2d :
                                  f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
                     if (res) return res;
@@ -1662,9 +1662,9 @@
                 for (int pl = 0; pl < 2; pl++) {
                     res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
                              f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
-                             1 + pl, r[-f->b4_stride].mv[0],
-                             &f->refp[r[-f->b4_stride].ref[0] - 1],
-                             r[-f->b4_stride].ref[0] - 1,
+                             1 + pl, r[-1][t->bx].mv[0],
+                             &f->refp[r[-1][t->bx].ref.ref[0] - 1],
+                             r[-1][t->bx].ref.ref[0] - 1,
                              f->frame_thread.pass != 2 ? top_filter_2d :
                                  f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
                     if (res) return res;
@@ -1870,6 +1870,7 @@
 
     const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
     const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
+    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
 
     for (int init_y = 0; init_y < bh4; init_y += 16) {
         for (int init_x = 0; init_x < bw4; init_x += 16) {
@@ -1883,7 +1884,7 @@
                 for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
                      x += ytx->w, x_off++)
                 {
-                    read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
+                    read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
                                    x_off, y_off, &dst[x * 4]);
                     t->bx += ytx->w;
                 }
--- a/src/ref_mvs.c
+++ /dev/null
@@ -1,2104 +1,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/*
- * Changes made compared to libaom version:
- * - we disable TMV and enable MV_COMPRESS so that the
- *   input array for prev_frames can be at 4x4 instead of
- *   8x8 resolution, and therefore shared between cur_frame
- *   and prev_frame. To make enc/dec behave consistent, we
- *   also make this change around line 2580:
-#if 0
-                AOMMIN(((mi_row >> 1) << 1) + 1 + (((xd->n8_h - 1) >> 1) << 1),
-                       mi_row_end - 1) *
-                    prev_frame_mvs_stride +
-                AOMMIN(((mi_col >> 1) << 1) + 1 + (((xd->n8_w - 1) >> 1) << 1),
-                       mi_col_end - 1)
-#else
-                (((mi_row >> 1) << 1) + 1) * prev_frame_mvs_stride +
-                (((mi_col >> 1) << 1) + 1)
-#endif
- *   and the same change (swap mi_cols from prev_frame.mv_stride) on line 2407
- * - we disable rect-block overhanging edge inclusion (see
- *   line 2642):
-  if (num_8x8_blocks_wide == num_8x8_blocks_high || 1) {
-    mv_ref_search[5].row = -1;
-    mv_ref_search[5].col = 0;
-    mv_ref_search[6].row = 0;
-    mv_ref_search[6].col = -1;
-  } else {
-    mv_ref_search[5].row = -1;
-    mv_ref_search[5].col = num_8x8_blocks_wide;
-    mv_ref_search[6].row = num_8x8_blocks_high;
-    mv_ref_search[6].col = -1;
-  }
- *   Note that this is a bitstream change and needs the same
- *   change on the decoder side also.
- * - we change xd->mi to be a pointer instead of a double ptr.
- */
-
-#include "config.h"
-
-#include <errno.h>
-#include <limits.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "dav1d/common.h"
-
-#include "common/intops.h"
-
-#define av1_zero(a) memset(a, 0, sizeof(a))
-
-#define ATTRIBUTE_PACKED
-#define INLINE inline
-#define IMPLIES(a, b) (!(a) || (b))  //  Logical 'a implies b' (or 'a -> b')
-
-#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
-#define ROUND_POWER_OF_TWO_SIGNED(value, n)           \
-  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
-                 : ROUND_POWER_OF_TWO((value), (n)))
-#define NELEMENTS(x) (int)(sizeof(x) / sizeof(x[0]))
-
-#define MAX_MV_REF_CANDIDATES 2
-
-#define MAX_REF_MV_STACK_SIZE 8
-#define REF_CAT_LEVEL 640
-
-#define FRAME_OFFSET_BITS 5
-#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1)
-#define INVALID_MV 0x80008000
-
-#define COMP_NEWMV_CTXS 5
-#define REFMV_OFFSET 4
-#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1)
-
-#define MV_IN_USE_BITS 14
-#define MV_UPP (1 << MV_IN_USE_BITS)
-#define MV_LOW (-(1 << MV_IN_USE_BITS))
-
-typedef struct MV {
-    int16_t row;
-    int16_t col;
-} MV;
-typedef union int_mv {
-    uint32_t as_int;
-    MV as_mv;
-} int_mv;
-typedef int8_t MV_REFERENCE_FRAME;
-#define MFMV_STACK_SIZE 3
-typedef struct {
-  int_mv mfmv0;
-  uint8_t ref_frame_offset;
-} TPL_MV_REF;
-typedef struct {
-    int_mv mv[2];
-    MV_REFERENCE_FRAME ref_frame[2];
-    int8_t mode, sb_type;
-} MV_REF;
-#define MB_MODE_INFO MV_REF
-
-#define AOMMAX(a,b) ((a)>(b)?(a):(b))
-#define AOMMIN(a,b) ((a)<(b)?(a):(b))
-
-typedef struct candidate_mv {
-    int_mv this_mv;
-    int_mv comp_mv;
-    int weight;
-} CANDIDATE_MV;
-#define NONE_FRAME -1
-#define INTRA_FRAME 0
-#define LAST_FRAME 1
-
-#define LAST2_FRAME 2
-#define LAST3_FRAME 3
-#define GOLDEN_FRAME 4
-#define BWDREF_FRAME 5
-#define ALTREF2_FRAME 6
-#define ALTREF_FRAME 7
-#define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1)
-
-#define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1)
-#define TOTAL_REFS_PER_FRAME (ALTREF_FRAME - INTRA_FRAME + 1)
-
-#define FWD_REFS (GOLDEN_FRAME - LAST_FRAME + 1)
-#define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
-#define BWD_REFS (ALTREF_FRAME - BWDREF_FRAME + 1)
-#define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
-#define FWD_REFS (GOLDEN_FRAME - LAST_FRAME + 1)
-#define SINGLE_REFS (FWD_REFS + BWD_REFS)
-typedef enum ATTRIBUTE_PACKED {
-  LAST_LAST2_FRAMES,      // { LAST_FRAME, LAST2_FRAME }
-  LAST_LAST3_FRAMES,      // { LAST_FRAME, LAST3_FRAME }
-  LAST_GOLDEN_FRAMES,     // { LAST_FRAME, GOLDEN_FRAME }
-  BWDREF_ALTREF_FRAMES,   // { BWDREF_FRAME, ALTREF_FRAME }
-  LAST2_LAST3_FRAMES,     // { LAST2_FRAME, LAST3_FRAME }
-  LAST2_GOLDEN_FRAMES,    // { LAST2_FRAME, GOLDEN_FRAME }
-  LAST3_GOLDEN_FRAMES,    // { LAST3_FRAME, GOLDEN_FRAME }
-  BWDREF_ALTREF2_FRAMES,  // { BWDREF_FRAME, ALTREF2_FRAME }
-  ALTREF2_ALTREF_FRAMES,  // { ALTREF2_FRAME, ALTREF_FRAME }
-  TOTAL_UNIDIR_COMP_REFS,
-  // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs
-  //       that are explicitly signaled.
-  UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1,
-} UNIDIR_COMP_REF;
-#define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS)
-#define MODE_CTX_REF_FRAMES (TOTAL_REFS_PER_FRAME + TOTAL_COMP_REFS)
-
-#define GLOBALMV_OFFSET 3
-#define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1)
-#define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1)
-#define MI_SIZE_LOG2 2
-#define MI_SIZE (1 << MI_SIZE_LOG2)
-#define MAX_SB_SIZE_LOG2 7
-#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
-#define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2)
-#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
-#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
-#define MI_SIZE_128X128 (128 >> MI_SIZE_LOG2)
-#define REFMV_OFFSET 4
-
-typedef enum ATTRIBUTE_PACKED {
-  BLOCK_4X4,
-  BLOCK_4X8,
-  BLOCK_8X4,
-  BLOCK_8X8,
-  BLOCK_8X16,
-  BLOCK_16X8,
-  BLOCK_16X16,
-  BLOCK_16X32,
-  BLOCK_32X16,
-  BLOCK_32X32,
-  BLOCK_32X64,
-  BLOCK_64X32,
-  BLOCK_64X64,
-  BLOCK_64X128,
-  BLOCK_128X64,
-  BLOCK_128X128,
-  BLOCK_4X16,
-  BLOCK_16X4,
-  BLOCK_8X32,
-  BLOCK_32X8,
-  BLOCK_16X64,
-  BLOCK_64X16,
-  BLOCK_32X128,
-  BLOCK_128X32,
-  BLOCK_SIZES_ALL,
-  BLOCK_SIZES = BLOCK_4X16,
-  BLOCK_INVALID = 255,
-  BLOCK_LARGEST = (BLOCK_SIZES - 1)
-} BLOCK_SIZE;
-
-typedef enum ATTRIBUTE_PACKED {
-  PARTITION_NONE,
-  PARTITION_HORZ,
-  PARTITION_VERT,
-  PARTITION_SPLIT,
-  PARTITION_HORZ_A,  // HORZ split and the top partition is split again
-  PARTITION_HORZ_B,  // HORZ split and the bottom partition is split again
-  PARTITION_VERT_A,  // VERT split and the left partition is split again
-  PARTITION_VERT_B,  // VERT split and the right partition is split again
-  PARTITION_HORZ_4,  // 4:1 horizontal partition
-  PARTITION_VERT_4,  // 4:1 vertical partition
-  EXT_PARTITION_TYPES,
-  PARTITION_TYPES = PARTITION_SPLIT + 1,
-  PARTITION_INVALID = 255
-} PARTITION_TYPE;
-typedef struct CUR_MODE_INFO {
-  PARTITION_TYPE partition;
-} CUR_MODE_INFO ;
-
-typedef enum ATTRIBUTE_PACKED {
-  DC_PRED,        // Average of above and left pixels
-  V_PRED,         // Vertical
-  H_PRED,         // Horizontal
-  D45_PRED,       // Directional 45  deg = round(arctan(1/1) * 180/pi)
-  D135_PRED,      // Directional 135 deg = 180 - 45
-  D117_PRED,      // Directional 117 deg = 180 - 63
-  D153_PRED,      // Directional 153 deg = 180 - 27
-  D207_PRED,      // Directional 207 deg = 180 + 27
-  D63_PRED,       // Directional 63  deg = round(arctan(2/1) * 180/pi)
-  SMOOTH_PRED,    // Combination of horizontal and vertical interpolation
-  SMOOTH_V_PRED,  // Vertical interpolation
-  SMOOTH_H_PRED,  // Horizontal interpolation
-  PAETH_PRED,     // Predict from the direction of smallest gradient
-  NEARESTMV,
-  NEARMV,
-  GLOBALMV,
-  NEWMV,
-  // Compound ref compound modes
-  NEAREST_NEARESTMV,
-  NEAR_NEARMV,
-  NEAREST_NEWMV,
-  NEW_NEARESTMV,
-  NEAR_NEWMV,
-  NEW_NEARMV,
-  GLOBAL_GLOBALMV,
-  NEW_NEWMV,
-  MB_MODE_COUNT,
-  INTRA_MODES = PAETH_PRED + 1,  // PAETH_PRED has to be the last intra mode.
-  INTRA_INVALID = MB_MODE_COUNT  // For uv_mode in inter blocks
-} PREDICTION_MODE;
-typedef enum {
-  IDENTITY = 0,      // identity transformation, 0-parameter
-  TRANSLATION = 1,   // translational motion 2-parameter
-  ROTZOOM = 2,       // simplified affine with rotation + zoom only, 4-parameter
-  AFFINE = 3,        // affine, 6-parameter
-  TRANS_TYPES,
-} TransformationType;
-
-#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
-#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
-#define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2)
-
-static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = {
-  1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16,
-  16, 32, 32,  1, 4, 2, 8, 4, 16, 8, 32
-};
-static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = {
-  1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16,
-  32, 16, 32,  4, 1, 8, 2, 16, 4, 32, 8
-};
-
-static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = {
-  4,  4,
-  8,  8,
-  8,  16,
-  16, 16,
-  32, 32,
-  32, 64,
-  64, 64, 128, 128, 4,
-  16, 8,
-  32, 16,
-  64, 32, 128
-};
-
-static const uint8_t block_size_high[BLOCK_SIZES_ALL] = {
-  4,  8,
-  4,  8,
-  16, 8,
-  16, 32,
-  16, 32,
-  64, 32,
-  64, 128, 64, 128, 16,
-  4,  32,
-  8,  64,
-  16, 128, 32
-};
-
-static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi,
-                                     TransformationType type) {
-  const PREDICTION_MODE mode = mbmi->mode;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const int block_size_allowed =
-      AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
-  return block_size_allowed && type > TRANSLATION &&
-         (mode == GLOBALMV || mode == GLOBAL_GLOBALMV);
-}
-
-typedef struct {
-  TransformationType wmtype;
-  int32_t wmmat[6];
-  int16_t alpha, beta, gamma, delta;
-} Dav1dWarpedMotionParams;
-
-#define REF_FRAMES_LOG2 3
-#define REF_FRAMES (1 << REF_FRAMES_LOG2)
-#define FRAME_BUFFERS (REF_FRAMES + 7)
-typedef struct {
-
-  unsigned int cur_frame_offset;
-  unsigned int ref_frame_offset[INTER_REFS_PER_FRAME];
-
-  MV_REF *mvs;
-  ptrdiff_t mv_stride;
-  int mi_rows;
-  int mi_cols;
-  uint8_t intra_only;
-} RefCntBuffer;
-
-#define INVALID_IDX -1  // Invalid buffer index.
-typedef struct TileInfo {
-  int mi_row_start, mi_row_end;
-  int mi_col_start, mi_col_end;
-  int tg_horz_boundary;
-} TileInfo;
-typedef struct macroblockd {
-  TileInfo tile;
-  int mi_stride;
-
-  CUR_MODE_INFO cur_mi;
-  MB_MODE_INFO *mi;
-  int up_available;
-  int left_available;
-  /* Distance of MB away from frame edges in subpixels (1/8th pixel)  */
-  int mb_to_left_edge;
-  int mb_to_right_edge;
-  int mb_to_top_edge;
-  int mb_to_bottom_edge;
-  // block dimension in the unit of mode_info.
-  uint8_t n8_w, n8_h;
-  uint8_t is_sec_rect;
-
-} MACROBLOCKD;
-typedef struct RefBuffer {
-  int idx;  // frame buf idx
-} RefBuffer;
-typedef struct BufferPool {
-  RefCntBuffer frame_bufs[FRAME_BUFFERS];
-} BufferPool;
-typedef struct AV1Common {
-
-  // TODO(hkuang): Combine this with cur_buf in macroblockd.
-  RefCntBuffer cur_frame;
-
-  // Each Inter frame can reference INTER_REFS_PER_FRAME buffers
-  RefBuffer frame_refs[INTER_REFS_PER_FRAME];
-
-  int allow_high_precision_mv;
-  int cur_frame_force_integer_mv;  // 0 the default in AOM, 1 only integer
-  int mi_rows;
-  int mi_cols;
-  int mi_stride;
-
-  // Whether to use previous frame's motion vectors for prediction.
-  int allow_ref_frame_mvs;
-
-  int ref_frame_sign_bias[TOTAL_REFS_PER_FRAME]; /* Two state 0, 1 */
-  int frame_parallel_decode;  // frame-based threading.
-
-  unsigned int frame_offset;
-
-  // External BufferPool passed from outside.
-  BufferPool buffer_pool;
-
-  Dav1dWarpedMotionParams global_motion[TOTAL_REFS_PER_FRAME];
-  struct {
-    BLOCK_SIZE sb_size;
-    int enable_order_hint;
-    int order_hint_bits_minus1;
-  } seq_params;
-  TPL_MV_REF *tpl_mvs;
-  // TODO(jingning): This can be combined with sign_bias later.
-  int8_t ref_frame_side[TOTAL_REFS_PER_FRAME];
-
-    int ref_buf_idx[INTER_REFS_PER_FRAME];
-    int ref_order_hint[INTER_REFS_PER_FRAME];
-} AV1_COMMON;
-
-static INLINE void integer_mv_precision(MV *mv) {
-  int mod = (mv->row % 8);
-  if (mod != 0) {
-    mv->row -= mod;
-    if (abs(mod) > 4) {
-      if (mod > 0) {
-        mv->row += 8;
-      } else {
-        mv->row -= 8;
-      }
-    }
-  }
-
-  mod = (mv->col % 8);
-  if (mod != 0) {
-    mv->col -= mod;
-    if (abs(mod) > 4) {
-      if (mod > 0) {
-        mv->col += 8;
-      } else {
-        mv->col -= 8;
-      }
-    }
-  }
-}
-
-static INLINE int clamp(int value, int low, int high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
-                            int max_row) {
-  mv->col = clamp(mv->col, min_col, max_col);
-  mv->row = clamp(mv->row, min_row, max_row);
-}
-
-static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) {
-  return mbmi->ref_frame[0] == INTRA_FRAME && mbmi->mv[0].as_mv.row != -0x8000;
-  //return mbmi->use_intrabc;
-}
-
-static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
-  if (is_intrabc_block(mbmi)) return 1;
-  return mbmi->ref_frame[0] > INTRA_FRAME;
-}
-
-static INLINE MV_REFERENCE_FRAME comp_ref0(int ref_idx) {
-  static const MV_REFERENCE_FRAME lut[] = {
-    LAST_FRAME,     // LAST_LAST2_FRAMES,
-    LAST_FRAME,     // LAST_LAST3_FRAMES,
-    LAST_FRAME,     // LAST_GOLDEN_FRAMES,
-    BWDREF_FRAME,   // BWDREF_ALTREF_FRAMES,
-    LAST2_FRAME,    // LAST2_LAST3_FRAMES
-    LAST2_FRAME,    // LAST2_GOLDEN_FRAMES,
-    LAST3_FRAME,    // LAST3_GOLDEN_FRAMES,
-    BWDREF_FRAME,   // BWDREF_ALTREF2_FRAMES,
-    ALTREF2_FRAME,  // ALTREF2_ALTREF_FRAMES,
-  };
-  assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS);
-  return lut[ref_idx];
-}
-
-static INLINE MV_REFERENCE_FRAME comp_ref1(int ref_idx) {
-  static const MV_REFERENCE_FRAME lut[] = {
-    LAST2_FRAME,    // LAST_LAST2_FRAMES,
-    LAST3_FRAME,    // LAST_LAST3_FRAMES,
-    GOLDEN_FRAME,   // LAST_GOLDEN_FRAMES,
-    ALTREF_FRAME,   // BWDREF_ALTREF_FRAMES,
-    LAST3_FRAME,    // LAST2_LAST3_FRAMES
-    GOLDEN_FRAME,   // LAST2_GOLDEN_FRAMES,
-    GOLDEN_FRAME,   // LAST3_GOLDEN_FRAMES,
-    ALTREF2_FRAME,  // BWDREF_ALTREF2_FRAMES,
-    ALTREF_FRAME,   // ALTREF2_ALTREF_FRAMES,
-  };
-  assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS);
-  return lut[ref_idx];
-}
-
-#define WARPEDMODEL_PREC_BITS 16
-#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
-#define WARPEDMODEL_ROW3HOMO_PREC_BITS 16
-
-static INLINE int convert_to_trans_prec(int allow_hp, int coor) {
-  if (allow_hp)
-    return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3);
-  else
-    return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2;
-}
-
-static INLINE int block_center_x(int mi_col, BLOCK_SIZE bs) {
-  const int bw = block_size_wide[bs];
-  return mi_col * MI_SIZE + bw / 2 - 1;
-}
-
-static INLINE int block_center_y(int mi_row, BLOCK_SIZE bs) {
-  const int bh = block_size_high[bs];
-  return mi_row * MI_SIZE + bh / 2 - 1;
-}
-
-// Convert a global motion vector into a motion vector at the centre of the
-// given block.
-//
-// The resulting motion vector will have three fractional bits of precision. If
-// allow_hp is zero, the bottom bit will always be zero. If CONFIG_AMVR and
-// is_integer is true, the bottom three bits will be zero (so the motion vector
-// represents an integer)
-static INLINE int_mv gm_get_motion_vector(const Dav1dWarpedMotionParams *gm,
-                                          int allow_hp, BLOCK_SIZE bsize,
-                                          int mi_col, int mi_row,
-                                          int is_integer) {
-  int_mv res;
-  const int32_t *mat = gm->wmmat;
-  int x, y, tx, ty;
-
-  if (gm->wmtype == TRANSLATION) {
-    // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16)
-    // bits of fractional precision. The offset for a translation is stored in
-    // entries 0 and 1. For translations, all but the top three (two if
-    // cm->allow_high_precision_mv is false) fractional bits are always zero.
-    //
-    // After the right shifts, there are 3 fractional bits of precision. If
-    // allow_hp is false, the bottom bit is always zero (so we don't need a
-    // call to convert_to_trans_prec here)
-    res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF;
-    res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF;
-    assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp));
-    if (is_integer) {
-      integer_mv_precision(&res.as_mv);
-    }
-    return res;
-  }
-
-  x = block_center_x(mi_col, bsize);
-  y = block_center_y(mi_row, bsize);
-
-  if (gm->wmtype == ROTZOOM) {
-    assert(gm->wmmat[5] == gm->wmmat[2]);
-    assert(gm->wmmat[4] == -gm->wmmat[3]);
-  }
-  if (gm->wmtype > AFFINE) {
-    int xc = (int)((int64_t)mat[2] * x + (int64_t)mat[3] * y + mat[0]);
-    int yc = (int)((int64_t)mat[4] * x + (int64_t)mat[5] * y + mat[1]);
-    const int Z = (int)((int64_t)mat[6] * x + (int64_t)mat[7] * y +
-                        (1 << WARPEDMODEL_ROW3HOMO_PREC_BITS));
-    xc *= 1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS - WARPEDMODEL_PREC_BITS);
-    yc *= 1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS - WARPEDMODEL_PREC_BITS);
-    xc = (int)(xc > 0 ? ((int64_t)xc + Z / 2) / Z : ((int64_t)xc - Z / 2) / Z);
-    yc = (int)(yc > 0 ? ((int64_t)yc + Z / 2) / Z : ((int64_t)yc - Z / 2) / Z);
-    tx = convert_to_trans_prec(allow_hp, xc) - (x << 3);
-    ty = convert_to_trans_prec(allow_hp, yc) - (y << 3);
-  } else {
-    const int xc =
-        (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0];
-    const int yc =
-        mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1];
-    tx = convert_to_trans_prec(allow_hp, xc);
-    ty = convert_to_trans_prec(allow_hp, yc);
-  }
-
-  res.as_mv.row = ty;
-  res.as_mv.col = tx;
-
-  if (is_integer) {
-    integer_mv_precision(&res.as_mv);
-  }
-  return res;
-}
-
-static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
-  return (mode == NEWMV || mode == NEW_NEWMV || mode == NEAREST_NEWMV ||
-          mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV);
-}
-
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#ifndef AV1_COMMON_MVREF_COMMON_H_
-#define AV1_COMMON_MVREF_COMMON_H_
-
-//#include "av1/common/onyxc_int.h"
-//#include "av1/common/blockd.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define MVREF_ROW_COLS 3
-
-// Set the upper limit of the motion vector component magnitude.
-// This would make a motion vector fit in 26 bits. Plus 3 bits for the
-// reference frame index. A tuple of motion vector can hence be stored within
-// 32 bit range for efficient load/store operations.
-#define REFMVS_LIMIT ((1 << 12) - 1)
-
-typedef struct position {
-  int row;
-  int col;
-} POSITION;
-
-// clamp_mv_ref
-#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
-
-static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int b) {
-  if (!cm->seq_params.enable_order_hint) return 0;
-
-  const int bits = cm->seq_params.order_hint_bits_minus1 + 1;
-
-  assert(bits >= 1);
-  assert(a >= 0 && a < (1 << bits));
-  assert(b >= 0 && b < (1 << bits));
-
-  int diff = a - b;
-  int m = 1 << (bits - 1);
-  diff = (diff & (m - 1)) - (diff & m);
-  return diff;
-}
-
-static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
-  clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER,
-           xd->mb_to_right_edge + bw * 8 + MV_BORDER,
-           xd->mb_to_top_edge - bh * 8 - MV_BORDER,
-           xd->mb_to_bottom_edge + bh * 8 + MV_BORDER);
-}
-
-// This function returns either the appropriate sub block or block's mv
-// on whether the block_size < 8x8 and we have check_sub_blocks set.
-static INLINE int_mv get_sub_block_mv(const MB_MODE_INFO *candidate,
-                                      int which_mv, int search_col) {
-  (void)search_col;
-  return candidate->mv[which_mv];
-}
-
-// Checks that the given mi_row, mi_col and search point
-// are inside the borders of the tile.
-static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
-                            int mi_rows, const POSITION *mi_pos) {
-  const int dependent_horz_tile_flag = 0;
-  if (dependent_horz_tile_flag && !tile->tg_horz_boundary) {
-    return !(mi_row + mi_pos->row < 0 ||
-             mi_col + mi_pos->col < tile->mi_col_start ||
-             mi_row + mi_pos->row >= mi_rows ||
-             mi_col + mi_pos->col >= tile->mi_col_end);
-  } else {
-    return !(mi_row + mi_pos->row < tile->mi_row_start ||
-             mi_col + mi_pos->col < tile->mi_col_start ||
-             mi_row + mi_pos->row >= tile->mi_row_end ||
-             mi_col + mi_pos->col >= tile->mi_col_end);
-  }
-}
-
-static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row,
-                                        int mi_rows, int row_offset) {
-  const int dependent_horz_tile_flag = 0;
-  if (dependent_horz_tile_flag && !tile->tg_horz_boundary)
-    return clamp(row_offset, -mi_row, mi_rows - mi_row - 1);
-  else
-    return clamp(row_offset, tile->mi_row_start - mi_row,
-                 tile->mi_row_end - mi_row - 1);
-}
-
-static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col,
-                                        int col_offset) {
-  return clamp(col_offset, tile->mi_col_start - mi_col,
-               tile->mi_col_end - mi_col - 1);
-}
-
-static INLINE void lower_mv_precision(MV *mv, int allow_hp,
-                                      int is_integer) {
-  if (is_integer) {
-    integer_mv_precision(mv);
-  } else {
-    if (!allow_hp) {
-      if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
-      if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
-    }
-  }
-}
-
-static INLINE int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) {
-  // Single ref pred
-  if (rf[1] <= INTRA_FRAME) return -1;
-
-  // Bi-directional comp ref pred
-  if ((rf[0] < BWDREF_FRAME) && (rf[1] >= BWDREF_FRAME)) return -1;
-
-  for (int8_t ref_idx = 0; ref_idx < TOTAL_UNIDIR_COMP_REFS; ++ref_idx) {
-    if (rf[0] == comp_ref0(ref_idx) && rf[1] == comp_ref1(ref_idx))
-      return ref_idx;
-  }
-  return -1;
-}
-
-static INLINE int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) {
-  if (rf[1] > INTRA_FRAME) {
-    const int8_t uni_comp_ref_idx = get_uni_comp_ref_idx(rf);
-    if (uni_comp_ref_idx >= 0) {
-      assert((REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx) <
-             MODE_CTX_REF_FRAMES);
-      return REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx;
-    } else {
-      return REF_FRAMES + FWD_RF_OFFSET(rf[0]) +
-             BWD_RF_OFFSET(rf[1]) * FWD_REFS;
-    }
-  }
-
-  return rf[0];
-}
-
-// clang-format off
-static MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = {
-  { LAST_FRAME, BWDREF_FRAME },  { LAST2_FRAME, BWDREF_FRAME },
-  { LAST3_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, BWDREF_FRAME },
-
-  { LAST_FRAME, ALTREF2_FRAME },  { LAST2_FRAME, ALTREF2_FRAME },
-  { LAST3_FRAME, ALTREF2_FRAME }, { GOLDEN_FRAME, ALTREF2_FRAME },
-
-  { LAST_FRAME, ALTREF_FRAME },  { LAST2_FRAME, ALTREF_FRAME },
-  { LAST3_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME },
-
-  { LAST_FRAME, LAST2_FRAME }, { LAST_FRAME, LAST3_FRAME },
-  { LAST_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF_FRAME },
-
-  // NOTE: Following reference frame pairs are not supported to be explicitly
-  //       signalled, but they are possibly chosen by the use of skip_mode,
-  //       which may use the most recent one-sided reference frame pair.
-  { LAST2_FRAME, LAST3_FRAME }, { LAST2_FRAME, GOLDEN_FRAME },
-  { LAST3_FRAME, GOLDEN_FRAME }, {BWDREF_FRAME, ALTREF2_FRAME},
-  { ALTREF2_FRAME, ALTREF_FRAME }
-};
-// clang-format on
-
-static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf,
-                                     int8_t ref_frame_type) {
-  if (ref_frame_type >= REF_FRAMES) {
-    rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0];
-    rf[1] = ref_frame_map[ref_frame_type - REF_FRAMES][1];
-  } else {
-    rf[0] = ref_frame_type;
-    rf[1] = NONE_FRAME;
-    assert(ref_frame_type > NONE_FRAME);
-  }
-}
-
-static uint16_t compound_mode_ctx_map[3][COMP_NEWMV_CTXS] = {
-  { 0, 1, 1, 1, 1 },
-  { 1, 2, 3, 4, 4 },
-  { 4, 4, 5, 6, 7 },
-};
-
-static INLINE int16_t av1_mode_context_analyzer(
-    const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf) {
-  const int8_t ref_frame = av1_ref_frame_type(rf);
-
-  if (rf[1] <= INTRA_FRAME) return mode_context[ref_frame];
-
-  const int16_t newmv_ctx = mode_context[ref_frame] & NEWMV_CTX_MASK;
-  const int16_t refmv_ctx =
-      (mode_context[ref_frame] >> REFMV_OFFSET) & REFMV_CTX_MASK;
-
-  const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN(
-      newmv_ctx, COMP_NEWMV_CTXS - 1)];
-  return comp_ctx;
-}
-
-#define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
-#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
-#define USE_WAVE_FRONT 1  // Use only top left area of frame for reference.
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AV1_COMMON_MVREF_COMMON_H_
-
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-//#include "av1/common/mvref_common.h"
-//#include "av1/common/warped_motion.h"
-
-// Although we assign 32 bit integers, all the values are strictly under 14
-// bits.
-static int div_mult[32] = { 0,    16384, 8192, 5461, 4096, 3276, 2730, 2340,
-                            2048, 1820,  1638, 1489, 1365, 1260, 1170, 1092,
-                            1024, 963,   910,  862,  819,  780,  744,  712,
-                            682,  655,   630,  606,  585,  564,  546,  528 };
-
-// TODO(jingning): Consider the use of lookup table for (num / den)
-// altogether.
-static void get_mv_projection(MV *output, MV ref, int num, int den) {
-  den = AOMMIN(den, MAX_FRAME_DISTANCE);
-  num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE)
-                : AOMMAX(num, -MAX_FRAME_DISTANCE);
-  int mv_row = ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14);
-  int mv_col = ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14);
-  const int clamp_max = MV_UPP - 1;
-  const int clamp_min = MV_LOW + 1;
-  output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max);
-  output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max);
-}
-
-static void add_ref_mv_candidate(
-    const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2],
-    uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count,
-    CANDIDATE_MV *ref_mv_stack, int_mv *gm_mv_candidates,
-    const Dav1dWarpedMotionParams *gm_params, int col, int weight) {
-  if (!is_inter_block(candidate)) return;  // for intrabc
-  int index = 0, ref;
-  assert(weight % 2 == 0);
-
-  if (rf[1] == NONE_FRAME) {
-    // single reference frame
-    for (ref = 0; ref < 2; ++ref) {
-      if (candidate->ref_frame[ref] == rf[0]) {
-        int_mv this_refmv;
-        if (is_global_mv_block(candidate, gm_params[rf[0]].wmtype))
-          this_refmv = gm_mv_candidates[0];
-        else
-          this_refmv = get_sub_block_mv(candidate, ref, col);
-
-        for (index = 0; index < *refmv_count; ++index)
-          if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break;
-
-        if (index < *refmv_count) ref_mv_stack[index].weight += weight;
-
-        // Add a new item to the list.
-        if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
-          ref_mv_stack[index].this_mv = this_refmv;
-          ref_mv_stack[index].weight = weight;
-          ++(*refmv_count);
-        }
-        if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
-        ++*ref_match_count;
-      }
-    }
-  } else {
-    // compound reference frame
-    if (candidate->ref_frame[0] == rf[0] && candidate->ref_frame[1] == rf[1]) {
-      int_mv this_refmv[2];
-
-      for (ref = 0; ref < 2; ++ref) {
-        if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype))
-          this_refmv[ref] = gm_mv_candidates[ref];
-        else
-          this_refmv[ref] = get_sub_block_mv(candidate, ref, col);
-      }
-
-      for (index = 0; index < *refmv_count; ++index)
-        if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) &&
-            (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int))
-          break;
-
-      if (index < *refmv_count) ref_mv_stack[index].weight += weight;
-
-      // Add a new item to the list.
-      if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
-        ref_mv_stack[index].this_mv = this_refmv[0];
-        ref_mv_stack[index].comp_mv = this_refmv[1];
-        ref_mv_stack[index].weight = weight;
-        ++(*refmv_count);
-      }
-      if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
-      ++*ref_match_count;
-    }
-  }
-}
-
-static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                          int mi_row, int mi_col,
-                          const MV_REFERENCE_FRAME rf[2], int row_offset,
-                          CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count,
-                          uint8_t *ref_match_count, uint8_t *newmv_count,
-                          int_mv *gm_mv_candidates, int max_row_offset,
-                          int *processed_rows) {
-  int end_mi = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
-  end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]);
-  const int n8_w_8 = mi_size_wide[BLOCK_8X8];
-  const int n8_w_16 = mi_size_wide[BLOCK_16X16];
-  int i;
-  int col_offset = 0;
-  const int shift = 0;
-  // TODO(jingning): Revisit this part after cb4x4 is stable.
-  if (abs(row_offset) > 1) {
-    col_offset = 1;
-    if ((mi_col & 0x01) && xd->n8_w < n8_w_8) --col_offset;
-  }
-  const int use_step_16 = (xd->n8_w >= 16);
-  MB_MODE_INFO *const candidate_mi0 = xd->mi + row_offset * xd->mi_stride;
-  (void)mi_row;
-
-  for (i = 0; i < end_mi;) {
-    const MB_MODE_INFO *const candidate = &candidate_mi0[col_offset + i];
-    const int candidate_bsize = candidate->sb_type;
-    const int n8_w = mi_size_wide[candidate_bsize];
-    int len = AOMMIN(xd->n8_w, n8_w);
-    if (use_step_16)
-      len = AOMMAX(n8_w_16, len);
-    else if (abs(row_offset) > 1)
-      len = AOMMAX(len, n8_w_8);
-
-    int weight = 2;
-    if (xd->n8_w >= n8_w_8 && xd->n8_w <= n8_w) {
-      int inc = AOMMIN(-max_row_offset + row_offset + 1,
-                       mi_size_high[candidate_bsize]);
-      // Obtain range used in weight calculation.
-      weight = AOMMAX(weight, (inc << shift));
-      // Update processed rows.
-      *processed_rows = inc - row_offset - 1;
-    }
-
-    add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
-                         newmv_count, ref_mv_stack, gm_mv_candidates,
-                         cm->global_motion, col_offset + i, len * weight);
-
-    i += len;
-  }
-}
-
-static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                          int mi_row, int mi_col,
-                          const MV_REFERENCE_FRAME rf[2], int col_offset,
-                          CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count,
-                          uint8_t *ref_match_count, uint8_t *newmv_count,
-                          int_mv *gm_mv_candidates, int max_col_offset,
-                          int *processed_cols) {
-  int end_mi = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
-  end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]);
-  const int n8_h_8 = mi_size_high[BLOCK_8X8];
-  const int n8_h_16 = mi_size_high[BLOCK_16X16];
-  int i;
-  int row_offset = 0;
-  const int shift = 0;
-  if (abs(col_offset) > 1) {
-    row_offset = 1;
-    if ((mi_row & 0x01) && xd->n8_h < n8_h_8) --row_offset;
-  }
-  const int use_step_16 = (xd->n8_h >= 16);
-  (void)mi_col;
-
-  for (i = 0; i < end_mi;) {
-    const MB_MODE_INFO *const candidate =
-        &xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
-    const int candidate_bsize = candidate->sb_type;
-    const int n8_h = mi_size_high[candidate_bsize];
-    int len = AOMMIN(xd->n8_h, n8_h);
-    if (use_step_16)
-      len = AOMMAX(n8_h_16, len);
-    else if (abs(col_offset) > 1)
-      len = AOMMAX(len, n8_h_8);
-
-    int weight = 2;
-    if (xd->n8_h >= n8_h_8 && xd->n8_h <= n8_h) {
-      int inc = AOMMIN(-max_col_offset + col_offset + 1,
-                       mi_size_wide[candidate_bsize]);
-      // Obtain range used in weight calculation.
-      weight = AOMMAX(weight, (inc << shift));
-      // Update processed cols.
-      *processed_cols = inc - col_offset - 1;
-    }
-
-    add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
-                         newmv_count, ref_mv_stack, gm_mv_candidates,
-                         cm->global_motion, col_offset, len * weight);
-
-    i += len;
-  }
-}
-
-static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                          const int mi_row, const int mi_col,
-                          const MV_REFERENCE_FRAME rf[2], int row_offset,
-                          int col_offset, CANDIDATE_MV *ref_mv_stack,
-                          uint8_t *ref_match_count, uint8_t *newmv_count,
-                          int_mv *gm_mv_candidates,
-                          uint8_t refmv_count[MODE_CTX_REF_FRAMES]) {
-  const TileInfo *const tile = &xd->tile;
-  POSITION mi_pos;
-
-  mi_pos.row = row_offset;
-  mi_pos.col = col_offset;
-
-  if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) {
-    const MB_MODE_INFO *const candidate =
-        &xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
-    const int len = mi_size_wide[BLOCK_8X8];
-
-    add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
-                         newmv_count, ref_mv_stack, gm_mv_candidates,
-                         cm->global_motion, mi_pos.col, 2 * len);
-  }  // Analyze a single 8x8 block motion information.
-}
-
-static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                         int mi_row, int mi_col, int bs) {
-  const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
-  const int mask_row = mi_row & (sb_mi_size - 1);
-  const int mask_col = mi_col & (sb_mi_size - 1);
-
-  if (bs > mi_size_wide[BLOCK_64X64]) return 0;
-
-  // In a split partition all apart from the bottom right has a top right
-  int has_tr = !((mask_row & bs) && (mask_col & bs));
-
-  // bs > 0 and bs is a power of 2
-  assert(bs > 0 && !(bs & (bs - 1)));
-
-  // For each 4x4 group of blocks, when the bottom right is decoded the blocks
-  // to the right have not been decoded therefore the bottom right does
-  // not have a top right
-  while (bs < sb_mi_size) {
-    if (mask_col & bs) {
-      if ((mask_col & (2 * bs)) && (mask_row & (2 * bs))) {
-        has_tr = 0;
-        break;
-      }
-    } else {
-      break;
-    }
-    bs <<= 1;
-  }
-
-  // The left hand of two vertical rectangles always has a top right (as the
-  // block above will have been decoded)
-  if (xd->n8_w < xd->n8_h)
-    if (!xd->is_sec_rect) has_tr = 1;
-
-  // The bottom of two horizontal rectangles never has a top right (as the block
-  // to the right won't have been decoded)
-  if (xd->n8_w > xd->n8_h)
-    if (xd->is_sec_rect) has_tr = 0;
-
-  // The bottom left square of a Vertical A (in the old format) does
-  // not have a top right as it is decoded before the right hand
-  // rectangle of the partition
-  if (xd->cur_mi.partition == PARTITION_VERT_A) {
-    if (xd->n8_w == xd->n8_h)
-      if (mask_row & bs) has_tr = 0;
-  }
-
-  return has_tr;
-}
-
-static int check_sb_border(const int mi_row, const int mi_col,
-                           const int row_offset, const int col_offset) {
-  const int sb_mi_size = mi_size_wide[BLOCK_64X64];
-  const int row = mi_row & (sb_mi_size - 1);
-  const int col = mi_col & (sb_mi_size - 1);
-
-  if (row + row_offset < 0 || row + row_offset >= sb_mi_size ||
-      col + col_offset < 0 || col + col_offset >= sb_mi_size)
-    return 0;
-
-  return 1;
-}
-
-static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                          int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame,
-                          int blk_row, int blk_col, int_mv *gm_mv_candidates,
-                          uint8_t refmv_count[MODE_CTX_REF_FRAMES],
-                          CANDIDATE_MV ref_mv_stacks[][MAX_REF_MV_STACK_SIZE],
-                          int16_t *mode_context) {
-  POSITION mi_pos;
-  int idx;
-  const int weight_unit = 1;  // mi_size_wide[BLOCK_8X8];
-
-  mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
-  mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
-
-  if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) return 0;
-
-  const TPL_MV_REF *prev_frame_mvs =
-      cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) +
-      ((mi_col + mi_pos.col) >> 1);
-
-  MV_REFERENCE_FRAME rf[2];
-  av1_set_ref_frame(rf, ref_frame);
-
-  if (rf[1] == NONE_FRAME) {
-    int cur_frame_index = cm->cur_frame.cur_frame_offset;
-    int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx;
-    int frame0_index = cm->buffer_pool.frame_bufs[buf_idx_0].cur_frame_offset;
-    int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index);
-    CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[rf[0]];
-
-    if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) {
-      int_mv this_refmv;
-
-      get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
-                        cur_offset_0, prev_frame_mvs->ref_frame_offset);
-      lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv,
-                         cm->cur_frame_force_integer_mv);
-
-      if (blk_row == 0 && blk_col == 0)
-        if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
-            abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16)
-          mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
-
-      for (idx = 0; idx < refmv_count[rf[0]]; ++idx)
-        if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break;
-
-      if (idx < refmv_count[rf[0]]) ref_mv_stack[idx].weight += 2 * weight_unit;
-
-      if (idx == refmv_count[rf[0]] &&
-          refmv_count[rf[0]] < MAX_REF_MV_STACK_SIZE) {
-        ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
-        ref_mv_stack[idx].weight = 2 * weight_unit;
-        ++(refmv_count[rf[0]]);
-      }
-
-      return 1;
-    }
-  } else {
-    // Process compound inter mode
-    int cur_frame_index = cm->cur_frame.cur_frame_offset;
-    int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx;
-    int frame0_index = cm->buffer_pool.frame_bufs[buf_idx_0].cur_frame_offset;
-
-    int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index);
-    int buf_idx_1 = cm->frame_refs[FWD_RF_OFFSET(rf[1])].idx;
-    int frame1_index = cm->buffer_pool.frame_bufs[buf_idx_1].cur_frame_offset;
-    int cur_offset_1 = get_relative_dist(cm, cur_frame_index, frame1_index);
-    CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[ref_frame];
-
-    if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) {
-      int_mv this_refmv;
-      int_mv comp_refmv;
-      get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
-                        cur_offset_0, prev_frame_mvs->ref_frame_offset);
-      get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
-                        cur_offset_1, prev_frame_mvs->ref_frame_offset);
-
-      lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv,
-                         cm->cur_frame_force_integer_mv);
-      lower_mv_precision(&comp_refmv.as_mv, cm->allow_high_precision_mv,
-                         cm->cur_frame_force_integer_mv);
-
-      if (blk_row == 0 && blk_col == 0)
-        if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
-            abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 ||
-            abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 ||
-            abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16)
-          mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
-
-      for (idx = 0; idx < refmv_count[ref_frame]; ++idx)
-        if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int &&
-            comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int)
-          break;
-
-      if (idx < refmv_count[ref_frame])
-        ref_mv_stack[idx].weight += 2 * weight_unit;
-
-      if (idx == refmv_count[ref_frame] &&
-          refmv_count[ref_frame] < MAX_REF_MV_STACK_SIZE) {
-        ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
-        ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int;
-        ref_mv_stack[idx].weight = 2 * weight_unit;
-        ++(refmv_count[ref_frame]);
-      }
-      return 1;
-    }
-  }
-  return 0;
-}
-
-static void setup_ref_mv_list(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame,
-    uint8_t refmv_count[MODE_CTX_REF_FRAMES],
-    CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
-    int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates,
-    int mi_row, int mi_col, int16_t *mode_context) {
-  const int bs = AOMMAX(xd->n8_w, xd->n8_h);
-  const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs);
-  MV_REFERENCE_FRAME rf[2];
-
-  const TileInfo *const tile = &xd->tile;
-  int max_row_offset = 0, max_col_offset = 0;
-  const int row_adj = (xd->n8_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
-  const int col_adj = (xd->n8_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
-  int processed_rows = 0;
-  int processed_cols = 0;
-
-  av1_set_ref_frame(rf, ref_frame);
-  mode_context[ref_frame] = 0;
-  refmv_count[ref_frame] = 0;
-
-  // Find valid maximum row/col offset.
-  if (xd->up_available) {
-    max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj;
-
-    if (xd->n8_h < mi_size_high[BLOCK_8X8])
-      max_row_offset = -(2 << 1) + row_adj;
-
-    max_row_offset =
-        find_valid_row_offset(tile, mi_row, cm->mi_rows, max_row_offset);
-  }
-
-  if (xd->left_available) {
-    max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj;
-
-    if (xd->n8_w < mi_size_wide[BLOCK_8X8])
-      max_col_offset = -(2 << 1) + col_adj;
-
-    max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset);
-  }
-
-  uint8_t col_match_count = 0;
-  uint8_t row_match_count = 0;
-  uint8_t newmv_count = 0;
-
-  // Scan the first above row mode info. row_offset = -1;
-  if (abs(max_row_offset) >= 1)
-    scan_row_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame],
-                  &refmv_count[ref_frame], &row_match_count, &newmv_count,
-                  gm_mv_candidates, max_row_offset, &processed_rows);
-  // Scan the first left column mode info. col_offset = -1;
-  if (abs(max_col_offset) >= 1)
-    scan_col_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame],
-                  &refmv_count[ref_frame], &col_match_count, &newmv_count,
-                  gm_mv_candidates, max_col_offset, &processed_cols);
-  // Check top-right boundary
-  if (has_tr)
-    scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n8_w,
-                  ref_mv_stack[ref_frame], &row_match_count, &newmv_count,
-                  gm_mv_candidates, &refmv_count[ref_frame]);
-
-  uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
-  uint8_t nearest_refmv_count = refmv_count[ref_frame];
-
-  // TODO(yunqing): for comp_search, do it for all 3 cases.
-  for (int idx = 0; idx < nearest_refmv_count; ++idx)
-    ref_mv_stack[ref_frame][idx].weight += REF_CAT_LEVEL;
-
-  if (cm->allow_ref_frame_mvs) {
-    int is_available = 0;
-    const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n8_h);
-    const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n8_w);
-    const int blk_row_end = AOMMIN(xd->n8_h, mi_size_high[BLOCK_64X64]);
-    const int blk_col_end = AOMMIN(xd->n8_w, mi_size_wide[BLOCK_64X64]);
-
-    const int tpl_sample_pos[3][2] = {
-      { voffset, -2 },
-      { voffset, hoffset },
-      { voffset - 2, hoffset },
-    };
-    const int allow_extension = (xd->n8_h >= mi_size_high[BLOCK_8X8]) &&
-                                (xd->n8_h < mi_size_high[BLOCK_64X64]) &&
-                                (xd->n8_w >= mi_size_wide[BLOCK_8X8]) &&
-                                (xd->n8_w < mi_size_wide[BLOCK_64X64]);
-
-    int step_h = (xd->n8_h >= mi_size_high[BLOCK_64X64])
-                     ? mi_size_high[BLOCK_16X16]
-                     : mi_size_high[BLOCK_8X8];
-    int step_w = (xd->n8_w >= mi_size_wide[BLOCK_64X64])
-                     ? mi_size_wide[BLOCK_16X16]
-                     : mi_size_wide[BLOCK_8X8];
-
-    for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) {
-      for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) {
-        int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row,
-                                 blk_col, gm_mv_candidates, refmv_count,
-                                 ref_mv_stack, mode_context);
-        if (blk_row == 0 && blk_col == 0) is_available = ret;
-      }
-    }
-
-    if (is_available == 0) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
-
-    for (int i = 0; i < 3 && allow_extension; ++i) {
-      const int blk_row = tpl_sample_pos[i][0];
-      const int blk_col = tpl_sample_pos[i][1];
-
-      if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue;
-      add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col,
-                     gm_mv_candidates, refmv_count, ref_mv_stack, mode_context);
-    }
-  }
-
-  uint8_t dummy_newmv_count = 0;
-
-  // Scan the second outer area.
-  scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack[ref_frame],
-                &row_match_count, &dummy_newmv_count, gm_mv_candidates,
-                &refmv_count[ref_frame]);
-
-  for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) {
-    const int row_offset = -(idx << 1) + 1 + row_adj;
-    const int col_offset = -(idx << 1) + 1 + col_adj;
-
-    if (abs(row_offset) <= abs(max_row_offset) &&
-        abs(row_offset) > processed_rows)
-      scan_row_mbmi(cm, xd, mi_row, mi_col, rf, row_offset,
-                    ref_mv_stack[ref_frame], &refmv_count[ref_frame],
-                    &row_match_count, &dummy_newmv_count, gm_mv_candidates,
-                    max_row_offset, &processed_rows);
-
-    if (abs(col_offset) <= abs(max_col_offset) &&
-        abs(col_offset) > processed_cols)
-      scan_col_mbmi(cm, xd, mi_row, mi_col, rf, col_offset,
-                    ref_mv_stack[ref_frame], &refmv_count[ref_frame],
-                    &col_match_count, &dummy_newmv_count, gm_mv_candidates,
-                    max_col_offset, &processed_cols);
-  }
-
-  uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
-
-  switch (nearest_match) {
-    case 0:
-      mode_context[ref_frame] |= 0;
-      if (ref_match_count >= 1) mode_context[ref_frame] |= 1;
-      if (ref_match_count == 1)
-        mode_context[ref_frame] |= (1 << REFMV_OFFSET);
-      else if (ref_match_count >= 2)
-        mode_context[ref_frame] |= (2 << REFMV_OFFSET);
-      break;
-    case 1:
-      mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3;
-      if (ref_match_count == 1)
-        mode_context[ref_frame] |= (3 << REFMV_OFFSET);
-      else if (ref_match_count >= 2)
-        mode_context[ref_frame] |= (4 << REFMV_OFFSET);
-      break;
-    case 2:
-    default:
-      if (newmv_count >= 1)
-        mode_context[ref_frame] |= 4;
-      else
-        mode_context[ref_frame] |= 5;
-
-      mode_context[ref_frame] |= (5 << REFMV_OFFSET);
-      break;
-  }
-
-  // Rank the likelihood and assign nearest and near mvs.
-  int len = nearest_refmv_count;
-  while (len > 0) {
-    int nr_len = 0;
-    for (int idx = 1; idx < len; ++idx) {
-      if (ref_mv_stack[ref_frame][idx - 1].weight <
-          ref_mv_stack[ref_frame][idx].weight) {
-        CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1];
-        ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx];
-        ref_mv_stack[ref_frame][idx] = tmp_mv;
-        nr_len = idx;
-      }
-    }
-    len = nr_len;
-  }
-
-  len = refmv_count[ref_frame];
-  while (len > nearest_refmv_count) {
-    int nr_len = nearest_refmv_count;
-    for (int idx = nearest_refmv_count + 1; idx < len; ++idx) {
-      if (ref_mv_stack[ref_frame][idx - 1].weight <
-          ref_mv_stack[ref_frame][idx].weight) {
-        CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1];
-        ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx];
-        ref_mv_stack[ref_frame][idx] = tmp_mv;
-        nr_len = idx;
-      }
-    }
-    len = nr_len;
-  }
-
-  if (rf[1] > NONE_FRAME) {
-    // TODO(jingning, yunqing): Refactor and consolidate the compound and
-    // single reference frame modes. Reduce unnecessary redundancy.
-    if (refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES) {
-      int_mv ref_id[2][2], ref_diff[2][2];
-      int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 };
-
-      int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w);
-      mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
-      int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h);
-      mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
-      int mi_size = AOMMIN(mi_width, mi_height);
-
-      for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) {
-        const MB_MODE_INFO *const candidate = &xd->mi[-xd->mi_stride + idx];
-        const int candidate_bsize = candidate->sb_type;
-
-        for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
-          MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
-
-          for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
-            if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
-              ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
-              ++ref_id_count[cmp_idx];
-            } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
-              int_mv this_mv = candidate->mv[rf_idx];
-              if (cm->ref_frame_sign_bias[can_rf] !=
-                  cm->ref_frame_sign_bias[rf[cmp_idx]]) {
-                this_mv.as_mv.row = -this_mv.as_mv.row;
-                this_mv.as_mv.col = -this_mv.as_mv.col;
-              }
-              ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
-              ++ref_diff_count[cmp_idx];
-            }
-          }
-        }
-        idx += mi_size_wide[candidate_bsize];
-      }
-
-      for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) {
-        const MB_MODE_INFO *const candidate = &xd->mi[idx * xd->mi_stride - 1];
-        const int candidate_bsize = candidate->sb_type;
-
-        for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
-          MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
-
-          for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
-            if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
-              ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
-              ++ref_id_count[cmp_idx];
-            } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
-              int_mv this_mv = candidate->mv[rf_idx];
-              if (cm->ref_frame_sign_bias[can_rf] !=
-                  cm->ref_frame_sign_bias[rf[cmp_idx]]) {
-                this_mv.as_mv.row = -this_mv.as_mv.row;
-                this_mv.as_mv.col = -this_mv.as_mv.col;
-              }
-              ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
-              ++ref_diff_count[cmp_idx];
-            }
-          }
-        }
-        idx += mi_size_high[candidate_bsize];
-      }
-
-      // Build up the compound mv predictor
-      int_mv comp_list[3][2];
-
-      for (int idx = 0; idx < 2; ++idx) {
-        int comp_idx = 0;
-        for (int list_idx = 0; list_idx < ref_id_count[idx] && comp_idx < 2;
-             ++list_idx, ++comp_idx)
-          comp_list[comp_idx][idx] = ref_id[idx][list_idx];
-        for (int list_idx = 0; list_idx < ref_diff_count[idx] && comp_idx < 2;
-             ++list_idx, ++comp_idx)
-          comp_list[comp_idx][idx] = ref_diff[idx][list_idx];
-        for (; comp_idx < 3; ++comp_idx)
-          comp_list[comp_idx][idx] = gm_mv_candidates[idx];
-      }
-
-      if (refmv_count[ref_frame]) {
-        assert(refmv_count[ref_frame] == 1);
-        if (comp_list[0][0].as_int ==
-                ref_mv_stack[ref_frame][0].this_mv.as_int &&
-            comp_list[0][1].as_int ==
-                ref_mv_stack[ref_frame][0].comp_mv.as_int) {
-          ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
-              comp_list[1][0];
-          ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
-              comp_list[1][1];
-        } else {
-          ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
-              comp_list[0][0];
-          ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
-              comp_list[0][1];
-        }
-        ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2;
-        ++refmv_count[ref_frame];
-      } else {
-        for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) {
-          ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
-              comp_list[idx][0];
-          ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
-              comp_list[idx][1];
-          ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2;
-          ++refmv_count[ref_frame];
-        }
-      }
-    }
-
-    assert(refmv_count[ref_frame] >= 2);
-
-    for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
-      clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
-                   xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
-      clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv,
-                   xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
-    }
-  } else {
-    // Handle single reference frame extension
-    int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w);
-    mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
-    int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h);
-    mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
-    int mi_size = AOMMIN(mi_width, mi_height);
-
-    for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size &&
-                      refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
-      const MB_MODE_INFO *const candidate = &xd->mi[-xd->mi_stride + idx];
-      const int candidate_bsize = candidate->sb_type;
-
-      // TODO(jingning): Refactor the following code.
-      for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
-        if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
-          int_mv this_mv = candidate->mv[rf_idx];
-          if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
-              cm->ref_frame_sign_bias[ref_frame]) {
-            this_mv.as_mv.row = -this_mv.as_mv.row;
-            this_mv.as_mv.col = -this_mv.as_mv.col;
-          }
-          int stack_idx;
-          for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
-            int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
-            if (this_mv.as_int == stack_mv.as_int) break;
-          }
-
-          if (stack_idx == refmv_count[ref_frame]) {
-            ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
-
-            // TODO(jingning): Set an arbitrary small number here. The weight
-            // doesn't matter as long as it is properly initialized.
-            ref_mv_stack[ref_frame][stack_idx].weight = 2;
-            ++refmv_count[ref_frame];
-          }
-        }
-      }
-      idx += mi_size_wide[candidate_bsize];
-    }
-
-    for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
-                      refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
-      const MB_MODE_INFO *const candidate = &xd->mi[idx * xd->mi_stride - 1];
-      const int candidate_bsize = candidate->sb_type;
-
-      // TODO(jingning): Refactor the following code.
-      for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
-        if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
-          int_mv this_mv = candidate->mv[rf_idx];
-          if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
-              cm->ref_frame_sign_bias[ref_frame]) {
-            this_mv.as_mv.row = -this_mv.as_mv.row;
-            this_mv.as_mv.col = -this_mv.as_mv.col;
-          }
-          int stack_idx;
-          for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
-            int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
-            if (this_mv.as_int == stack_mv.as_int) break;
-          }
-
-          if (stack_idx == refmv_count[ref_frame]) {
-            ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
-
-            // TODO(jingning): Set an arbitrary small number here. The weight
-            // doesn't matter as long as it is properly initialized.
-            ref_mv_stack[ref_frame][stack_idx].weight = 2;
-            ++refmv_count[ref_frame];
-          }
-        }
-      }
-      idx += mi_size_high[candidate_bsize];
-    }
-
-    for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
-      clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
-                   xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
-    }
-
-    if (mv_ref_list != NULL) {
-      for (int idx = refmv_count[ref_frame]; idx < MAX_MV_REF_CANDIDATES; ++idx)
-        mv_ref_list[rf[0]][idx].as_int = gm_mv_candidates[0].as_int;
-
-      for (int idx = 0;
-           idx < AOMMIN(MAX_MV_REF_CANDIDATES, refmv_count[ref_frame]); ++idx) {
-        mv_ref_list[rf[0]][idx].as_int =
-            ref_mv_stack[ref_frame][idx].this_mv.as_int;
-      }
-    }
-  }
-}
-
-static void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                      MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                      uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
-                      CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
-                      int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
-                      int_mv *global_mvs, int mi_row, int mi_col,
-                      int16_t *mode_context) {
-  int_mv zeromv[2];
-  BLOCK_SIZE bsize = mi->sb_type;
-  MV_REFERENCE_FRAME rf[2];
-  av1_set_ref_frame(rf, ref_frame);
-
-  if (ref_frame < REF_FRAMES) {
-    if (ref_frame != INTRA_FRAME) {
-      global_mvs[ref_frame] = gm_get_motion_vector(
-          &cm->global_motion[ref_frame], cm->allow_high_precision_mv, bsize,
-          mi_col, mi_row, cm->cur_frame_force_integer_mv);
-    } else {
-      global_mvs[ref_frame].as_int = INVALID_MV;
-    }
-  }
-
-  if (ref_frame != INTRA_FRAME) {
-    zeromv[0].as_int =
-        gm_get_motion_vector(&cm->global_motion[rf[0]],
-                             cm->allow_high_precision_mv, bsize, mi_col, mi_row,
-                             cm->cur_frame_force_integer_mv)
-            .as_int;
-    zeromv[1].as_int =
-        (rf[1] != NONE_FRAME)
-            ? gm_get_motion_vector(&cm->global_motion[rf[1]],
-                                   cm->allow_high_precision_mv, bsize, mi_col,
-                                   mi_row, cm->cur_frame_force_integer_mv)
-                  .as_int
-            : 0;
-  } else {
-    zeromv[0].as_int = zeromv[1].as_int = 0;
-  }
-
-  setup_ref_mv_list(cm, xd, ref_frame, ref_mv_count, ref_mv_stack, mv_ref_list,
-                    zeromv, mi_row, mi_col, mode_context);
-}
-
-static void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
-  cm->cur_frame.cur_frame_offset = cm->frame_offset;
-
-  MV_REFERENCE_FRAME ref_frame;
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
-    if (buf_idx >= 0)
-      cm->cur_frame.ref_frame_offset[ref_frame - LAST_FRAME] =
-          cm->buffer_pool.frame_bufs[buf_idx].cur_frame_offset;
-  }
-}
-
-#define MAX_OFFSET_WIDTH 64
-#define MAX_OFFSET_HEIGHT 0
-
-static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
-                              int blk_col, MV mv, int sign_bias) {
-  const int base_blk_row = (blk_row >> 3) << 3;
-  const int base_blk_col = (blk_col >> 3) << 3;
-
-  const int row_offset = (mv.row >= 0) ? (mv.row >> (4 + MI_SIZE_LOG2))
-                                       : -((-mv.row) >> (4 + MI_SIZE_LOG2));
-
-  const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2))
-                                       : -((-mv.col) >> (4 + MI_SIZE_LOG2));
-
-  int row = (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
-  int col = (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
-
-  if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 ||
-      col >= (cm->mi_cols >> 1))
-    return 0;
-
-  if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) ||
-      row >= base_blk_row + 8 + (MAX_OFFSET_HEIGHT >> 3) ||
-      col < base_blk_col - (MAX_OFFSET_WIDTH >> 3) ||
-      col >= base_blk_col + 8 + (MAX_OFFSET_WIDTH >> 3))
-    return 0;
-
-  *mi_r = row;
-  *mi_c = col;
-
-  return 1;
-}
-
-static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame,
-                                   int dir,
-                                   const int from_x4, const int to_x4,
-                                   const int from_y4, const int to_y4) {
-  TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
-  int ref_offset[TOTAL_REFS_PER_FRAME] = { 0 };
-  int ref_sign[TOTAL_REFS_PER_FRAME] = { 0 };
-
-  (void)dir;
-
-  int ref_frame_idx = cm->frame_refs[FWD_RF_OFFSET(ref_frame)].idx;
-  if (ref_frame_idx < 0) return 0;
-
-  if (cm->buffer_pool.frame_bufs[ref_frame_idx].intra_only) return 0;
-
-  if (cm->buffer_pool.frame_bufs[ref_frame_idx].mi_rows != cm->mi_rows ||
-      cm->buffer_pool.frame_bufs[ref_frame_idx].mi_cols != cm->mi_cols)
-    return 0;
-
-  int ref_frame_index =
-      cm->buffer_pool.frame_bufs[ref_frame_idx].cur_frame_offset;
-  unsigned int *ref_rf_idx =
-      &cm->buffer_pool.frame_bufs[ref_frame_idx].ref_frame_offset[0];
-   int cur_frame_index = cm->cur_frame.cur_frame_offset;
-  int ref_to_cur = get_relative_dist(cm, ref_frame_index, cur_frame_index);
-
-  for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
-    ref_offset[rf] =
-        get_relative_dist(cm, ref_frame_index, ref_rf_idx[rf - LAST_FRAME]);
-    // note the inverted sign
-    ref_sign[rf] =
-        get_relative_dist(cm, ref_rf_idx[rf - LAST_FRAME], ref_frame_index) < 0;
-  }
-
-  if (dir == 2) ref_to_cur = -ref_to_cur;
-
-  MV_REF *mv_ref_base = cm->buffer_pool.frame_bufs[ref_frame_idx].mvs;
-  const ptrdiff_t mv_stride =
-    cm->buffer_pool.frame_bufs[ref_frame_idx].mv_stride;
-  const int mvs_rows = (cm->mi_rows + 1) >> 1;
-  const int mvs_cols = (cm->mi_cols + 1) >> 1;
-
-  assert(from_y4 >= 0);
-  const int row_start8 = from_y4 >> 1;
-  const int row_end8 = imin(to_y4 >> 1, mvs_rows);
-  const int col_start8 = imax((from_x4 - (MAX_OFFSET_WIDTH >> 2)) >> 1, 0);
-  const int col_end8 = imin((to_x4 + (MAX_OFFSET_WIDTH >> 2)) >> 1, mvs_cols);
-  for (int blk_row = row_start8; blk_row < row_end8; ++blk_row) {
-    for (int blk_col = col_start8; blk_col < col_end8; ++blk_col) {
-      MV_REF *mv_ref = &mv_ref_base[((blk_row << 1) + 1) * mv_stride +
-                                     (blk_col << 1) + 1];
-      int diridx;
-      const int ref0 = mv_ref->ref_frame[0], ref1 = mv_ref->ref_frame[1];
-      if (ref1 > 0 && ref_sign[ref1] &&
-          abs(mv_ref->mv[1].as_mv.row) < (1 << 12) &&
-          abs(mv_ref->mv[1].as_mv.col) < (1 << 12))
-      {
-        diridx = 1;
-      } else if (ref0 > 0 && ref_sign[ref0] &&
-                 abs(mv_ref->mv[0].as_mv.row) < (1 << 12) &&
-                 abs(mv_ref->mv[0].as_mv.col) < (1 << 12))
-      {
-        diridx = 0;
-      } else {
-        continue;
-      }
-      MV fwd_mv = mv_ref->mv[diridx].as_mv;
-
-      if (mv_ref->ref_frame[diridx] > INTRA_FRAME) {
-        int_mv this_mv;
-        int mi_r, mi_c;
-        const int ref_frame_offset = ref_offset[mv_ref->ref_frame[diridx]];
-
-        int pos_valid = abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
-                        ref_frame_offset > 0 &&
-                        abs(ref_to_cur) <= MAX_FRAME_DISTANCE;
-
-        if (pos_valid) {
-          get_mv_projection(&this_mv.as_mv, fwd_mv, ref_to_cur,
-                            ref_frame_offset);
-          pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
-                                         this_mv.as_mv, dir >> 1);
-        }
-
-        if (pos_valid && mi_c >= (from_x4 >> 1) && mi_c < (to_x4 >> 1)) {
-          int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
-
-          tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
-          tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
-          tpl_mvs_base[mi_offset].ref_frame_offset = ref_frame_offset;
-        }
-      }
-    }
-  }
-
-  return 1;
-}
-
-static void av1_setup_motion_field(AV1_COMMON *cm) {
-  if (!cm->seq_params.enable_order_hint) return;
-
-  TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
-  int size = (((cm->mi_rows + 31) & ~31) >> 1) * (cm->mi_stride >> 1);
-  for (int idx = 0; idx < size; ++idx) {
-    tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV;
-    tpl_mvs_base[idx].ref_frame_offset = 0;
-  }
-
-  memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side));
-  RefCntBuffer *const frame_bufs = cm->buffer_pool.frame_bufs;
-
-  const int cur_order_hint = cm->cur_frame.cur_frame_offset;
-  int *const ref_buf_idx = cm->ref_buf_idx;
-  int *const ref_order_hint = cm->ref_order_hint;
-
-  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-    const int ref_idx = ref_frame - LAST_FRAME;
-    const int buf_idx = cm->frame_refs[ref_idx].idx;
-    int order_hint = 0;
-
-    if (buf_idx >= 0) order_hint = frame_bufs[buf_idx].cur_frame_offset;
-
-    ref_buf_idx[ref_idx] = buf_idx;
-    ref_order_hint[ref_idx] = order_hint;
-
-    if (get_relative_dist(cm, order_hint, cur_order_hint) > 0)
-      cm->ref_frame_side[ref_frame] = 1;
-    else if (order_hint == cur_order_hint)
-      cm->ref_frame_side[ref_frame] = -1;
-  }
-}
-
-enum BlockSize {
-    BS_128x128,
-    BS_128x64,
-    BS_64x128,
-    BS_64x64,
-    BS_64x32,
-    BS_64x16,
-    BS_32x64,
-    BS_32x32,
-    BS_32x16,
-    BS_32x8,
-    BS_16x64,
-    BS_16x32,
-    BS_16x16,
-    BS_16x8,
-    BS_16x4,
-    BS_8x32,
-    BS_8x16,
-    BS_8x8,
-    BS_8x4,
-    BS_4x16,
-    BS_4x8,
-    BS_4x4,
-    N_BS_SIZES,
-};
-extern const uint8_t dav1d_block_dimensions[N_BS_SIZES][4];
-const uint8_t dav1d_bs_to_sbtype[N_BS_SIZES] = {
-    [BS_128x128] = BLOCK_128X128,
-    [BS_128x64] = BLOCK_128X64,
-    [BS_64x128] = BLOCK_64X128,
-    [BS_64x64] = BLOCK_64X64,
-    [BS_64x32] = BLOCK_64X32,
-    [BS_64x16] = BLOCK_64X16,
-    [BS_32x64] = BLOCK_32X64,
-    [BS_32x32] = BLOCK_32X32,
-    [BS_32x16] = BLOCK_32X16,
-    [BS_32x8] = BLOCK_32X8,
-    [BS_16x64] = BLOCK_16X64,
-    [BS_16x32] = BLOCK_16X32,
-    [BS_16x16] = BLOCK_16X16,
-    [BS_16x8] = BLOCK_16X8,
-    [BS_16x4] = BLOCK_16X4,
-    [BS_8x32] = BLOCK_8X32,
-    [BS_8x16] = BLOCK_8X16,
-    [BS_8x8] = BLOCK_8X8,
-    [BS_8x4] = BLOCK_8X4,
-    [BS_4x16] = BLOCK_4X16,
-    [BS_4x8] = BLOCK_4X8,
-    [BS_4x4] = BLOCK_4X4,
-};
-const uint8_t dav1d_sbtype_to_bs[BLOCK_SIZES_ALL] = {
-    [BLOCK_128X128] = BS_128x128,
-    [BLOCK_128X64] = BS_128x64,
-    [BLOCK_64X128] = BS_64x128,
-    [BLOCK_64X64] = BS_64x64,
-    [BLOCK_64X32] = BS_64x32,
-    [BLOCK_64X16] = BS_64x16,
-    [BLOCK_32X64] = BS_32x64,
-    [BLOCK_32X32] = BS_32x32,
-    [BLOCK_32X16] = BS_32x16,
-    [BLOCK_32X8] = BS_32x8,
-    [BLOCK_16X64] = BS_16x64,
-    [BLOCK_16X32] = BS_16x32,
-    [BLOCK_16X16] = BS_16x16,
-    [BLOCK_16X8] = BS_16x8,
-    [BLOCK_16X4] = BS_16x4,
-    [BLOCK_8X32] = BS_8x32,
-    [BLOCK_8X16] = BS_8x16,
-    [BLOCK_8X8] = BS_8x8,
-    [BLOCK_8X4] = BS_8x4,
-    [BLOCK_4X16] = BS_4x16,
-    [BLOCK_4X8] = BS_4x8,
-    [BLOCK_4X4] = BS_4x4,
-};
-
-#include <stdio.h>
-
-void dav1d_find_ref_mvs(CANDIDATE_MV *mvstack, int *cnt, int_mv (*mvlist)[2],
-                        int *ctx, int refidx_dav1d[2],
-                        int w4, int h4, int bs, int bp, int by4, int bx4,
-                        int tile_col_start4, int tile_col_end4,
-                        int tile_row_start4, int tile_row_end4,
-                        AV1_COMMON *cm);
-void dav1d_find_ref_mvs(CANDIDATE_MV *mvstack, int *cnt, int_mv (*mvlist)[2],
-                        int *ctx, int refidx_dav1d[2],
-                        int w4, int h4, int bs, int bp, int by4, int bx4,
-                        int tile_col_start4, int tile_col_end4,
-                        int tile_row_start4, int tile_row_end4,
-                        AV1_COMMON *cm)
-{
-    const int bw4 = dav1d_block_dimensions[bs][0];
-    const int bh4 = dav1d_block_dimensions[bs][1];
-    int stride = (int) cm->cur_frame.mv_stride;
-    MACROBLOCKD xd = (MACROBLOCKD) {
-        .n8_w = bw4,
-        .n8_h = bh4,
-        .mi_stride = stride,
-        .up_available = by4 > tile_row_start4,
-        .left_available = bx4 > tile_col_start4,
-        .tile = {
-            .mi_col_end = AOMMIN(w4, tile_col_end4),
-            .mi_row_end = AOMMIN(h4, tile_row_end4),
-            .tg_horz_boundary = 0,
-            .mi_row_start = tile_row_start4,
-            .mi_col_start = tile_col_start4,
-        },
-        .mi = (MB_MODE_INFO *) &cm->cur_frame.mvs[by4 * stride + bx4],
-        .mb_to_bottom_edge = (h4 - bh4 - by4) * 32,
-        .mb_to_left_edge = -bx4 * 32,
-        .mb_to_right_edge = (w4 - bw4 - bx4) * 32,
-        .mb_to_top_edge = -by4 * 32,
-        .is_sec_rect = 0,
-        .cur_mi = {
-            .partition = bp,
-        },
-    };
-    xd.mi->sb_type = dav1d_bs_to_sbtype[bs];
-    if (xd.n8_w < xd.n8_h) {
-        // Only mark is_sec_rect as 1 for the last block.
-        // For PARTITION_VERT_4, it would be (0, 0, 0, 1);
-        // For other partitions, it would be (0, 1).
-        if (!((bx4 + xd.n8_w) & (xd.n8_h - 1))) xd.is_sec_rect = 1;
-    }
-
-    if (xd.n8_w > xd.n8_h)
-        if (by4 & (xd.n8_w - 1)) xd.is_sec_rect = 1;
-
-    MV_REFERENCE_FRAME rf[2] = { refidx_dav1d[0] + 1, refidx_dav1d[1] + 1 };
-    const int refidx = av1_ref_frame_type(rf);
-    int16_t single_context[MODE_CTX_REF_FRAMES];
-    uint8_t mv_cnt[MODE_CTX_REF_FRAMES];
-    CANDIDATE_MV mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
-    int_mv mv_list[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
-    int_mv gmvs[MODE_CTX_REF_FRAMES];
-    av1_find_mv_refs(cm, &xd, xd.mi, refidx, mv_cnt,
-                     mv_stack, mv_list, gmvs, by4, bx4,
-                     single_context);
-    for (int i = 0; i < mv_cnt[refidx]; i++)
-        mvstack[i] = mv_stack[refidx][i];
-    *cnt = mv_cnt[refidx];
-
-    mvlist[0][0] = mv_list[refidx_dav1d[0] + 1][0];
-    mvlist[0][1] = mv_list[refidx_dav1d[0] + 1][1];
-    if (refidx_dav1d[1] != -1) {
-        mvlist[1][0] = mv_list[refidx_dav1d[1] + 1][0];
-        mvlist[1][1] = mv_list[refidx_dav1d[1] + 1][1];
-    }
-
-    if (ctx) {
-        if (refidx_dav1d[1] == -1)
-            *ctx = single_context[refidx_dav1d[0] + 1];
-        else
-            *ctx = av1_mode_context_analyzer(single_context, rf);
-    }
-}
-
-int dav1d_init_ref_mv_common(AV1_COMMON *cm, const int w8, const int h8,
-                             const ptrdiff_t stride, const int allow_sb128,
-                             MV_REF *cur, MV_REF *ref_mvs[7],
-                             const unsigned cur_poc,
-                             const unsigned ref_poc[7],
-                             const unsigned ref_ref_poc[7][7],
-                             const Dav1dWarpedMotionParams gmv[7],
-                             const int allow_hp, const int force_int_mv,
-                             const int allow_ref_frame_mvs,
-                             const int order_hint);
-int dav1d_init_ref_mv_common(AV1_COMMON *cm, const int w8, const int h8,
-                             const ptrdiff_t stride, const int allow_sb128,
-                             MV_REF *cur, MV_REF *ref_mvs[7],
-                             const unsigned cur_poc,
-                             const unsigned ref_poc[7],
-                             const unsigned ref_ref_poc[7][7],
-                             const Dav1dWarpedMotionParams gmv[7],
-                             const int allow_hp, const int force_int_mv,
-                             const int allow_ref_frame_mvs,
-                             const int order_hint)
-{
-    if (cm->mi_cols != (w8 << 1) || cm->mi_rows != (h8 << 1)) {
-        const int align_h = (h8 + 15) & ~15;
-        if (cm->tpl_mvs) free(cm->tpl_mvs);
-        cm->tpl_mvs = malloc(sizeof(*cm->tpl_mvs) * (stride >> 1) * align_h);
-        if (!cm->tpl_mvs) {
-            cm->mi_cols = cm->mi_rows = 0;
-            return DAV1D_ERR(ENOMEM);
-        }
-        for (int i = 0; i < 7; i++)
-            cm->frame_refs[i].idx = i;
-        cm->mi_cols = w8 << 1;
-        cm->mi_rows = h8 << 1;
-        cm->mi_stride = (int) stride;
-        for (int i = 0; i < 7; i++) {
-            cm->buffer_pool.frame_bufs[i].mi_rows = cm->mi_rows;
-            cm->buffer_pool.frame_bufs[i].mi_cols = cm->mi_cols;
-            cm->buffer_pool.frame_bufs[i].mv_stride = stride;
-        }
-        cm->cur_frame.mv_stride = stride;
-    }
-
-    cm->allow_high_precision_mv = allow_hp;
-    cm->seq_params.sb_size = allow_sb128 ? BLOCK_128X128 : BLOCK_64X64;
-
-    cm->seq_params.enable_order_hint = !!order_hint;
-    cm->seq_params.order_hint_bits_minus1 = order_hint - 1;
-    // FIXME get these from the sequence/frame headers instead of hardcoding
-    cm->frame_parallel_decode = 0;
-    cm->cur_frame_force_integer_mv = force_int_mv;
-
-    memcpy(&cm->global_motion[1], gmv, sizeof(*gmv) * 7);
-
-    cm->frame_offset = cur_poc;
-    cm->allow_ref_frame_mvs = allow_ref_frame_mvs;
-    cm->cur_frame.mvs = cur;
-    for (int i = 0; i < 7; i++) {
-        cm->buffer_pool.frame_bufs[i].mvs = ref_mvs[i];
-        cm->buffer_pool.frame_bufs[i].intra_only = ref_mvs[i] == NULL;
-        cm->buffer_pool.frame_bufs[i].cur_frame_offset = ref_poc[i];
-        for (int j = 0; j < 7; j++)
-            cm->buffer_pool.frame_bufs[i].ref_frame_offset[j] =
-                ref_ref_poc[i][j];
-    }
-    av1_setup_frame_buf_refs(cm);
-    for (int i = 0; i < 7; i++) {
-        const int ref_poc = cm->buffer_pool.frame_bufs[i].cur_frame_offset;
-        cm->ref_frame_sign_bias[1 + i] = get_relative_dist(cm, ref_poc, cur_poc) > 0;
-    }
-    if (allow_ref_frame_mvs) {
-        av1_setup_motion_field(cm);
-    }
-
-    return 0;
-}
-
-void dav1d_init_ref_mv_tile_row(AV1_COMMON *cm,
-                                int tile_col_start4, int tile_col_end4,
-                                int row_start4, int row_end4);
-void dav1d_init_ref_mv_tile_row(AV1_COMMON *cm,
-                                int tile_col_start4, int tile_col_end4,
-                                int row_start4, int row_end4)
-{
-  RefCntBuffer *const frame_bufs = cm->buffer_pool.frame_bufs;
-  const int cur_order_hint = cm->cur_frame.cur_frame_offset;
-  int *const ref_buf_idx = cm->ref_buf_idx;
-  int *const ref_order_hint = cm->ref_order_hint;
-
-  int ref_stamp = MFMV_STACK_SIZE - 1;
-
-  if (ref_buf_idx[LAST_FRAME - LAST_FRAME] >= 0) {
-    const int alt_of_lst_order_hint =
-        frame_bufs[ref_buf_idx[LAST_FRAME - LAST_FRAME]]
-            .ref_frame_offset[ALTREF_FRAME - LAST_FRAME];
-
-    const int is_lst_overlay =
-        (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]);
-      if (!is_lst_overlay) motion_field_projection(cm, LAST_FRAME, 2,
-                                                   tile_col_start4, tile_col_end4,
-                                                   row_start4, row_end4);
-    --ref_stamp;
-  }
-
-  if (get_relative_dist(cm, ref_order_hint[BWDREF_FRAME - LAST_FRAME],
-                        cur_order_hint) > 0) {
-      if (motion_field_projection(cm, BWDREF_FRAME, 0,
-                                  tile_col_start4, tile_col_end4,
-                                  row_start4, row_end4)) --ref_stamp;
-  }
-
-  if (get_relative_dist(cm, ref_order_hint[ALTREF2_FRAME - LAST_FRAME],
-                        cur_order_hint) > 0) {
-      if (motion_field_projection(cm, ALTREF2_FRAME, 0,
-                                  tile_col_start4, tile_col_end4,
-                                  row_start4, row_end4)) --ref_stamp;
-  }
-
-  if (get_relative_dist(cm, ref_order_hint[ALTREF_FRAME - LAST_FRAME],
-                        cur_order_hint) > 0 &&
-      ref_stamp >= 0)
-      if (motion_field_projection(cm, ALTREF_FRAME, 0,
-                                  tile_col_start4, tile_col_end4,
-                                  row_start4, row_end4)) --ref_stamp;
-
-  if (ref_stamp >= 0 && ref_buf_idx[LAST2_FRAME - LAST_FRAME] >= 0)
-      if (motion_field_projection(cm, LAST2_FRAME, 2,
-                                  tile_col_start4, tile_col_end4,
-                                  row_start4, row_end4)) --ref_stamp;
-}
-
-AV1_COMMON *dav1d_alloc_ref_mv_common(void);
-AV1_COMMON *dav1d_alloc_ref_mv_common(void) {
-    return calloc(1, sizeof(AV1_COMMON));
-}
-
-void dav1d_free_ref_mv_common(AV1_COMMON *cm);
-void dav1d_free_ref_mv_common(AV1_COMMON *cm) {
-    if (cm->tpl_mvs) free(cm->tpl_mvs);
-    free(cm);
-}
--- a/src/ref_mvs.h
+++ /dev/null
@@ -1,178 +1,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef DAV1D_SRC_REF_MVS_H
-#define DAV1D_SRC_REF_MVS_H
-
-#include <stddef.h>
-
-#include "src/levels.h"
-#include "src/tables.h"
-
-typedef struct refmvs {
-    mv mv[2];
-    int8_t ref[2]; // [0] = 0: intra=1, [1] = -1: comp=0
-    int8_t mode, sb_type;
-} refmvs;
-
-typedef struct candidate_mv {
-    mv this_mv;
-    mv comp_mv;
-    int weight;
-} candidate_mv;
-
-typedef struct AV1_COMMON AV1_COMMON;
-
-// call once per frame thread
-AV1_COMMON *dav1d_alloc_ref_mv_common(void);
-void dav1d_free_ref_mv_common(AV1_COMMON *cm);
-
-// call once per frame
-int dav1d_init_ref_mv_common(AV1_COMMON *cm, int w8, int h8,
-                             ptrdiff_t stride, int allow_sb128,
-                             refmvs *cur, refmvs *ref_mvs[7],
-                             unsigned cur_poc,
-                             const unsigned ref_poc[7],
-                             const unsigned ref_ref_poc[7][7],
-                             const Dav1dWarpedMotionParams gmv[7],
-                             int allow_hp, int force_int_mv,
-                             int allow_ref_frame_mvs, int order_hint);
-
-// call for start of each sbrow per tile
-void dav1d_init_ref_mv_tile_row(AV1_COMMON *cm,
-                                int tile_col_start4, int tile_col_end4,
-                                int row_start4, int row_end4);
-
-// call for each block
-void dav1d_find_ref_mvs(candidate_mv *mvstack, int *cnt, mv (*mvlist)[2],
-                        int *ctx, int refidx[2], int w4, int h4,
-                        enum BlockSize bs, enum BlockPartition bp,
-                        int by4, int bx4, int tile_col_start4,
-                        int tile_col_end4, int tile_row_start4,
-                        int tile_row_end4, AV1_COMMON *cm);
-
-extern const uint8_t dav1d_bs_to_sbtype[];
-extern const uint8_t dav1d_sbtype_to_bs[];
-static inline void splat_oneref_mv(refmvs *r, const ptrdiff_t stride,
-                                   const int by4, const int bx4,
-                                   const enum BlockSize bs,
-                                   const enum InterPredMode mode,
-                                   const int ref, const mv mv,
-                                   const int is_interintra)
-{
-    const int bw4 = dav1d_block_dimensions[bs][0];
-    int bh4 = dav1d_block_dimensions[bs][1];
-
-    r += by4 * stride + bx4;
-    const refmvs tmpl = (refmvs) {
-        .ref = { ref + 1, is_interintra ? 0 : -1 },
-        .mv = { mv },
-        .sb_type = dav1d_bs_to_sbtype[bs],
-        .mode = N_INTRA_PRED_MODES + mode,
-    };
-    do {
-        for (int x = 0; x < bw4; x++)
-            r[x] = tmpl;
-        r += stride;
-    } while (--bh4);
-}
-
-static inline void splat_intrabc_mv(refmvs *r, const ptrdiff_t stride,
-                                    const int by4, const int bx4,
-                                    const enum BlockSize bs, const mv mv)
-{
-    const int bw4 = dav1d_block_dimensions[bs][0];
-    int bh4 = dav1d_block_dimensions[bs][1];
-
-    r += by4 * stride + bx4;
-    const refmvs tmpl = (refmvs) {
-        .ref = { 0, -1 },
-        .mv = { mv },
-        .sb_type = dav1d_bs_to_sbtype[bs],
-        .mode = DC_PRED,
-    };
-    do {
-        for (int x = 0; x < bw4; x++)
-            r[x] = tmpl;
-        r += stride;
-    } while (--bh4);
-}
-
-static inline void splat_tworef_mv(refmvs *r, const ptrdiff_t stride,
-                                   const int by4, const int bx4,
-                                   const enum BlockSize bs,
-                                   const enum CompInterPredMode mode,
-                                   const int ref1, const int ref2,
-                                   const mv mv1, const mv mv2)
-{
-    const int bw4 = dav1d_block_dimensions[bs][0];
-    int bh4 = dav1d_block_dimensions[bs][1];
-
-    r += by4 * stride + bx4;
-    const refmvs tmpl = (refmvs) {
-        .ref = { ref1 + 1, ref2 + 1 },
-        .mv = { mv1, mv2 },
-        .sb_type = dav1d_bs_to_sbtype[bs],
-        .mode = N_INTRA_PRED_MODES + N_INTER_PRED_MODES + mode,
-    };
-    do {
-        for (int x = 0; x < bw4; x++)
-            r[x] = tmpl;
-        r += stride;
-    } while (--bh4);
-}
-
-static inline void splat_intraref(refmvs *r, const ptrdiff_t stride,
-                                  const int by4, const int bx4,
-                                  const enum BlockSize bs,
-                                  const enum IntraPredMode mode)
-{
-    const int bw4 = dav1d_block_dimensions[bs][0];
-    int bh4 = dav1d_block_dimensions[bs][1];
-
-    r += by4 * stride + bx4;
-    do {
-        int x;
-
-        for (x = 0; x < bw4; x++)
-            r[x] = (refmvs) {
-                .ref = { 0, -1 },
-                .mv = { [0] = { .y = -0x8000, .x = -0x8000 }, },
-                .sb_type = dav1d_bs_to_sbtype[bs],
-                .mode = mode,
-            };
-        r += stride;
-    } while (--bh4);
-}
-
-static inline void fix_mv_precision(const Dav1dFrameHeader *const hdr,
-                                    mv *const mv)
-{
-    if (hdr->force_integer_mv) {
-        const int xmod = mv->x & 7;
-        mv->x &= ~7;
-        mv->x += (xmod > 4 - (mv->x < 0)) << 3;
-        const int ymod = mv->y & 7;
-        mv->y &= ~7;
-        mv->y += (ymod > 4 - (mv->y < 0)) << 3;
-    } else if (!hdr->hp) {
-        if (mv->x & 1) {
-            if (mv->x < 0) mv->x++;
-            else           mv->x--;
-        }
-        if (mv->y & 1) {
-            if (mv->y < 0) mv->y++;
-            else           mv->y--;
-        }
-    }
-}
-
-#endif /* DAV1D_SRC_REF_MVS_H */
--- /dev/null
+++ b/src/refmvs.c
@@ -1,0 +1,915 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * Copyright © 2020, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <limits.h>
+#include <stdlib.h>
+
+#include "dav1d/common.h"
+
+#include "common/intops.h"
+
+#include "src/env.h"
+#include "src/refmvs.h"
+
+/*
+ * Offer the motion vector(s) of spatial neighbour b to the candidate list
+ * mvstack[0..*cnt-1] for the reference (pair) ref.
+ *
+ * The neighbour only contributes if it carries a valid MV and uses ref (for
+ * single prediction: in either of its two reference slots, the first match
+ * wins; for compound prediction: as the exact same pair). If bit 0 of b->mf
+ * is set and the matching gmv[] entry is valid, gmv[] replaces the
+ * neighbour's own MV.
+ *
+ * A candidate that is already in the list has its weight increased by weight;
+ * otherwise it is appended, as long as the list holds fewer than 8 entries.
+ * On every match (even if the list was full), *have_refmv_match is set to 1
+ * and b->mf >> 1 is OR'ed into *have_newmv_match.
+ */
+static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cnt,
+                                  const int weight, const refmvs_block *const b,
+                                  const union refmvs_refpair ref, const mv gmv[2],
+                                  int *const have_newmv_match,
+                                  int *const have_refmv_match)
+{
+    // intra block, no intrabc
+    if (b->mv[0].n == INVALID_MV) return;
+
+    const int use_gmv = b->mf & 1;
+    int idx;
+
+    if (ref.ref[1] != -1) {
+        // compound: both references have to match as a pair
+        if (b->ref.pair != ref.pair) return;
+
+        mv cand0 = b->mv[0], cand1 = b->mv[1];
+        if (use_gmv && gmv[0].n != INVALID_MV) cand0 = gmv[0];
+        if (use_gmv && gmv[1].n != INVALID_MV) cand1 = gmv[1];
+
+        const int n_cands = *cnt;
+        for (idx = 0; idx < n_cands; idx++) {
+            if (mvstack[idx].mv[0].n == cand0.n &&
+                mvstack[idx].mv[1].n == cand1.n)
+            {
+                break;
+            }
+        }
+
+        if (idx < n_cands) {
+            mvstack[idx].weight += weight;
+        } else if (n_cands < 8) {
+            mvstack[n_cands].mv[0] = cand0;
+            mvstack[n_cands].mv[1] = cand1;
+            mvstack[n_cands].weight = weight;
+            *cnt = n_cands + 1;
+        }
+    } else {
+        // single: use the first of the neighbour's references that matches
+        int slot;
+        if (b->ref.ref[0] == ref.ref[0])
+            slot = 0;
+        else if (b->ref.ref[1] == ref.ref[0])
+            slot = 1;
+        else
+            return;
+
+        mv cand = b->mv[slot];
+        if (use_gmv && gmv[0].n != INVALID_MV) cand = gmv[0];
+
+        const int n_cands = *cnt;
+        for (idx = 0; idx < n_cands; idx++)
+            if (mvstack[idx].mv[0].n == cand.n)
+                break;
+
+        if (idx < n_cands) {
+            mvstack[idx].weight += weight;
+        } else if (n_cands < 8) {
+            mvstack[n_cands].mv[0] = cand;
+            mvstack[n_cands].weight = weight;
+            *cnt = n_cands + 1;
+        }
+    }
+
+    // the neighbour matched, regardless of whether the list had room left
+    *have_refmv_match = 1;
+    *have_newmv_match |= b->mf >> 1;
+}
+
+/*
+ * Scan a row of neighbouring blocks, starting at b, and hand each neighbour
+ * visited along the edge to add_spatial_candidate().
+ *
+ * bw4 is the width of the current block and w4 the number of columns to scan,
+ * both in the units used by dav1d_block_dimensions[]. step is the minimum
+ * distance between two visited positions, and max_rows caps the weight given
+ * to a neighbour that covers the whole edge.
+ *
+ * Returns weight >> 1 if the first neighbour is at least as wide as the
+ * current block, and 1 otherwise.
+ */
+static int scan_row(refmvs_candidate *const mvstack, int *const cnt,
+                    const union refmvs_refpair ref, const mv gmv[2],
+                    const refmvs_block *b, const int bw4, const int w4,
+                    const int max_rows, const int step,
+                    int *const have_newmv_match, int *const have_refmv_match)
+{
+    const uint8_t *const first_dim = dav1d_block_dimensions[b->bs];
+    const int first_bw4 = first_dim[0];
+    int run = imax(step, imin(bw4, first_bw4));
+
+    if (first_bw4 >= bw4) {
+        // FIXME weight can be higher for odd blocks (bx4 & 1), but then the
+        // position of the first block has to be odd already, i.e. not just
+        // for row_offset=-3/-5
+        // FIXME why can this not be cand_bw4?
+        int weight = 2;
+        if (bw4 != 1)
+            weight = imax(2, imin(2 * max_rows, first_dim[1]));
+        add_spatial_candidate(mvstack, cnt, run * weight, b, ref, gmv,
+                              have_newmv_match, have_refmv_match);
+        return weight >> 1;
+    }
+
+    // the edge is covered by several narrower neighbours; visit them in turn
+    const refmvs_block *nb = b;
+    int x = 0;
+    while (1) {
+        // FIXME if we overhang above, we could fill a bitmask so we don't have
+        // to repeat the add_spatial_candidate() for the next row, but just increase
+        // the weight here
+        add_spatial_candidate(mvstack, cnt, run * 2, nb, ref, gmv,
+                              have_newmv_match, have_refmv_match);
+        x += run;
+        if (x >= w4) break;
+        nb = &b[x];
+        const int nb_bw4 = dav1d_block_dimensions[nb->bs][0];
+        assert(nb_bw4 < bw4);
+        run = imax(step, nb_bw4);
+    }
+    return 1;
+}
+
+/*
+ * Scan a column of neighbouring blocks and hand each neighbour visited along
+ * the edge to add_spatial_candidate().
+ *
+ * b is an array of row pointers; the scanned column is b[y][bx4] for
+ * increasing y. bh4 is the height of the current block and h4 the number of
+ * rows to scan, both in the units used by dav1d_block_dimensions[]. step is
+ * the minimum distance between two visited positions, and max_cols caps the
+ * weight given to a neighbour that covers the whole edge.
+ *
+ * Returns weight >> 1 if the first neighbour is at least as tall as the
+ * current block, and 1 otherwise.
+ */
+static int scan_col(refmvs_candidate *const mvstack, int *const cnt,
+                    const union refmvs_refpair ref, const mv gmv[2],
+                    /*const*/ refmvs_block *const *b, const int bh4, const int h4,
+                    const int bx4, const int max_cols, const int step,
+                    int *const have_newmv_match, int *const have_refmv_match)
+{
+    const refmvs_block *nb = &b[0][bx4];
+    const uint8_t *const first_dim = dav1d_block_dimensions[nb->bs];
+    const int first_bh4 = first_dim[1];
+    int run = imax(step, imin(bh4, first_bh4));
+
+    if (first_bh4 >= bh4) {
+        // FIXME weight can be higher for odd blocks (by4 & 1), but then the
+        // position of the first block has to be odd already, i.e. not just
+        // for col_offset=-3/-5
+        // FIXME why can this not be cand_bh4?
+        int weight = 2;
+        if (bh4 != 1)
+            weight = imax(2, imin(2 * max_cols, first_dim[0]));
+        add_spatial_candidate(mvstack, cnt, run * weight, nb, ref, gmv,
+                              have_newmv_match, have_refmv_match);
+        return weight >> 1;
+    }
+
+    // the edge is covered by several shorter neighbours; visit them in turn
+    int y = 0;
+    while (1) {
+        // FIXME if we overhang above, we could fill a bitmask so we don't have
+        // to repeat the add_spatial_candidate() for the next row, but just increase
+        // the weight here
+        add_spatial_candidate(mvstack, cnt, run * 2, nb, ref, gmv,
+                              have_newmv_match, have_refmv_match);
+        y += run;
+        if (y >= h4) break;
+        nb = &b[y][bx4];
+        const int nb_bh4 = dav1d_block_dimensions[nb->bs][1];
+        assert(nb_bh4 < bh4);
+        run = imax(step, nb_bh4);
+    }
+    return 1;
+}
+
+/*
+ * Scale motion vector mv by num / den.
+ *
+ * div_mult[den] holds (1 << 14) / den, rounded down, so the division becomes
+ * a multiplication followed by a 14-bit shift. The result is rounded to the
+ * nearest integer, symmetrically around zero, and then clipped to
+ * +/-((1 << 14) - 1) as the AV1 projection process requires. Without the
+ * clip, large inputs produce values that do not fit the 16-bit MV components
+ * and would be silently truncated on return.
+ *
+ * den must be in [1, 31] and num in [-31, 31].
+ */
+static inline union mv mv_projection(const union mv mv, const int num, const int den) {
+    static const uint16_t div_mult[32] = {
+           0, 16384, 8192, 5461, 4096, 3276, 2730, 2340,
+        2048,  1820, 1638, 1489, 1365, 1260, 1170, 1092,
+        1024,   963,  910,  862,  819,  780,  744,  712,
+         682,   655,  630,  606,  585,  564,  546,  528
+    };
+    assert(den > 0 && den < 32);
+    assert(num > -32 && num < 32);
+    const int dm = div_mult[den];
+    const int y = mv.y * num * dm, x = mv.x * num * dm;
+    // (v >> 31) is -1 for negative v, which makes the rounding symmetric
+    return (union mv) { // 0x3fff == (1 << 14) - 1
+        .y = iclip((y + 8192 + (y >> 31)) >> 14, -0x3fff, 0x3fff),
+        .x = iclip((x + 8192 + (x >> 31)) >> 14, -0x3fff, 0x3fff)
+    };
+}
+
+/*
+ * Offer the projected motion vector stored in temporal block rb to the
+ * candidate list mvstack[0..*cnt-1] for the reference (pair) ref.
+ *
+ * rb->mv is scaled with mv_projection() by rf->pocdiff[] of each requested
+ * reference over rb->ref, and then passed through fix_mv_precision(). A
+ * candidate that is already in the list has its weight increased by 2;
+ * otherwise it is appended with weight 2, as long as the list holds fewer
+ * than 8 entries.
+ *
+ * For single prediction, if globalmv_ctx is non-NULL, *globalmv_ctx is set
+ * to whether the projected MV differs from gmv[0] by 16 or more in either
+ * component. gmv is only read in that case.
+ */
+static void add_temporal_candidate(const refmvs_frame *const rf,
+                                   refmvs_candidate *const mvstack, int *const cnt,
+                                   const refmvs_temporal_block *const rb,
+                                   const union refmvs_refpair ref, int *const globalmv_ctx,
+                                   const union mv gmv[])
+{
+    if (rb->mv.n == INVALID_MV) return;
+
+    union mv mv0 = mv_projection(rb->mv, rf->pocdiff[ref.ref[0] - 1], rb->ref);
+    fix_mv_precision(rf->frm_hdr, &mv0);
+
+    const int n_cands = *cnt;
+    int idx = 0;
+
+    if (ref.ref[1] != -1) {
+        // compound: project towards the second reference as well
+        union mv mv1 = mv_projection(rb->mv, rf->pocdiff[ref.ref[1] - 1], rb->ref);
+        fix_mv_precision(rf->frm_hdr, &mv1);
+
+        while (idx < n_cands &&
+               (mvstack[idx].mv[0].n != mv0.n || mvstack[idx].mv[1].n != mv1.n))
+        {
+            idx++;
+        }
+
+        if (idx < n_cands) {
+            mvstack[idx].weight += 2;
+        } else if (n_cands < 8) {
+            mvstack[n_cands].mv[0] = mv0;
+            mvstack[n_cands].mv[1] = mv1;
+            mvstack[n_cands].weight = 2;
+            *cnt = n_cands + 1;
+        }
+        return;
+    }
+
+    if (globalmv_ctx) {
+        const int dx = abs(mv0.x - gmv[0].x);
+        const int dy = abs(mv0.y - gmv[0].y);
+        *globalmv_ctx = (dx | dy) >= 16;
+    }
+
+    while (idx < n_cands && mvstack[idx].mv[0].n != mv0.n)
+        idx++;
+
+    if (idx < n_cands) {
+        mvstack[idx].weight += 2;
+    } else if (n_cands < 8) {
+        mvstack[n_cands].mv[0] = mv0;
+        mvstack[n_cands].weight = 2;
+        *cnt = n_cands + 1;
+    }
+}
+
+/*
+ * Collect fallback MVs for the compound reference pair ref from neighbour
+ * cand_b.
+ *
+ * same points at four candidate slots: same[0..1] and, aliased below as
+ * diff, same[2..3]. same_count[0..1] count the mv[0] and mv[1] entries
+ * filled in same[], and same_count[2..3] (aliased as diff_count) those filled
+ * in diff[]. Each of the four lists takes at most 2 entries.
+ *
+ * For each reference used by cand_b (stopping at the first one <= 0):
+ *  - if it equals ref.ref[0] or ref.ref[1], the MV is added to the same[]
+ *    list of that MV index, and also to the diff[] list of the other MV
+ *    index, negated if the sign bias of the candidate's reference differs
+ *    from sign1 or sign0 respectively;
+ *  - otherwise, the MV is added to both diff[] lists, negated for each list
+ *    whose sign (sign0 / sign1) differs from the sign bias of the
+ *    candidate's reference.
+ */
+static void add_compound_extended_candidate(refmvs_candidate *const same,
+                                            int *const same_count,
+                                            const refmvs_block *const cand_b,
+                                            const int sign0, const int sign1,
+                                            const union refmvs_refpair ref,
+                                            const uint8_t *const sign_bias)
+{
+    // the "different reference" lists live right behind the "same" ones
+    refmvs_candidate *const diff = &same[2];
+    int *const diff_count = &same_count[2];
+
+    for (int n = 0; n < 2; n++) {
+        const int cand_ref = cand_b->ref.ref[n];
+
+        // no (further) inter reference in this neighbour
+        if (cand_ref <= 0) break;
+
+        mv cand_mv = cand_b->mv[n];
+        if (cand_ref == ref.ref[0]) {
+            if (same_count[0] < 2)
+                same[same_count[0]++].mv[0] = cand_mv;
+            if (diff_count[1] < 2) {
+                // cand_mv is only negated after the same[] store above
+                if (sign1 ^ sign_bias[cand_ref - 1]) {
+                    cand_mv.y = -cand_mv.y;
+                    cand_mv.x = -cand_mv.x;
+                }
+                diff[diff_count[1]++].mv[1] = cand_mv;
+            }
+        } else if (cand_ref == ref.ref[1]) {
+            if (same_count[1] < 2)
+                same[same_count[1]++].mv[1] = cand_mv;
+            if (diff_count[0] < 2) {
+                if (sign0 ^ sign_bias[cand_ref - 1]) {
+                    cand_mv.y = -cand_mv.y;
+                    cand_mv.x = -cand_mv.x;
+                }
+                diff[diff_count[0]++].mv[0] = cand_mv;
+            }
+        } else {
+            // inverted copy, used for whichever list has the opposite sign
+            mv i_cand_mv = (union mv) {
+                .x = -cand_mv.x,
+                .y = -cand_mv.y
+            };
+
+            if (diff_count[0] < 2) {
+                diff[diff_count[0]++].mv[0] =
+                    sign0 ^ sign_bias[cand_ref - 1] ?
+                    i_cand_mv : cand_mv;
+            }
+
+            if (diff_count[1] < 2) {
+                diff[diff_count[1]++].mv[1] =
+                    sign1 ^ sign_bias[cand_ref - 1] ?
+                    i_cand_mv : cand_mv;
+            }
+        }
+    }
+}
+
+/*
+ * Add the MVs of neighbour cand_b, whichever references they use, as
+ * fallback candidates for single prediction.
+ *
+ * Each MV of cand_b (stopping at the first reference <= 0) is negated if the
+ * sign bias of its reference differs from sign, and is appended to mvstack
+ * with weight 2 unless an identical MV is already present. No upper bound on
+ * *cnt is checked here; the caller has to leave room for up to 2 additions.
+ */
+static void add_single_extended_candidate(refmvs_candidate mvstack[8], int *const cnt,
+                                          const refmvs_block *const cand_b,
+                                          const int sign, const uint8_t *const sign_bias)
+{
+    for (int n = 0; n < 2 && cand_b->ref.ref[n] > 0; n++) {
+        const int cand_ref = cand_b->ref.ref[n];
+        // we need to continue even if cand_ref == ref.ref[0], since
+        // the candidate could have been added as a globalmv variant,
+        // which changes the value
+        // FIXME if scan_{row,col}() returned a mask for the nearest
+        // edge, we could skip the appropriate ones here
+
+        mv cand_mv = cand_b->mv[n];
+        if (sign != sign_bias[cand_ref - 1]) {
+            cand_mv.y = -cand_mv.y;
+            cand_mv.x = -cand_mv.x;
+        }
+
+        const int n_cands = *cnt;
+        int is_dup = 0;
+        for (int m = 0; m < n_cands && !is_dup; m++)
+            is_dup = cand_mv.n == mvstack[m].mv[0].n;
+        if (is_dup) continue;
+
+        mvstack[n_cands].mv[0] = cand_mv;
+        mvstack[n_cands].weight = 2; // "minimal"
+        *cnt = n_cands + 1;
+    }
+}
+
+/*
+ * refmvs_frame allocates memory for one sbrow (32 blocks high, whole frame
+ * wide) of 4x4-resolution refmvs_block entries for spatial MV referencing.
+ * refmvs_tile.r[] keeps a list of 35 (32 + 3 above) pointers into this memory,
+ * and each sbrow, the bottom entries (y=27/29/31) are exchanged with the top
+ * (-5/-3/-1) pointers by calling dav1d_refmvs_tile_sbrow_init() at the start
+ * of each tile/sbrow.
+ *
+ * For temporal MV referencing, we call dav1d_refmvs_save_tmvs() at the end of
+ * each tile/sbrow (when tile column threading is enabled), or at the start of
+ * each interleaved sbrow (i.e. once for all tile columns together, when tile
+ * column threading is disabled). This will copy the 4x4-resolution spatial MVs
+ * into 8x8-resolution refmvs_temporal_block structures. Then, for subsequent
+ * frames, at the start of each tile/sbrow (when tile column threading is
+ * enabled) or at the start of each interleaved sbrow (when tile column
+ * threading is disabled), we call dav1d_refmvs_load_tmvs(), which will
+ * project the MVs to their respective position in the current frame.
+ */
+
+/*
+ * Build the list of reference MV candidates for the block of size bs at
+ * position (bx4, by4) that predicts from the reference (pair) ref.
+ *
+ * On return, mvstack[] holds *cnt candidates (at most 8; for ref.ref[0] > 0
+ * the first two entries are always filled in) and *ctx holds the derived
+ * context value. Candidates are gathered, in this order, from:
+ *  - the nearest top row, left column and top/right neighbour,
+ *  - the projected temporal MVs in rt->rp_proj (if rf->use_ref_frame_mvs),
+ *  - the top/left neighbour and the "secondary" rows/columns further away,
+ *  - neighbours using other references, if fewer than 2 were found.
+ * The nearest and the remaining candidates are each sorted by descending
+ * weight, and all MVs are clamped relative to the frame size at the end.
+ *
+ * For compound prediction (ref.ref[1] > 0), *ctx is derived from refmv_ctx
+ * and newmv_ctx via the switch at the end of that branch; otherwise it is
+ * (refmv_ctx << 4) | (globalmv_ctx << 3) | newmv_ctx.
+ */
+void dav1d_refmvs_find(const refmvs_tile *const rt,
+                       refmvs_candidate mvstack[8], int *const cnt,
+                       int *const ctx,
+                       const union refmvs_refpair ref, const enum BlockSize bs,
+                       const enum EdgeFlags edge_flags,
+                       const int by4, const int bx4)
+{
+    const refmvs_frame *const rf = rt->rf;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    // w4/h4: part of the block (capped at 16 units) that lies inside the tile
+    const int bw4 = b_dim[0], w4 = imin(imin(bw4, 16), rt->tile_col.end - bx4);
+    const int bh4 = b_dim[1], h4 = imin(imin(bh4, 16), rt->tile_row.end - by4);
+    // tgmv[] is the global MV itself (used for padding); gmv[] is only valid
+    // for warp types beyond translation and then overrides neighbour MVs
+    mv gmv[2], tgmv[2];
+
+    *cnt = 0;
+    assert(ref.ref[0] >=  0 && ref.ref[0] <= 8 &&
+           ref.ref[1] >= -1 && ref.ref[1] <= 8);
+    if (ref.ref[0] > 0) {
+        tgmv[0] = get_gmv_2d(&rf->frm_hdr->gmv[ref.ref[0] - 1],
+                             bx4, by4, bw4, bh4, rf->frm_hdr);
+        if (rf->frm_hdr->force_integer_mv)
+            fix_int_mv_precision(&tgmv[0]);
+        gmv[0] = rf->frm_hdr->gmv[ref.ref[0] - 1].type > DAV1D_WM_TYPE_TRANSLATION ?
+                 tgmv[0] : (mv) { .n = INVALID_MV };
+    } else {
+        tgmv[0] = (mv) { .n = 0 };
+        gmv[0] = (mv) { .n = INVALID_MV };
+    }
+    if (ref.ref[1] > 0) {
+        tgmv[1] = get_gmv_2d(&rf->frm_hdr->gmv[ref.ref[1] - 1],
+                             bx4, by4, bw4, bh4, rf->frm_hdr);
+        if (rf->frm_hdr->force_integer_mv)
+            fix_int_mv_precision(&tgmv[1]);
+        gmv[1] = rf->frm_hdr->gmv[ref.ref[1] - 1].type > DAV1D_WM_TYPE_TRANSLATION ?
+                 tgmv[1] : (mv) { .n = INVALID_MV };
+    }
+
+    // top
+    // n_rows/n_cols stay at ~0U if there is no top/left edge inside the tile;
+    // b_top/b_left are only valid when the respective counter is not ~0U
+    int have_newmv = 0, have_col_mvs = 0, have_row_mvs = 0;
+    unsigned max_rows = 0, n_rows = ~0U;
+    const refmvs_block *b_top;
+    if (by4 > rt->tile_row.start) {
+        max_rows = imin((by4 - rt->tile_row.start + 1) >> 1, 2 + (bh4 > 1));
+        b_top = &rt->r[(by4 & 31) - 1 + 5][bx4];
+        n_rows = scan_row(mvstack, cnt, ref, gmv, b_top,
+                          bw4, w4, max_rows, bw4 >= 16 ? 4 : 1,
+                          &have_newmv, &have_row_mvs);
+    }
+
+    // left
+    unsigned max_cols = 0, n_cols = ~0U;
+    refmvs_block *const *b_left;
+    if (bx4 > rt->tile_col.start) {
+        max_cols = imin((bx4 - rt->tile_col.start + 1) >> 1, 2 + (bw4 > 1));
+        b_left = &rt->r[(by4 & 31) + 5];
+        n_cols = scan_col(mvstack, cnt, ref, gmv, b_left,
+                          bh4, h4, bx4 - 1, max_cols, bh4 >= 16 ? 4 : 1,
+                          &have_newmv, &have_col_mvs);
+    }
+
+    // top/right
+    if (n_rows != ~0U && edge_flags & EDGE_I444_TOP_HAS_RIGHT &&
+        imax(bw4, bh4) <= 16 && bw4 + bx4 < rt->tile_col.end)
+    {
+        add_spatial_candidate(mvstack, cnt, 4, &b_top[bw4], ref, gmv,
+                              &have_newmv, &have_row_mvs);
+    }
+
+    // everything found so far counts as "nearest"; the weight bonus keeps
+    // these entries apart from the candidates added below
+    const int nearest_match = have_col_mvs + have_row_mvs;
+    const int nearest_cnt = *cnt;
+    for (int n = 0; n < nearest_cnt; n++)
+        mvstack[n].weight += 640;
+
+    // temporal
+    int globalmv_ctx = rf->frm_hdr->use_ref_frame_mvs;
+    if (rf->use_ref_frame_mvs) {
+        const ptrdiff_t stride = rf->rp_stride;
+        const int by8 = by4 >> 1, bx8 = bx4 >> 1;
+        const refmvs_temporal_block *const rbi = &rt->rp_proj[(by8 & 15) * stride + bx8];
+        const refmvs_temporal_block *rb = rbi;
+        const int step_h = bw4 >= 16 ? 2 : 1, step_v = bh4 >= 16 ? 2 : 1;
+        const int w8 = imin((w4 + 1) >> 1, 8), h8 = imin((h4 + 1) >> 1, 8);
+        for (int y = 0; y < h8; y += step_v) {
+            for (int x = 0; x < w8; x+= step_h) {
+                // only the top/left position updates globalmv_ctx
+                add_temporal_candidate(rf, mvstack, cnt, &rb[x], ref,
+                                       !(x | y) ? &globalmv_ctx : NULL, tgmv);
+            }
+            rb += stride * step_v;
+        }
+        if (imin(bw4, bh4) >= 2 && imax(bw4, bh4) < 16) {
+            const int bh8 = bh4 >> 1, bw8 = bw4 >> 1;
+            rb = &rbi[bh8 * stride];
+            const int has_bottom = by8 + bh8 < imin(rt->tile_row.end >> 1,
+                                                    (by8 & ~7) + 8);
+            if (has_bottom && bx8 - 1 >= imax(rt->tile_col.start >> 1, bx8 & ~7)) {
+                add_temporal_candidate(rf, mvstack, cnt, &rb[-1], ref,
+                                       NULL, NULL);
+            }
+            if (bx8 + bw8 < imin(rt->tile_col.end >> 1, (bx8 & ~7) + 8)) {
+                if (has_bottom) {
+                    add_temporal_candidate(rf, mvstack, cnt, &rb[bw8], ref,
+                                           NULL, NULL);
+                }
+                if (by8 + bh8 - 1 < imin(rt->tile_row.end >> 1, (by8 & ~7) + 8)) {
+                    add_temporal_candidate(rf, mvstack, cnt, &rb[bw8 - stride],
+                                           ref, NULL, NULL);
+                }
+            }
+        }
+    }
+    assert(*cnt <= 8);
+
+    // top/left (which, confusingly, is part of "secondary" references)
+    // The newmv flag of secondary references is never used, but
+    // add_spatial_candidate() OR's into it, so it has to be initialized.
+    int have_dummy_newmv_match = 0;
+    if ((n_rows | n_cols) != ~0U) {
+        add_spatial_candidate(mvstack, cnt, 4, &b_top[-1], ref, gmv,
+                              &have_dummy_newmv_match, &have_row_mvs);
+    }
+
+    // "secondary" (non-direct neighbour) top & left edges
+    // what is different about secondary is that everything is now in 8x8 resolution
+    for (int n = 2; n <= 3; n++) {
+        if ((unsigned) n > n_rows && (unsigned) n <= max_rows) {
+            n_rows += scan_row(mvstack, cnt, ref, gmv,
+                               &rt->r[(((by4 & 31) - 2 * n + 1) | 1) + 5][bx4 | 1],
+                               bw4, w4, 1 + max_rows - n, bw4 >= 16 ? 4 : 2,
+                               &have_dummy_newmv_match, &have_row_mvs);
+        }
+
+        if ((unsigned) n > n_cols && (unsigned) n <= max_cols) {
+            n_cols += scan_col(mvstack, cnt, ref, gmv, &rt->r[((by4 & 31) | 1) + 5],
+                               bh4, h4, (bx4 - n * 2 + 1) | 1,
+                               1 + max_cols - n, bh4 >= 16 ? 4 : 2,
+                               &have_dummy_newmv_match, &have_col_mvs);
+        }
+    }
+    assert(*cnt <= 8);
+
+    const int ref_match_count = have_col_mvs + have_row_mvs;
+
+    // context build-up
+    // nearest_match is the sum of two 0/1 flags, so one case always matches
+    int refmv_ctx, newmv_ctx;
+    switch (nearest_match) {
+    case 0:
+        refmv_ctx = imin(2, ref_match_count);
+        newmv_ctx = ref_match_count > 0;
+        break;
+    case 1:
+        refmv_ctx = imin(ref_match_count * 3, 4);
+        newmv_ctx = 3 - have_newmv;
+        break;
+    case 2:
+        refmv_ctx = 5;
+        newmv_ctx = 5 - have_newmv;
+        break;
+    }
+
+    // sorting (nearest, then "secondary")
+    // bubble sort by descending weight; "last" remembers the position of the
+    // final exchange so that each pass can stop earlier
+    int len = nearest_cnt;
+    while (len) {
+        int last = 0;
+        for (int n = 1; n < len; n++) {
+            if (mvstack[n - 1].weight < mvstack[n].weight) {
+#define EXCHANGE(a, b) do { refmvs_candidate tmp = a; a = b; b = tmp; } while (0)
+                EXCHANGE(mvstack[n - 1], mvstack[n]);
+                last = n;
+            }
+        }
+        len = last;
+    }
+    len = *cnt;
+    while (len > nearest_cnt) {
+        int last = nearest_cnt;
+        for (int n = nearest_cnt + 1; n < len; n++) {
+            if (mvstack[n - 1].weight < mvstack[n].weight) {
+                EXCHANGE(mvstack[n - 1], mvstack[n]);
+#undef EXCHANGE
+                last = n;
+            }
+        }
+        len = last;
+    }
+
+    if (ref.ref[1] > 0) {
+        if (*cnt < 2) {
+            const int sign0 = rf->sign_bias[ref.ref[0] - 1];
+            const int sign1 = rf->sign_bias[ref.ref[1] - 1];
+            const int sz4 = imin(w4, h4);
+            // the 4 slots behind the current list serve as scratch space:
+            // same[0..1] and diff[0..1] (== same[2..3])
+            refmvs_candidate *const same = &mvstack[*cnt];
+            int same_count[4] = { 0 };
+
+            // non-self references in top
+            if (n_rows != ~0U) for (int x = 0; x < sz4;) {
+                const refmvs_block *const cand_b = &b_top[x];
+                add_compound_extended_candidate(same, same_count, cand_b,
+                                                sign0, sign1, ref, rf->sign_bias);
+                x += dav1d_block_dimensions[cand_b->bs][0];
+            }
+
+            // non-self references in left
+            if (n_cols != ~0U) for (int y = 0; y < sz4;) {
+                const refmvs_block *const cand_b = &b_left[y][bx4 - 1];
+                add_compound_extended_candidate(same, same_count, cand_b,
+                                                sign0, sign1, ref, rf->sign_bias);
+                y += dav1d_block_dimensions[cand_b->bs][1];
+            }
+
+            refmvs_candidate *const diff = &same[2];
+            const int *const diff_count = &same_count[2];
+
+            // merge together
+            // per MV index: fill same[] up to 2 entries, first from diff[],
+            // then with the global MV
+            for (int n = 0; n < 2; n++) {
+                int m = same_count[n];
+
+                if (m >= 2) continue;
+
+                const int l = diff_count[n];
+                if (l) {
+                    same[m].mv[n] = diff[0].mv[n];
+                    if (++m == 2) continue;
+                    if (l == 2) {
+                        same[1].mv[n] = diff[1].mv[n];
+                        continue;
+                    }
+                }
+                do {
+                    same[m].mv[n] = tgmv[n];
+                } while (++m < 2);
+            }
+
+            // if the first extended was the same as the non-extended one,
+            // then replace it with the second extended one
+            int n = *cnt;
+            if (n == 1 && mvstack[0].mv[0].n == same[0].mv[0].n &&
+                mvstack[0].mv[1].n == same[0].mv[1].n)
+            {
+                mvstack[1].mv[0] = mvstack[2].mv[0];
+                mvstack[1].mv[1] = mvstack[2].mv[1];
+            }
+            do {
+                mvstack[n].weight = 2;
+            } while (++n < 2);
+            *cnt = 2;
+        }
+
+        // clamping
+        const int left = -(bx4 + bw4 + 4) * 4 * 8;
+        const int right = (rf->iw4 - bx4 + 4) * 4 * 8;
+        const int top = -(by4 + bh4 + 4) * 4 * 8;
+        const int bottom = (rf->ih4 - by4 + 4) * 4 * 8;
+
+        const int n_refmvs = *cnt;
+        int n = 0;
+        do {
+            mvstack[n].mv[0].x = iclip(mvstack[n].mv[0].x, left, right);
+            mvstack[n].mv[0].y = iclip(mvstack[n].mv[0].y, top, bottom);
+            mvstack[n].mv[1].x = iclip(mvstack[n].mv[1].x, left, right);
+            mvstack[n].mv[1].y = iclip(mvstack[n].mv[1].y, top, bottom);
+        } while (++n < n_refmvs);
+
+        // refmv_ctx is in [0, 5], so one case always matches
+        switch (refmv_ctx >> 1) {
+        case 0:
+            *ctx = imin(newmv_ctx, 1);
+            break;
+        case 1:
+            *ctx = 1 + imin(newmv_ctx, 3);
+            break;
+        case 2:
+            *ctx = iclip(3 + newmv_ctx, 4, 7);
+            break;
+        }
+
+        return;
+    } else if (*cnt < 2 && ref.ref[0] > 0) {
+        const int sign = rf->sign_bias[ref.ref[0] - 1];
+        const int sz4 = imin(w4, h4);
+
+        // non-self references in top
+        if (n_rows != ~0U) for (int x = 0; x < sz4 && *cnt < 2;) {
+            const refmvs_block *const cand_b = &b_top[x];
+            add_single_extended_candidate(mvstack, cnt, cand_b, sign, rf->sign_bias);
+            x += dav1d_block_dimensions[cand_b->bs][0];
+        }
+
+        // non-self references in left
+        if (n_cols != ~0U) for (int y = 0; y < sz4 && *cnt < 2;) {
+            const refmvs_block *const cand_b = &b_left[y][bx4 - 1];
+            add_single_extended_candidate(mvstack, cnt, cand_b, sign, rf->sign_bias);
+            y += dav1d_block_dimensions[cand_b->bs][1];
+        }
+    }
+    assert(*cnt <= 8);
+
+    // clamping
+    int n_refmvs = *cnt;
+    if (n_refmvs) {
+        const int left = -(bx4 + bw4 + 4) * 4 * 8;
+        const int right = (rf->iw4 - bx4 + 4) * 4 * 8;
+        const int top = -(by4 + bh4 + 4) * 4 * 8;
+        const int bottom = (rf->ih4 - by4 + 4) * 4 * 8;
+
+        int n = 0;
+        do {
+            mvstack[n].mv[0].x = iclip(mvstack[n].mv[0].x, left, right);
+            mvstack[n].mv[0].y = iclip(mvstack[n].mv[0].y, top, bottom);
+        } while (++n < n_refmvs);
+    }
+
+    // pad the list to 2 entries with the global MV
+    for (int n = *cnt; n < 2; n++)
+        mvstack[n].mv[0] = tgmv[0];
+
+    *ctx = (refmv_ctx << 4) | (globalmv_ctx << 3) | newmv_ctx;
+}
+
+/*
+ * Set up the row pointers and tile limits of rt for superblock row sby.
+ *
+ * The rf->sbsz rows of the superblock row are stored at rt->r[off + 5 ...],
+ * and three further rows from the same buffer at rt->r[off + 0/2/4] (slots
+ * off + 1 and off + 3 are set to NULL). For odd sby, those three pointers are
+ * exchanged with the ones at rt->r[off + sbsz + 0/2/4]. The tile end
+ * positions are clipped to the frame dimensions.
+ *
+ * With a single tile thread, tile_row_idx is ignored and the first set of
+ * buffers is always used.
+ */
+void dav1d_refmvs_tile_sbrow_init(refmvs_tile *const rt, const refmvs_frame *const rf,
+                                  const int tile_col_start4, const int tile_col_end4,
+                                  const int tile_row_start4, const int tile_row_end4,
+                                  const int sby, int tile_row_idx)
+{
+    if (rf->n_tile_threads == 1) tile_row_idx = 0;
+
+    rt->rf = rf;
+    rt->tile_col.start = tile_col_start4;
+    rt->tile_col.end = imin(tile_col_end4, rf->iw4);
+    rt->tile_row.start = tile_row_start4;
+    rt->tile_row.end = imin(tile_row_end4, rf->ih4);
+    rt->rp_proj = &rf->rp_proj[16 * rf->rp_stride * tile_row_idx];
+
+    const int sbsz = rf->sbsz;
+    const int off = (sbsz * sby) & 16;
+    refmvs_block **const rows = &rt->r[off];
+    refmvs_block *const base = &rf->r[35 * rf->r_stride * tile_row_idx];
+
+    // rows of the superblock row itself
+    for (int i = 0; i < sbsz; i++)
+        rows[5 + i] = base + i * rf->r_stride;
+
+    // the three extra rows directly follow those in the buffer
+    refmvs_block *const extra = base + sbsz * rf->r_stride;
+    rows[0] = extra;
+    rows[1] = NULL;
+    rows[2] = extra + rf->r_stride;
+    rows[3] = NULL;
+    rows[4] = extra + 2 * rf->r_stride;
+
+    if (sby & 1) {
+        for (int k = 0; k <= 4; k += 2) {
+            refmvs_block *const tmp = rows[k];
+            rows[k] = rows[sbsz + k];
+            rows[sbsz + k] = tmp;
+        }
+    }
+}
+
+/*
+ * Fill rf->rp_proj for the 8x8-resolution rows [row_start8, row_end8) and
+ * columns [col_start8, col_end8) by projecting the MVs saved with the
+ * reference frames (rf->rp_ref[]) to their position in the current frame.
+ *
+ * The area is first marked invalid (mv.n = INVALID_MV). Then, for each of
+ * the rf->n_mfmvs references, every stored block with a usable reference is
+ * projected with mv_projection(); the block's original MV and the
+ * corresponding rf->mfmv_ref2ref[] entry are written at the projected
+ * position, provided that position falls inside the allowed window. Later
+ * references overwrite earlier ones at the same position.
+ *
+ * Rows are stored at index (y & 15), so at most 16 rows can be handled per
+ * call. With a single tile thread, tile_row_idx is ignored and the first set
+ * of buffers is always used.
+ */
+void dav1d_refmvs_load_tmvs(const refmvs_frame *const rf, int tile_row_idx,
+                            const int col_start8, const int col_end8,
+                            const int row_start8, int row_end8)
+{
+    if (rf->n_tile_threads == 1) tile_row_idx = 0;
+    assert(row_start8 >= 0);
+    assert((unsigned) (row_end8 - row_start8) <= 16U);
+    row_end8 = imin(row_end8, rf->ih8);
+    // source columns: 8 extra on each side, since a block may be projected
+    // sideways into the target column range
+    const int col_start8i = imax(col_start8 - 8, 0);
+    const int col_end8i = imin(col_end8 + 8, rf->iw8);
+
+    // invalidate the target area
+    const ptrdiff_t stride = rf->rp_stride;
+    refmvs_temporal_block *rp_proj =
+        &rf->rp_proj[16 * stride * tile_row_idx + (row_start8 & 15) * stride];
+    for (int y = row_start8; y < row_end8; y++) {
+        for (int x = col_start8; x < col_end8; x++)
+            rp_proj[x].mv.n = INVALID_MV;
+        rp_proj += stride;
+    }
+
+    rp_proj = &rf->rp_proj[16 * stride * tile_row_idx];
+    for (int n = 0; n < rf->n_mfmvs; n++) {
+        const int ref2cur = rf->mfmv_ref2cur[n];
+        // INT_MIN marks an entry that is not to be used
+        if (ref2cur == INT_MIN) continue;
+
+        const int ref = rf->mfmv_ref[n];
+        // negative for ref < 4; its sign bit flips the projection direction
+        // in the apply_sign() calls below
+        const int ref_sign = ref - 4;
+        const refmvs_temporal_block *r = &rf->rp_ref[ref][row_start8 * stride];
+        for (int y = row_start8; y < row_end8; y++) {
+            // a block may only be projected to a row inside its own aligned
+            // group of 8 rows (and inside the requested row range)
+            const int y_sb_align = y & ~7;
+            const int y_proj_start = imax(y_sb_align, row_start8);
+            const int y_proj_end = imin(y_sb_align + 8, row_end8);
+            for (int x = col_start8i; x < col_end8i; x++) {
+                const refmvs_temporal_block *rb = &r[x];
+                const int b_ref = rb->ref;
+                // ref 0 means no MV was saved for this block
+                if (!b_ref) continue;
+                const int ref2ref = rf->mfmv_ref2ref[n][b_ref - 1];
+                if (!ref2ref) continue;
+                const mv b_mv = rb->mv;
+                const mv offset = mv_projection(b_mv, ref2cur, ref2ref);
+                int pos_x = x + apply_sign(abs(offset.x) >> 6,
+                                           offset.x ^ ref_sign);
+                const int pos_y = y + apply_sign(abs(offset.y) >> 6,
+                                                 offset.y ^ ref_sign);
+                // Both loops below also consume the following blocks in this
+                // row that have the same ref and MV, since they share the
+                // same projection offset.
+                if (pos_y >= y_proj_start && pos_y < y_proj_end) {
+                    const ptrdiff_t pos = (pos_y & 15) * stride;
+                    for (;;) {
+                        // horizontally, the target has to lie within 8
+                        // columns left of and 16 columns right of the aligned
+                        // source position, and inside the column range
+                        const int x_sb_align = x & ~7;
+                        if (pos_x >= imax(x_sb_align - 8, col_start8) &&
+                            pos_x < imin(x_sb_align + 16, col_end8))
+                        {
+                            rp_proj[pos + pos_x].mv = rb->mv;
+                            rp_proj[pos + pos_x].ref = ref2ref;
+                        }
+                        if (++x >= col_end8i) break;
+                        rb++;
+                        if (rb->ref != b_ref || rb->mv.n != b_mv.n) break;
+                        pos_x++;
+                    }
+                } else {
+                    // the target row is out of range: just skip the run
+                    for (;;) {
+                        if (++x >= col_end8i) break;
+                        rb++;
+                        if (rb->ref != b_ref || rb->mv.n != b_mv.n) break;
+                    }
+                }
+                // compensate for the x++ of the enclosing for loop
+                x--;
+            }
+            r += stride;
+        }
+    }
+}
+
+/*
+ * Save the MVs of the 8x8-resolution rows [row_start8, row_end8) and columns
+ * [col_start8, col_end8) from the 4x4-resolution rows in rt->r[] into rf->rp,
+ * for use as temporal references by later frames.
+ *
+ * Each 8x8 position is sampled at its odd 4x4 row and column. Of the
+ * sampled block's two MVs, the second one is preferred over the first; an MV
+ * is only saved if its reference is > 0, rf->mfmv_sign[] is set for that
+ * reference and both MV components are smaller than 4096 in magnitude.
+ * If neither qualifies, the entry's ref is set to 0 and its mv is left
+ * untouched. The result is replicated over the width of the sampled block.
+ */
+void dav1d_refmvs_save_tmvs(const refmvs_tile *const rt,
+                            const int col_start8, int col_end8,
+                            const int row_start8, int row_end8)
+{
+    const refmvs_frame *const rf = rt->rf;
+
+    assert(row_start8 >= 0);
+    assert((unsigned) (row_end8 - row_start8) <= 16U);
+    row_end8 = imin(row_end8, rf->ih8);
+    col_end8 = imin(col_end8, rf->iw8);
+
+    const ptrdiff_t stride = rf->rp_stride;
+    const uint8_t *const ref_sign = rf->mfmv_sign;
+    refmvs_temporal_block *rp = &rf->rp[row_start8 * stride];
+    for (int y = row_start8; y < row_end8; y++, rp += stride) {
+        const refmvs_block *const row = rt->r[6 + (y & 15) * 2];
+
+        int x = col_start8;
+        while (x < col_end8) {
+            const refmvs_block *const cand_b = &row[x * 2 + 1];
+            const int bw8 = (dav1d_block_dimensions[cand_b->bs][0] + 1) >> 1;
+            const int x_end = x + bw8;
+
+            // find the MV to save: try mv[1] first, then mv[0]
+            int which;
+            for (which = 1; which >= 0; which--) {
+                const int cand_ref = cand_b->ref.ref[which];
+                if (cand_ref > 0 && ref_sign[cand_ref - 1] &&
+                    (abs(cand_b->mv[which].y) | abs(cand_b->mv[which].x)) < 4096)
+                {
+                    break;
+                }
+            }
+
+            if (which >= 0) {
+                const refmvs_temporal_block tmpl = {
+                    .mv = cand_b->mv[which],
+                    .ref = cand_b->ref.ref[which],
+                };
+                for (; x < x_end; x++)
+                    rp[x] = tmpl;
+            } else {
+                for (; x < x_end; x++)
+                    rp[x].ref = 0; // "invalid"
+            }
+        }
+    }
+}
+
+// Per-frame setup of rf: stores frame dimensions, (re)allocates r/rp_proj
+// when their stride or the tile-row count changed, derives per-reference
+// sign and POC-distance tables, and picks up to 3 references for temporal
+// MV projection. Returns 0, or DAV1D_ERR(ENOMEM) if an allocation fails.
+int dav1d_refmvs_init_frame(refmvs_frame *const rf,
+                            const Dav1dSequenceHeader *const seq_hdr,
+                            const Dav1dFrameHeader *const frm_hdr,
+                            const unsigned ref_poc[7],
+                            refmvs_temporal_block *const rp,
+                            const unsigned ref_ref_poc[7][7],
+                            /*const*/ refmvs_temporal_block *const rp_ref[7],
+                            const int n_tile_threads)
+{
+    rf->sbsz = 16 << seq_hdr->sb128;
+    rf->frm_hdr = frm_hdr;
+    rf->iw8 = (frm_hdr->width[0] + 7) >> 3;
+    rf->ih8 = (frm_hdr->height + 7) >> 3;
+    rf->iw4 = rf->iw8 << 1;
+    rf->ih4 = rf->ih8 << 1;
+
+    // frame width rounded up to a multiple of 128 pixels, in 4x4 units
+    const ptrdiff_t r_stride = ((frm_hdr->width[0] + 127) & ~127) >> 2;
+    const int n_tile_rows = n_tile_threads > 1 ? frm_hdr->tiling.rows : 1;
+    if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) {
+        free(rf->r);
+        // Invalidate the cached stride first: if malloc() fails, rf->r is
+        // NULL and the next call must not mistake it for a usable buffer.
+        rf->r_stride = 0;
+        rf->r = malloc(sizeof(*rf->r) * 35 * r_stride * n_tile_rows);
+        if (!rf->r) return DAV1D_ERR(ENOMEM);
+        rf->r_stride = r_stride;
+    }
+
+    const ptrdiff_t rp_stride = r_stride >> 1;
+    if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) {
+        free(rf->rp_proj);
+        rf->rp_stride = 0; // as above: never keep a stride for a NULL buffer
+        rf->rp_proj = malloc(sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows);
+        if (!rf->rp_proj) return DAV1D_ERR(ENOMEM);
+        rf->rp_stride = rp_stride;
+    }
+    rf->n_tile_rows = n_tile_rows;
+    rf->n_tile_threads = n_tile_threads;
+    rf->rp = rp;
+    rf->rp_ref = rp_ref;
+    const unsigned poc = frm_hdr->frame_offset;
+    for (int i = 0; i < 7; i++) {
+        const int poc_diff = get_poc_diff(seq_hdr->order_hint_n_bits,
+                                          ref_poc[i], poc);
+        rf->sign_bias[i] = poc_diff > 0;
+        rf->mfmv_sign[i] = poc_diff < 0;
+        rf->pocdiff[i] = iclip(get_poc_diff(seq_hdr->order_hint_n_bits,
+                                            poc, ref_poc[i]), -31, 31);
+    }
+
+    // temporal MV setup
+    rf->n_mfmvs = 0;
+    if (frm_hdr->use_ref_frame_mvs && seq_hdr->order_hint_n_bits) {
+        int total = 2;
+        if (rp_ref[0] && ref_ref_poc[0][6] != ref_poc[3] /* alt-of-last != gold */) {
+            rf->mfmv_ref[rf->n_mfmvs++] = 0; // last
+            total = 3;
+        }
+        // sign_bias[i], set above, is get_poc_diff(.., ref_poc[i], poc) > 0
+        if (rp_ref[4] && rf->sign_bias[4])
+            rf->mfmv_ref[rf->n_mfmvs++] = 4; // bwd
+        if (rp_ref[5] && rf->sign_bias[5])
+            rf->mfmv_ref[rf->n_mfmvs++] = 5; // altref2
+        if (rf->n_mfmvs < total && rp_ref[6] && rf->sign_bias[6])
+            rf->mfmv_ref[rf->n_mfmvs++] = 6; // altref
+        if (rf->n_mfmvs < total && rp_ref[1])
+            rf->mfmv_ref[rf->n_mfmvs++] = 1; // last2
+
+        for (int n = 0; n < rf->n_mfmvs; n++) {
+            const unsigned rpoc = ref_poc[rf->mfmv_ref[n]];
+            const int diff1 = get_poc_diff(seq_hdr->order_hint_n_bits,
+                                           rpoc, frm_hdr->frame_offset);
+            if (abs(diff1) > 31) {
+                rf->mfmv_ref2cur[n] = INT_MIN;
+            } else {
+                rf->mfmv_ref2cur[n] = rf->mfmv_ref[n] < 4 ? -diff1 : diff1;
+                for (int m = 0; m < 7; m++) {
+                    const unsigned rrpoc = ref_ref_poc[rf->mfmv_ref[n]][m];
+                    const int diff2 = get_poc_diff(seq_hdr->order_hint_n_bits,
+                                                   rpoc, rrpoc);
+                    // unsigned comparison also catches the < 0 case
+                    rf->mfmv_ref2ref[n][m] = (unsigned) diff2 > 31U ? 0 : diff2;
+                }
+            }
+        }
+    }
+    rf->use_ref_frame_mvs = rf->n_mfmvs > 0;
+
+    return 0;
+}
+
+// No buffers yet: init_frame() allocates when it sees a stride change.
+void dav1d_refmvs_init(refmvs_frame *const rf) {
+    rf->r_stride = rf->rp_stride = 0;
+    rf->rp_proj = NULL;
+    rf->r = NULL;
+}
+
+void dav1d_refmvs_clear(refmvs_frame *const rf) { // release rf's buffers
+    free(rf->r); // free(NULL) is a no-op, so no guard is needed
+    free(rf->rp_proj);
+    dav1d_refmvs_init(rf); // drop dangling pointers and their stale strides
+}
--- /dev/null
+++ b/src/refmvs.h
@@ -1,0 +1,249 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * Copyright © 2020, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_REF_MVS_H
+#define DAV1D_SRC_REF_MVS_H
+
+#include <stdint.h>
+
+#include "dav1d/headers.h"
+
+#include "common/intops.h"
+
+#include "src/intra_edge.h"
+#include "src/levels.h"
+#include "src/tables.h"
+
+#define INVALID_MV 0x80008000
+
+typedef struct refmvs_temporal_block { // one saved MV per 8x8 unit
+    mv mv;
+    int8_t ref; // copy of a refmvs_block ref value (index + 1); 0 = "invalid"
+} refmvs_temporal_block;
+
+typedef union refmvs_refpair {
+    int8_t ref[2]; // ref index + 1; [0] = 0: intra, [1] = -1: no second ref
+    uint16_t pair; // both ref[] entries overlaid as one 16-bit value
+} refmvs_refpair;
+
+// would be nice to have a mvpair also, so double mv comparisons in
+// add_{spatial,temporal}_candidate() can be done in a single comparison,
+// but that would extend the size of refmvs_block to 16 bytes (from 12)
+// (on x86-64), which we probably don't want to do.
+
+typedef struct refmvs_block { // MV info of one 4x4 cell (see splat_*())
+    mv mv[2]; // mv[0]/mv[1] pair up with ref.ref[0]/ref.ref[1]
+    refmvs_refpair ref;
+    uint8_t bs, mf; // bs = enum BlockSize; mf: 1 = globalmv+affine, 2 = newmv
+} refmvs_block;
+
+typedef struct refmvs_frame { // per-frame state, see dav1d_refmvs_init_frame()
+    const Dav1dFrameHeader *frm_hdr;
+    int iw4, ih4, iw8, ih8; // *8: size in 8x8 units (rounded up); *4 = 2 x *8
+    int sbsz; // 16 << sb128
+    int use_ref_frame_mvs; // n_mfmvs > 0
+    uint8_t sign_bias[7], mfmv_sign[7]; // POC diff (ref vs. cur) > 0 / < 0
+    int8_t pocdiff[7]; // POC diff (cur vs. ref), clipped to [-31, 31]
+    uint8_t mfmv_ref[3]; // references picked for temporal MV projection
+    int mfmv_ref2cur[3]; // INT_MIN if the POC distance exceeds 31
+    int mfmv_ref2ref[3][7]; // POC distance if within [0, 31], else 0
+    int n_mfmvs; // number of valid entries in the mfmv_* arrays (0..3)
+
+    refmvs_temporal_block *rp; // written by dav1d_refmvs_save_tmvs()
+    /*const*/ refmvs_temporal_block *const *rp_ref;
+    refmvs_temporal_block *rp_proj; // 16 x rp_stride x n_tile_rows entries
+    ptrdiff_t rp_stride; // r_stride >> 1
+
+    refmvs_block *r; // 35 x r_stride x n_tile_rows entries
+    ptrdiff_t r_stride; // frame width rounded up to 128 pixels, in 4x4 units
+    int n_tile_rows, n_tile_threads;
+} refmvs_frame;
+
+typedef struct refmvs_tile { // set up by dav1d_refmvs_tile_sbrow_init()
+    const refmvs_frame *rf;
+    refmvs_block *r[32 + 5]; // row pointers; 4x4 row by4 is r[(by4 & 31) + 5]
+    refmvs_temporal_block *rp_proj;
+    struct {
+        int start, end;
+    } tile_col, tile_row; // tile bounds; presumably in 4x4 units (TODO confirm)
+} refmvs_tile;
+
+typedef struct refmvs_candidate { // entry of dav1d_refmvs_find()'s mvstack[]
+    mv mv[2];
+    int weight; // NOTE(review): presumably ranks the candidates - confirm
+} refmvs_candidate;
+
+// call once per frame thread: init before first use, clear frees the buffers
+void dav1d_refmvs_init(refmvs_frame *rf);
+void dav1d_refmvs_clear(refmvs_frame *rf);
+
+// call once per frame; returns 0 on success or DAV1D_ERR(ENOMEM)
+int dav1d_refmvs_init_frame(refmvs_frame *rf,
+                            const Dav1dSequenceHeader *seq_hdr,
+                            const Dav1dFrameHeader *frm_hdr,
+                            const unsigned ref_poc[7],
+                            refmvs_temporal_block *rp,
+                            const unsigned ref_ref_poc[7][7],
+                            /*const*/ refmvs_temporal_block *const rp_ref[7],
+                            int n_tile_threads);
+
+// initialize temporal MVs; this can be done in any configuration, e.g. one
+// tile/sbrow at a time, where col_{start,end}8 are the tile boundaries; or
+// it can just be for the whole frame's sbrow, where col_{start,end}8 are the
+// frame boundaries. row_{start,end}8 are the superblock row boundaries.
+void dav1d_refmvs_load_tmvs(const refmvs_frame *rf, int tile_row_idx,
+                            int col_start8, int col_end8,
+                            int row_start8, int row_end8);
+
+// cache the current tile/sbrow (or frame/sbrow)'s projectable motion vectors
+// into buffers for use in future frames' temporal MV prediction
+void dav1d_refmvs_save_tmvs(const refmvs_tile *rt,
+                            int col_start8, int col_end8,
+                            int row_start8, int row_end8);
+
+// initialize tile boundaries and refmvs_block pointers for one tile/sbrow
+void dav1d_refmvs_tile_sbrow_init(refmvs_tile *rt, const refmvs_frame *rf,
+                                  int tile_col_start4, int tile_col_end4,
+                                  int tile_row_start4, int tile_row_end4,
+                                  int sby, int tile_row_idx);
+
+// call for each block
+void dav1d_refmvs_find(const refmvs_tile *rt,
+                       refmvs_candidate mvstack[8], int *cnt,
+                       int *ctx, const refmvs_refpair ref, enum BlockSize bs,
+                       enum EdgeFlags edge_flags, int by4, int bx4);
+
+// Write a single-reference inter MV to every 4x4 cell the block covers.
+static inline void splat_oneref_mv(refmvs_tile *const rt,
+                                   const int by4, const int bx4,
+                                   const enum BlockSize bs,
+                                   const enum InterPredMode mode,
+                                   const int ref, const mv mv,
+                                   const int is_interintra)
+{
+    const int bw4 = dav1d_block_dimensions[bs][0];
+    const int bh4 = dav1d_block_dimensions[bs][1];
+    const int is_global = mode == GLOBALMV && imin(bw4, bh4) >= 2;
+    const int is_new = mode == NEWMV;
+
+    refmvs_block tmpl = { .bs = bs, .mf = is_global | (is_new * 2) };
+    tmpl.mv[0] = mv;
+    tmpl.ref.ref[0] = ref + 1; // stored with a +1 bias
+    tmpl.ref.ref[1] = is_interintra ? 0 : -1;
+
+    for (int y = 0; y < bh4; y++) {
+        refmvs_block *const dst = rt->r[(by4 & 31) + 5 + y] + bx4;
+        for (int x = 0; x < bw4; x++) dst[x] = tmpl;
+    }
+}
+
+// Write an intra block copy MV to every 4x4 cell the block covers; the
+// reference pair is { 0, -1 }, the same as splat_intraref() writes.
+static inline void splat_intrabc_mv(refmvs_tile *const rt,
+                                    const int by4, const int bx4,
+                                    const enum BlockSize bs, const mv mv)
+{
+    const int bw4 = dav1d_block_dimensions[bs][0];
+    const int bh4 = dav1d_block_dimensions[bs][1];
+    refmvs_block *const *const rows = &rt->r[(by4 & 31) + 5];
+
+    refmvs_block tmpl = { .bs = bs, .mf = 0 };
+    tmpl.ref.ref[0] = 0;
+    tmpl.ref.ref[1] = -1;
+    tmpl.mv[0] = mv;
+
+    for (int y = 0; y < bh4; y++) {
+        refmvs_block *const dst = rows[y] + bx4;
+        for (int x = 0; x < bw4; x++)
+            dst[x] = tmpl;
+    }
+}
+
+// Write a compound (two-reference) MV pair to every 4x4 cell of the block.
+static inline void splat_tworef_mv(refmvs_tile *const rt,
+                                   const int by4, const int bx4,
+                                   const enum BlockSize bs,
+                                   const enum CompInterPredMode mode,
+                                   const int ref1, const int ref2,
+                                   const mv mv[2])
+{
+    const int bw4 = dav1d_block_dimensions[bs][0];
+    const int bh4 = dav1d_block_dimensions[bs][1];
+    assert(bw4 >= 2 && bh4 >= 2);
+    // 0xbc: bitmask of the CompInterPredMode values that get the newmv flag
+    const int has_newmv = ((1 << mode) & 0xbc) != 0;
+    refmvs_block tmpl = { .bs = bs };
+    tmpl.mf = (mode == GLOBALMV_GLOBALMV) | (has_newmv * 2);
+    tmpl.mv[0] = mv[0];
+    tmpl.mv[1] = mv[1];
+    tmpl.ref.ref[0] = ref1 + 1;
+    tmpl.ref.ref[1] = ref2 + 1;
+    for (int y = 0; y < bh4; y++) {
+        refmvs_block *const dst = rt->r[(by4 & 31) + 5 + y] + bx4;
+        for (int x = 0; x < bw4; x++) dst[x] = tmpl;
+    }
+}
+
+// Mark every 4x4 cell the block covers as intra: refs { 0, -1 } and
+// INVALID_MV in mv[0].
+static inline void splat_intraref(refmvs_tile *const rt,
+                                  const int by4, const int bx4,
+                                  const enum BlockSize bs)
+{
+    const int bw4 = dav1d_block_dimensions[bs][0];
+    const int bh4 = dav1d_block_dimensions[bs][1];
+    refmvs_block *const *const rows = &rt->r[(by4 & 31) + 5];
+
+    refmvs_block tmpl = { .bs = bs, .mf = 0 };
+    tmpl.mv[0].n = INVALID_MV;
+    tmpl.ref.ref[0] = 0;
+    tmpl.ref.ref[1] = -1;
+
+    for (int y = 0; y < bh4; y++) {
+        refmvs_block *const dst = rows[y] + bx4;
+        for (int x = 0; x < bw4; x++)
+            dst[x] = tmpl;
+    }
+}
+
+static inline void fix_int_mv_precision(mv *const mv) { // round to multiple of 8
+    mv->x = (mv->x - (mv->x >> 15) + 3) & ~7U; // v >> 15: -1 for negative 16-bit
+    mv->y = (mv->y - (mv->y >> 15) + 3) & ~7U; // v, so ties round toward zero
+}
+
+// fix_int_mv_precision() if force_integer_mv, else clear bit 0 unless hp.
+static inline void fix_mv_precision(const Dav1dFrameHeader *const hdr,
+                                    mv *const mv)
+{
+    if (hdr->force_integer_mv) { fix_int_mv_precision(mv); return; }
+    if (hdr->hp) return; // keep the MV untouched
+    // negative values get +1 first, so odd values move toward zero
+    mv->x = (mv->x - (mv->x >> 15)) & ~1U;
+    mv->y = (mv->y - (mv->y >> 15)) & ~1U;
+}
+
+#endif /* DAV1D_SRC_REF_MVS_H */
--- a/src/warpmv.c
+++ b/src/warpmv.c
@@ -130,6 +130,22 @@
     return iclip(v2, 0xe001, 0x11fff);
 }
 
+// Recompute wm->matrix[0] and [1] from mv and the block position/size;
+// matrix[2..5] are read but left untouched.
+void dav1d_set_affine_mv2d(const int bw4, const int bh4,
+                           const mv mv, Dav1dWarpedMotionParams *const wm,
+                           const int bx4, const int by4)
+{
+    int32_t *const mat = wm->matrix;
+    // block origin scaled by 4, plus (2 * size - 1)
+    const int isux = bx4 * 4 + (2 * bw4 - 1);
+    const int isuy = by4 * 4 + (2 * bh4 - 1);
+    const int m0 = mv.x * 0x2000 - (isux * (mat[2] - 0x10000) + isuy * mat[3]);
+    const int m1 = mv.y * 0x2000 - (isux * mat[4] + isuy * (mat[5] - 0x10000));
+    mat[0] = iclip(m0, -0x800000, 0x7fffff);
+    mat[1] = iclip(m1, -0x800000, 0x7fffff);
+}
+
 int dav1d_find_affine_int(const int (*pts)[2][2], const int np,
                           const int bw4, const int bh4,
                           const mv mv, Dav1dWarpedMotionParams *const wm,
--- a/src/warpmv.h
+++ b/src/warpmv.h
@@ -32,6 +32,8 @@
 
 int dav1d_get_shear_params(Dav1dWarpedMotionParams *wm);
 int dav1d_find_affine_int(const int (*pts)[2][2], int np, int bw4, int bh4,
-                          mv mv, Dav1dWarpedMotionParams *wm, int by, int bx);
+                          mv mv, Dav1dWarpedMotionParams *wm, int bx, int by);
+void dav1d_set_affine_mv2d(int bw4, int bh4,
+                           mv mv, Dav1dWarpedMotionParams *wm, int bx, int by);
 
 #endif /* DAV1D_SRC_WARPMV_H */