shithub: dav1d

ref: 58fc51659634b48026da97eced714d214c97857a
parent: 8b8e9fe85f6875a86ed66726e8964450a318cdc6
author: Henrik Gramner <gramner@twoorioles.com>
date: Fri Nov 9 15:18:18 EST 2018

Split MC blend

The mstride == 0, mstride == 1, and mstride == w cases are very different
from each other, and splitting them into separate functions makes it easier
to optimize them.

Also add some further optimizations to the AVX2 asm that became possible
after this change.
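
For reference, a minimal C sketch (not part of the patch) of how the three former mstride cases map onto the new entry points, following the blend_px macro, the blend_v_c/blend_h_c reference code, and the recon_tmpl.c call sites changed below. It assumes 8bpc pixels and a plain byte stride instead of the pixel/PXSTRIDE templating used in mc_tmpl.c, and a caller-supplied obmc_masks table laid out like dav1d_obmc_masks.

    /* Old mstride-based dispatch vs. the new entry points:
     *   mstride == w  ->  blend    (full per-pixel mask, e.g. inter-intra/wedge)
     *   mstride == 0  ->  blend_v  (one weight per column, OBMC left overlap)
     *   mstride == 1  ->  blend_h  (one weight per row, OBMC top overlap)
     */
    #include <stddef.h>
    #include <stdint.h>

    #define blend_px(a, b, m) ((((a) * (64 - (m)) + (b) * (m)) + 32) >> 6)

    /* blend_v case: the same row of w weights (obmc_masks[w..2*w-1]) is
     * reused for every line, i.e. the weight depends only on the column. */
    static void blend_v_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                               const uint8_t *tmp, int w, int h,
                               const uint8_t *obmc_masks)
    {
        const uint8_t *const mask = &obmc_masks[w]; /* fixed per-column weights */
        do {
            for (int x = 0; x < w; x++)
                dst[x] = (uint8_t)blend_px(dst[x], tmp[x], mask[x]);
            dst += dst_stride;
            tmp += w;
        } while (--h);
    }

The blend_h case is the mirror image: one weight per row, taken from obmc_masks[h..], applied across the whole line, as in blend_h_c below.
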

--- a/src/mc.h
+++ b/src/mc.h
@@ -81,11 +81,14 @@
 typedef decl_w_mask_fn(*w_mask_fn);
 
 #define decl_blend_fn(name) \
-void (name)(pixel *dst, ptrdiff_t dst_stride, \
-            const pixel *tmp, int w, int h, \
-            const uint8_t *mask, ptrdiff_t mstride)
+void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, \
+            int w, int h, const uint8_t *mask)
 typedef decl_blend_fn(*blend_fn);
 
+#define decl_blend_dir_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, int w, int h)
+typedef decl_blend_dir_fn(*blend_dir_fn);
+
 #define decl_emu_edge_fn(name) \
 void (name)(intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih, intptr_t x, intptr_t y, \
             pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride)
@@ -99,6 +102,8 @@
     mask_fn mask;
     w_mask_fn w_mask[3 /* 444, 422, 420 */];
     blend_fn blend;
+    blend_dir_fn blend_v;
+    blend_dir_fn blend_h;
     warp8x8_fn warp8x8;
     warp8x8t_fn warp8x8t;
     emu_edge_fn emu_edge;
--- a/src/mc_tmpl.c
+++ b/src/mc_tmpl.c
@@ -373,21 +373,48 @@
     } while (--h);
 }
 
-static void blend_c(pixel *dst, const ptrdiff_t dst_stride,
-                    const pixel *tmp, const int w, const int h,
-                    const uint8_t *mask, const ptrdiff_t m_stride)
+#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
+static NOINLINE void
+blend_internal_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                 const int w, int h, const uint8_t *mask,
+                 const ptrdiff_t mask_stride)
 {
-    for (int y = 0; y < h; y++) {
+    do {
         for (int x = 0; x < w; x++) {
-#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
-            dst[x] = blend_px(dst[x], tmp[x], mask[m_stride == 1 ? 0 : x]);
+            dst[x] = blend_px(dst[x], tmp[x], mask[x]);
         }
         dst += PXSTRIDE(dst_stride);
         tmp += w;
-        mask += m_stride;
-    }
+        mask += mask_stride;
+    } while (--h);
 }
 
+static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                    const int w, const int h, const uint8_t *mask)
+{
+    blend_internal_c(dst, dst_stride, tmp, w, h, mask, w);
+}
+
+static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                      const int w, const int h)
+{
+    blend_internal_c(dst, dst_stride, tmp, w, h, &dav1d_obmc_masks[w], 0);
+}
+
+static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                      const int w, int h)
+{
+    const uint8_t *mask = &dav1d_obmc_masks[h];
+    do {
+        const int m = *mask++;
+        for (int x = 0; x < w; x++) {
+            dst[x] = blend_px(dst[x], tmp[x], m);
+        }
+        dst += PXSTRIDE(dst_stride);
+        tmp += w;
+    } while (--h);
+}
+
 static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
                      const coef *tmp1, const coef *tmp2, const int w, int h,
                      uint8_t *mask, const int sign,
@@ -591,6 +618,8 @@
     c->w_avg    = w_avg_c;
     c->mask     = mask_c;
     c->blend    = blend_c;
+    c->blend_v  = blend_v_c;
+    c->blend_h  = blend_h_c;
     c->w_mask[0] = w_mask_444_c;
     c->w_mask[1] = w_mask_422_c;
     c->w_mask[2] = w_mask_420_c;
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -579,9 +579,8 @@
                          &f->refp[a_r->ref[0] - 1],
                          dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
                 if (res) return res;
-                f->dsp->mc.blend(&dst[x * h_mul], dst_stride, lap,
-                                 h_mul * ow4, v_mul * oh4,
-                                 &dav1d_obmc_masks[v_mul * oh4], 1);
+                f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
+                                   h_mul * ow4, v_mul * oh4);
                 i++;
             }
             x += imax(a_b_dim[0], 2);
@@ -603,9 +602,8 @@
                          &f->refp[l_r->ref[0] - 1],
                          dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
                 if (res) return res;
-                f->dsp->mc.blend(&dst[y * v_mul * PXSTRIDE(dst_stride)],
-                                 dst_stride, lap, h_mul * ow4, v_mul * oh4,
-                                 &dav1d_obmc_masks[h_mul * ow4], 0);
+                f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
+                                   dst_stride, lap, h_mul * ow4, v_mul * oh4);
                 i++;
             }
             y += imax(l_b_dim[1], 2);
@@ -1144,7 +1142,7 @@
                      dav1d_ii_masks[bs][0][b->interintra_mode] :
                      dav1d_wedge_masks[bs][0][0][b->wedge_idx];
             dsp->mc.blend(dst, f->cur.p.stride[0], tmp,
-                          bw4 * 4, bh4 * 4, ii_mask, bw4 * 4);
+                          bw4 * 4, bh4 * 4, ii_mask);
         }
 
         if (!has_chroma) goto skip_inter_chroma_pred;
@@ -1277,7 +1275,7 @@
                     dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
                                              tl_edge, cbw4 * 4, cbh4 * 4, 0);
                     dsp->mc.blend(uvdst, f->cur.p.stride[1], tmp,
-                                  cbw4 * 4, cbh4 * 4, ii_mask, cbw4 * 4);
+                                  cbw4 * 4, cbh4 * 4, ii_mask);
                 }
             }
         }
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -30,6 +30,23 @@
 
 SECTION_RODATA 32
 
+; dav1d_obmc_masks[] with 64-x interleaved
+obmc_masks: db  0,  0,  0,  0
+            ; 2
+            db 45, 19, 64,  0
+            ; 4
+            db 39, 25, 50, 14, 59,  5, 64,  0
+            ; 8
+            db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
+            ; 16
+            db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+            db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
+            ; 32
+            db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+            db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
+            db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
+            db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0
+
 warp_8x8_shufA: db 0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8
                 db 4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12
 warp_8x8_shufB: db 2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10
@@ -42,10 +59,9 @@
 bilin_h_shuf4:  db 1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
 bilin_h_shuf8:  db 1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
 deint_shuf4:    db 0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
+blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
 
-blend_shuf: ; bits 0-3: 0, 0, 0, 0, 1, 1, 1, 1
 pb_64:   times 4 db 64
-         times 4 db 1
 pw_8:    times 2 dw 8
 pw_26:   times 2 dw 26
 pw_34:   times 2 dw 34
@@ -61,7 +77,7 @@
 cextern mc_subpel_filters
 %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
 
-%macro BIDIR_JMP_TABLE 1-* 4, 8, 16, 32, 64, 128
+%macro BIDIR_JMP_TABLE 1-*
     %xdefine %1_table (%%table - 2*%2)
     %xdefine %%base %1_table
     %xdefine %%prefix mangle(private_prefix %+ _%1)
@@ -72,11 +88,13 @@
     %endrep
 %endmacro
 
-BIDIR_JMP_TABLE avg_avx2
-BIDIR_JMP_TABLE w_avg_avx2
-BIDIR_JMP_TABLE mask_avx2
-BIDIR_JMP_TABLE w_mask_420_avx2
-BIDIR_JMP_TABLE blend_avx2, 2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE avg_avx2,        4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg_avx2,      4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask_avx2,       4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend_avx2,      4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32
 
 %macro BASE_JMP_TABLE 3-*
     %xdefine %1_%2_table (%%table - %3)
@@ -3286,7 +3304,7 @@
     jg .w128_loop
     RET
 
-cglobal blend, 3, 7, 6, dst, ds, tmp, w, h, mask, ms
+cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
 %define base r6-blend_avx2_table
     lea                  r6, [blend_avx2_table]
     tzcnt                wd, wm
@@ -3296,55 +3314,125 @@
     vpbroadcastd         m4, [base+pb_64]
     vpbroadcastd         m5, [base+pw_512]
     add                  wq, r6
-    mov                 msq, msmp
+    lea                  r6, [dsq*3]
     jmp                  wq
-.w2:
-    cmp                 msq, 1
-    jb .w2_s0
-    je .w2_s1
-.w2_s2:
-    movd                xm1, [maskq]
+.w4:
     movd                xm0, [dstq+dsq*0]
-    pinsrw              xm0, [dstq+dsq*1], 1
-    psubb               xm2, xm4, xm1
-    punpcklbw           xm2, xm1
-    movd                xm1, [tmpq]
-    add               maskq, 2*2
-    add                tmpq, 2*2
-    punpcklbw           xm0, xm1
+    pinsrd              xm0, [dstq+dsq*1], 1
+    vpbroadcastd        xm1, [dstq+dsq*2]
+    pinsrd              xm1, [dstq+r6   ], 3
+    mova                xm6, [maskq]
+    psubb               xm3, xm4, xm6
+    punpcklbw           xm2, xm3, xm6
+    punpckhbw           xm3, xm6
+    mova                xm6, [tmpq]
+    add               maskq, 4*4
+    add                tmpq, 4*4
+    punpcklbw           xm0, xm6
+    punpckhbw           xm1, xm6
     pmaddubsw           xm0, xm2
+    pmaddubsw           xm1, xm3
     pmulhrsw            xm0, xm5
-    packuswb            xm0, xm0
-    pextrw     [dstq+dsq*0], xm0, 0
-    pextrw     [dstq+dsq*1], xm0, 1
-    lea                dstq, [dstq+dsq*2]
-    sub                  hd, 2
-    jg .w2_s2
+    pmulhrsw            xm1, xm5
+    packuswb            xm0, xm1
+    movd       [dstq+dsq*0], xm0
+    pextrd     [dstq+dsq*1], xm0, 1
+    pextrd     [dstq+dsq*2], xm0, 2
+    pextrd     [dstq+r6   ], xm0, 3
+    lea                dstq, [dstq+dsq*4]
+    sub                  hd, 4
+    jg .w4
     RET
-.w2_s1:
-    movd                xm1, [maskq]
-    movd                xm0, [dstq+dsq*0]
-    psubb               xm2, xm4, xm1
-    punpcklbw           xm2, xm1
-    pinsrw              xm0, [dstq+dsq*1], 1
-    movd                xm1, [tmpq]
-    punpcklwd           xm2, xm2
-    add               maskq, 2
-    add                tmpq, 2*2
-    punpcklbw           xm0, xm1
-    pmaddubsw           xm0, xm2
-    pmulhrsw            xm0, xm5
-    packuswb            xm0, xm0
-    pextrw     [dstq+dsq*0], xm0, 0
-    pextrw     [dstq+dsq*1], xm0, 1
+ALIGN function_align
+.w8:
+    movq                xm1, [dstq+dsq*0]
+    movhps              xm1, [dstq+dsq*1]
+    vpbroadcastq         m2, [dstq+dsq*2]
+    vpbroadcastq         m3, [dstq+r6   ]
+    mova                 m0, [maskq]
+    mova                 m6, [tmpq]
+    add               maskq, 8*4
+    add                tmpq, 8*4
+    vpblendd             m1, m2, 0x30
+    vpblendd             m1, m3, 0xc0
+    psubb                m3, m4, m0
+    punpcklbw            m2, m3, m0
+    punpckhbw            m3, m0
+    punpcklbw            m0, m1, m6
+    punpckhbw            m1, m6
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    vextracti128        xm1, m0, 1
+    movq       [dstq+dsq*0], xm0
+    movhps     [dstq+dsq*1], xm0
+    movq       [dstq+dsq*2], xm1
+    movhps     [dstq+r6   ], xm1
+    lea                dstq, [dstq+dsq*4]
+    sub                  hd, 4
+    jg .w8
+    RET
+ALIGN function_align
+.w16:
+    mova                 m0, [maskq]
+    mova                xm1, [dstq+dsq*0]
+    vinserti128          m1, [dstq+dsq*1], 1
+    psubb                m3, m4, m0
+    punpcklbw            m2, m3, m0
+    punpckhbw            m3, m0
+    mova                 m6, [tmpq]
+    add               maskq, 16*2
+    add                tmpq, 16*2
+    punpcklbw            m0, m1, m6
+    punpckhbw            m1, m6
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    mova         [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
-    jg .w2_s1
+    jg .w16
     RET
-.w2_s0:
-    vpbroadcastw        xm0, [maskq]
-    psubb               xm4, xm0
-    punpcklbw           xm4, xm0
+ALIGN function_align
+.w32:
+    mova                 m0, [maskq]
+    mova                 m1, [dstq]
+    mova                 m6, [tmpq]
+    add               maskq, 32
+    add                tmpq, 32
+    psubb                m3, m4, m0
+    punpcklbw            m2, m3, m0
+    punpckhbw            m3, m0
+    punpcklbw            m0, m1, m6
+    punpckhbw            m1, m6
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    mova             [dstq], m0
+    add                dstq, dsq
+    dec                  hd
+    jg .w32
+    RET
+
+cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_avx2_table
+    lea                  r5, [blend_v_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, dword [r5+wq*4]
+    vpbroadcastd         m5, [base+pw_512]
+    add                  wq, r5
+    add               maskq, obmc_masks-blend_v_avx2_table
+    jmp                  wq
+.w2:
+    vpbroadcastd        xm2, [maskq+2*2]
 .w2_s0_loop:
     movd                xm0, [dstq+dsq*0]
     pinsrw              xm0, [dstq+dsq*1], 1
@@ -3351,7 +3439,7 @@
     movd                xm1, [tmpq]
     add                tmpq, 2*2
     punpcklbw           xm0, xm1
-    pmaddubsw           xm0, xm4
+    pmaddubsw           xm0, xm2
     pmulhrsw            xm0, xm5
     packuswb            xm0, xm0
     pextrw     [dstq+dsq*0], xm0, 0
@@ -3362,17 +3450,11 @@
     RET
 ALIGN function_align
 .w4:
-    cmp                 msq, 1
-    jb .w4_s0
-    je .w4_s1
-.w4_s4:
-    movq                xm1, [maskq]
+    vpbroadcastq        xm2, [maskq+4*2]
+.w4_loop:
     movd                xm0, [dstq+dsq*0]
     pinsrd              xm0, [dstq+dsq*1], 1
-    psubb               xm2, xm4, xm1
-    punpcklbw           xm2, xm1
     movq                xm1, [tmpq]
-    add               maskq, 4*2
     add                tmpq, 4*2
     punpcklbw           xm0, xm1
     pmaddubsw           xm0, xm2
@@ -3382,116 +3464,19 @@
     pextrd     [dstq+dsq*1], xm0, 1
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
-    jg .w4_s4
+    jg .w4_loop
     RET
-.w4_s1:
-    movq                xm3, [blend_shuf]
-.w4_s1_loop:
-    movd                xm1, [maskq]
-    movd                xm0, [dstq+dsq*0]
-    pshufb              xm1, xm3
-    psubb               xm2, xm4, xm1
-    pinsrd              xm0, [dstq+dsq*1], 1
-    punpcklbw           xm2, xm1
-    movq                xm1, [tmpq]
-    add               maskq, 2
-    add                tmpq, 4*2
-    punpcklbw           xm0, xm1
-    pmaddubsw           xm0, xm2
-    pmulhrsw            xm0, xm5
-    packuswb            xm0, xm0
-    movd       [dstq+dsq*0], xm0
-    pextrd     [dstq+dsq*1], xm0, 1
-    lea                dstq, [dstq+dsq*2]
-    sub                  hd, 2
-    jg .w4_s1_loop
-    RET
-.w4_s0:
-    vpbroadcastd        xm0, [maskq]
-    psubb               xm4, xm0
-    punpcklbw           xm4, xm0
-.w4_s0_loop:
-    movd                xm0, [dstq+dsq*0]
-    pinsrd              xm0, [dstq+dsq*1], 1
-    movq                xm1, [tmpq]
-    add                tmpq, 4*2
-    punpcklbw           xm0, xm1
-    pmaddubsw           xm0, xm4
-    pmulhrsw            xm0, xm5
-    packuswb            xm0, xm0
-    movd       [dstq+dsq*0], xm0
-    pextrd     [dstq+dsq*1], xm0, 1
-    lea                dstq, [dstq+dsq*2]
-    sub                  hd, 2
-    jg .w4_s0_loop
-    RET
 ALIGN function_align
 .w8:
-    cmp                 msq, 1
-    jb .w8_s0
-    je .w8_s1
-.w8_s8:
-    movq                xm1, [maskq+8*1]
-    vinserti128          m1, [maskq+8*0], 1
+    vbroadcasti128       m4, [maskq+8*2]
+.w8_loop:
     vpbroadcastq         m2, [dstq+dsq*0]
     movq                xm0, [dstq+dsq*1]
     vpblendd             m0, m2, 0x30
-    psubb                m2, m4, m1
-    punpcklbw            m2, m1
     movq                xm1, [tmpq+8*1]
     vinserti128          m1, [tmpq+8*0], 1
-    add               maskq, 8*2
     add                tmpq, 8*2
     punpcklbw            m0, m1
-    pmaddubsw            m0, m2
-    pmulhrsw             m0, m5
-    vextracti128        xm1, m0, 1
-    packuswb            xm0, xm1
-    movhps     [dstq+dsq*0], xm0
-    movq       [dstq+dsq*1], xm0
-    lea                dstq, [dstq+dsq*2]
-    sub                  hd, 2
-    jg .w8_s8
-    RET
-.w8_s1:
-    vpbroadcastd         m0, [blend_shuf+0]
-    vpbroadcastd        xm3, [blend_shuf+4]
-    vpblendd             m3, m0, 0xf0
-.w8_s1_loop:
-    vpbroadcastd         m0, [maskq]
-    vpbroadcastq         m1, [dstq+dsq*0]
-    pshufb               m0, m3
-    psubb                m2, m4, m0
-    punpcklbw            m2, m0
-    movq                xm0, [dstq+dsq*1]
-    vpblendd             m0, m1, 0x30
-    movq                xm1, [tmpq+8*1]
-    vinserti128          m1, [tmpq+8*0], 1
-    add               maskq, 2
-    add                tmpq, 8*2
-    punpcklbw            m0, m1
-    pmaddubsw            m0, m2
-    pmulhrsw             m0, m5
-    vextracti128        xm1, m0, 1
-    packuswb            xm0, xm1
-    movhps     [dstq+dsq*0], xm0
-    movq       [dstq+dsq*1], xm0
-    lea                dstq, [dstq+dsq*2]
-    sub                  hd, 2
-    jg .w8_s1_loop
-    RET
-.w8_s0:
-    vpbroadcastq         m0, [maskq]
-    psubb                m4, m0
-    punpcklbw            m4, m0
-.w8_s0_loop:
-    vpbroadcastq         m2, [dstq+dsq*0]
-    movq                xm0, [dstq+dsq*1]
-    vpblendd             m0, m2, 0x30
-    movq                xm1, [tmpq+8*1]
-    vinserti128          m1, [tmpq+8*0], 1
-    add                tmpq, 8*2
-    punpcklbw            m0, m1
     pmaddubsw            m0, m4
     pmulhrsw             m0, m5
     vextracti128        xm1, m0, 1
@@ -3500,28 +3485,21 @@
     movq       [dstq+dsq*1], xm0
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
-    jg .w8_s0_loop
+    jg .w8_loop
     RET
 ALIGN function_align
 .w16:
-    cmp                 msq, 1
-    jb .w16_s0
-    WIN64_SPILL_XMM       7
-    je .w16_s1
-.w16_s16:
-    mova                 m0, [maskq]
+    vbroadcasti128       m3, [maskq+16*2]
+    vbroadcasti128       m4, [maskq+16*3]
+.w16_loop:
     mova                xm1, [dstq+dsq*0]
     vinserti128          m1, [dstq+dsq*1], 1
-    psubb                m3, m4, m0
-    punpcklbw            m2, m3, m0
-    punpckhbw            m3, m0
-    mova                 m6, [tmpq]
-    add               maskq, 16*2
+    mova                 m2, [tmpq]
     add                tmpq, 16*2
-    punpcklbw            m0, m1, m6
-    punpckhbw            m1, m6
-    pmaddubsw            m0, m2
-    pmaddubsw            m1, m3
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+    pmaddubsw            m0, m3
+    pmaddubsw            m1, m4
     pmulhrsw             m0, m5
     pmulhrsw             m1, m5
     packuswb             m0, m1
@@ -3529,51 +3507,119 @@
     vextracti128 [dstq+dsq*1], m0, 1
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
-    jg .w16_s16
+    jg .w16_loop
     RET
-.w16_s1:
-    vpbroadcastd        xm6, [blend_shuf]
-    vpbroadcastd         m0, [blend_shuf+4]
-    vpblendd             m6, m0, 0xf0
-.w16_s1_loop:
-    vpbroadcastd         m2, [maskq]
-    mova                xm1, [dstq+dsq*0]
-    pshufb               m2, m6
-    psubb                m3, m4, m2
-    vinserti128          m1, [dstq+dsq*1], 1
-    punpcklbw            m3, m2
+ALIGN function_align
+.w32:
+    mova                xm3, [maskq+16*4]
+    vinserti128          m3, [maskq+16*6], 1
+    mova                xm4, [maskq+16*5]
+    vinserti128          m4, [maskq+16*7], 1
+.w32_loop:
+    mova                 m1, [dstq]
     mova                 m2, [tmpq]
-    add               maskq, 2
-    add                tmpq, 16*2
+    add                tmpq, 32
     punpcklbw            m0, m1, m2
     punpckhbw            m1, m2
     pmaddubsw            m0, m3
-    pmaddubsw            m1, m3
+    pmaddubsw            m1, m4
     pmulhrsw             m0, m5
     pmulhrsw             m1, m5
     packuswb             m0, m1
-    mova         [dstq+dsq*0], xm0
-    vextracti128 [dstq+dsq*1], m0, 1
+    mova             [dstq], m0
+    add                dstq, dsq
+    dec                  hd
+    jg .w32_loop
+    RET
+
+cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_h_avx2_table
+    lea                  r5, [blend_h_avx2_table]
+    mov                 r6d, wd
+    tzcnt                wd, wd
+    mov                  hd, hm
+    movsxd               wq, dword [r5+wq*4]
+    vpbroadcastd         m5, [base+pw_512]
+    add                  wq, r5
+    lea               maskq, [base+obmc_masks+hq*4]
+    neg                  hq
+    jmp                  wq
+.w2:
+    movd                xm0, [dstq+dsq*0]
+    pinsrw              xm0, [dstq+dsq*1], 1
+    movd                xm2, [maskq+hq*2]
+    movd                xm1, [tmpq]
+    add                tmpq, 2*2
+    punpcklwd           xm2, xm2
+    punpcklbw           xm0, xm1
+    pmaddubsw           xm0, xm2
+    pmulhrsw            xm0, xm5
+    packuswb            xm0, xm0
+    pextrw     [dstq+dsq*0], xm0, 0
+    pextrw     [dstq+dsq*1], xm0, 1
     lea                dstq, [dstq+dsq*2]
-    sub                  hd, 2
-    jg .w16_s1_loop
+    add                  hq, 2
+    jl .w2
     RET
-.w16_s0:
-    %assign stack_offset stack_offset - stack_size_padded
-    WIN64_SPILL_XMM       6
-    vbroadcasti128       m0, [maskq]
-    psubb                m4, m0
-    punpcklbw            m3, m4, m0
-    punpckhbw            m4, m0
-.w16_s0_loop:
+ALIGN function_align
+.w4:
+    mova                xm3, [blend_shuf]
+.w4_loop:
+    movd                xm0, [dstq+dsq*0]
+    pinsrd              xm0, [dstq+dsq*1], 1
+    movq                xm2, [maskq+hq*2]
+    movq                xm1, [tmpq]
+    add                tmpq, 4*2
+    pshufb              xm2, xm3
+    punpcklbw           xm0, xm1
+    pmaddubsw           xm0, xm2
+    pmulhrsw            xm0, xm5
+    packuswb            xm0, xm0
+    movd       [dstq+dsq*0], xm0
+    pextrd     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    add                  hq, 2
+    jl .w4_loop
+    RET
+ALIGN function_align
+.w8:
+    vbroadcasti128       m4, [blend_shuf]
+    shufpd               m4, m4, 0x03
+.w8_loop:
+    vpbroadcastq         m1, [dstq+dsq*0]
+    movq                xm0, [dstq+dsq*1]
+    vpblendd             m0, m1, 0x30
+    vpbroadcastd         m3, [maskq+hq*2]
+    movq                xm1, [tmpq+8*1]
+    vinserti128          m1, [tmpq+8*0], 1
+    add                tmpq, 8*2
+    pshufb               m3, m4
+    punpcklbw            m0, m1
+    pmaddubsw            m0, m3
+    pmulhrsw             m0, m5
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    movhps     [dstq+dsq*0], xm0
+    movq       [dstq+dsq*1], xm0
+    lea                dstq, [dstq+dsq*2]
+    add                  hq, 2
+    jl .w8_loop
+    RET
+ALIGN function_align
+.w16:
+    vbroadcasti128       m4, [blend_shuf]
+    shufpd               m4, m4, 0x0c
+.w16_loop:
     mova                xm1, [dstq+dsq*0]
     vinserti128          m1, [dstq+dsq*1], 1
+    vpbroadcastd         m3, [maskq+hq*2]
     mova                 m2, [tmpq]
     add                tmpq, 16*2
+    pshufb               m3, m4
     punpcklbw            m0, m1, m2
     punpckhbw            m1, m2
     pmaddubsw            m0, m3
-    pmaddubsw            m1, m4
+    pmaddubsw            m1, m3
     pmulhrsw             m0, m5
     pmulhrsw             m1, m5
     packuswb             m0, m1
@@ -3580,60 +3626,17 @@
     mova         [dstq+dsq*0], xm0
     vextracti128 [dstq+dsq*1], m0, 1
     lea                dstq, [dstq+dsq*2]
-    sub                  hd, 2
-    jg .w16_s0_loop
+    add                  hq, 2
+    jl .w16_loop
     RET
 ALIGN function_align
-.w32:
-    mov                  wd, 32
-    jmp .w32_start
-.w64:
-    mov                  wd, 64
-    jmp .w32_start
-.w128:
-    mov                  wd, 128
-.w32_start:
-    WIN64_SPILL_XMM       7
-    cmp                 msq, 1
-    jb .w32_s0
-    je .w32_s1
-    sub                 dsq, wq
-.w32_s32:
-    mov                 r6d, wd
-.w32_s32_loop:
-    mova                 m0, [maskq]
+.w32: ; w32/w64/w128
+    sub                 dsq, r6
+.w32_loop0:
+    vpbroadcastw         m3, [maskq+hq*2]
+    mov                  wd, r6d
+.w32_loop:
     mova                 m1, [dstq]
-    psubb                m3, m4, m0
-    punpcklbw            m2, m3, m0
-    punpckhbw            m3, m0
-    mova                 m6, [tmpq]
-    add               maskq, 32
-    add                tmpq, 32
-    punpcklbw            m0, m1, m6
-    punpckhbw            m1, m6
-    pmaddubsw            m0, m2
-    pmaddubsw            m1, m3
-    pmulhrsw             m0, m5
-    pmulhrsw             m1, m5
-    packuswb             m0, m1
-    mova             [dstq], m0
-    add                dstq, 32
-    sub                 r6d, 32
-    jg .w32_s32_loop
-    add                dstq, dsq
-    dec                  hd
-    jg .w32_s32
-    RET
-.w32_s1:
-    sub                 dsq, wq
-.w32_s1_loop0:
-    vpbroadcastb         m0, [maskq]
-    mov                 r6d, wd
-    inc               maskq
-    psubb                m3, m4, m0
-    punpcklbw            m3, m0
-.w32_s1_loop:
-    mova                 m1, [dstq]
     mova                 m2, [tmpq]
     add                tmpq, 32
     punpcklbw            m0, m1, m2
@@ -3645,49 +3648,11 @@
     packuswb             m0, m1
     mova             [dstq], m0
     add                dstq, 32
-    sub                 r6d, 32
-    jg .w32_s1_loop
+    sub                  wd, 32
+    jg .w32_loop
     add                dstq, dsq
-    dec                  hd
-    jg .w32_s1_loop0
-    RET
-.w32_s0:
-%if WIN64
-    PUSH                 r7
-    PUSH                 r8
-    %define regs_used 9
-%endif
-    lea                 r6d, [hq+wq*8-256]
-    mov                  r7, dstq
-    mov                  r8, tmpq
-.w32_s0_loop0:
-    mova                 m0, [maskq]
-    add               maskq, 32
-    psubb                m3, m4, m0
-    punpcklbw            m2, m3, m0
-    punpckhbw            m3, m0
-.w32_s0_loop:
-    mova                 m1, [dstq]
-    mova                 m6, [tmpq]
-    add                tmpq, wq
-    punpcklbw            m0, m1, m6
-    punpckhbw            m1, m6
-    pmaddubsw            m0, m2
-    pmaddubsw            m1, m3
-    pmulhrsw             m0, m5
-    pmulhrsw             m1, m5
-    packuswb             m0, m1
-    mova             [dstq], m0
-    add                dstq, dsq
-    dec                  hd
-    jg .w32_s0_loop
-    add                  r7, 32
-    add                  r8, 32
-    mov                dstq, r7
-    mov                tmpq, r8
-    mov                  hb, r6b
-    sub                 r6d, 256
-    jg .w32_s0_loop0
+    inc                  hq
+    jl .w32_loop0
     RET
 
 cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -55,6 +55,8 @@
 decl_mask_fn(dav1d_mask_avx2);
 decl_w_mask_fn(dav1d_w_mask_420_avx2);
 decl_blend_fn(dav1d_blend_avx2);
+decl_blend_dir_fn(dav1d_blend_v_avx2);
+decl_blend_dir_fn(dav1d_blend_h_avx2);
 
 decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
 decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
@@ -98,6 +100,8 @@
     c->mask = dav1d_mask_avx2;
     c->w_mask[2] = dav1d_w_mask_420_avx2;
     c->blend = dav1d_blend_avx2;
+    c->blend_v = dav1d_blend_v_avx2;
+    c->blend_h = dav1d_blend_h_avx2;
 
     c->warp8x8  = dav1d_warp_affine_8x8_avx2;
     c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
--- a/tests/checkasm/mc.c
+++ b/tests/checkasm/mc.c
@@ -237,40 +237,95 @@
 }
 
 static void check_blend(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, tmp, 128 * 32,);
-    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
-    ALIGN_STK_32(uint8_t, mask, 128 * 32,);
+    ALIGN_STK_32(pixel, tmp, 32 * 32,);
+    ALIGN_STK_32(pixel, c_dst, 32 * 32,);
+    ALIGN_STK_32(pixel, a_dst, 32 * 32,);
+    ALIGN_STK_32(uint8_t, mask, 32 * 32,);
 
-    for (int i = 0; i < 128 * 32; i++) {
+    for (int i = 0; i < 32 * 32; i++) {
         tmp[i] = rand() & ((1 << BITDEPTH) - 1);
         mask[i] = rand() % 65;
     }
 
     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
-                 int w, int h, const uint8_t *mask, ptrdiff_t mstride);
+                 int w, int h, const uint8_t *mask);
 
-    for (int w = 2; w <= 128; w <<= 1) {
+    for (int w = 4; w <= 32; w <<= 1) {
         const ptrdiff_t dst_stride = w * sizeof(pixel);
-        const int h_min = (w == 128) ? 4 : 2;
-        const int h_max = (w > 32) ? 32 : (w == 2) ? 64 : 128;
-        for (int ms = 0; ms <= w; ms += ms ? w - 1 : 1)
-            if (check_func(c->blend, "blend_w%d_ms%d_%dbpc", w, ms, BITDEPTH))
-                for (int h = h_min; h <= h_max; h <<= 1) {
-                    for (int i = 0; i < w * h; i++)
-                        c_dst[i] = a_dst[i] = rand() & ((1 << BITDEPTH) - 1);
+        if (check_func(c->blend, "blend_w%d_%dbpc", w, BITDEPTH))
+            for (int h = imax(w / 2, 4); h <= imin(w * 2, 32); h <<= 1) {
+                for (int i = 0; i < w * h; i++)
+                    c_dst[i] = a_dst[i] = rand() & ((1 << BITDEPTH) - 1);
 
-                    call_ref(c_dst, dst_stride, tmp, w, h, mask, ms);
-                    call_new(a_dst, dst_stride, tmp, w, h, mask, ms);
-                    if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)))
-                        fail();
+                call_ref(c_dst, dst_stride, tmp, w, h, mask);
+                call_new(a_dst, dst_stride, tmp, w, h, mask);
+                if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)))
+                    fail();
 
-                    bench_new(a_dst, dst_stride, tmp, w, h, mask, ms);
-                }
+                bench_new(a_dst, dst_stride, tmp, w, h, mask);
+            }
     }
     report("blend");
 }
 
+static void check_blend_v(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_32(pixel, tmp,   32 * 128,);
+    ALIGN_STK_32(pixel, c_dst, 32 * 128,);
+    ALIGN_STK_32(pixel, a_dst, 32 * 128,);
+
+    for (int i = 0; i < 32 * 128; i++)
+        tmp[i] = rand() & ((1 << BITDEPTH) - 1);
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
+                 int w, int h);
+
+    for (int w = 2; w <= 32; w <<= 1) {
+        const ptrdiff_t dst_stride = w * sizeof(pixel);
+        if (check_func(c->blend_v, "blend_v_w%d_%dbpc", w, BITDEPTH))
+            for (int h = 2; h <= (w == 2 ? 64 : 128); h <<= 1) {
+                for (int i = 0; i < w * h; i++)
+                    c_dst[i] = a_dst[i] = rand() & ((1 << BITDEPTH) - 1);
+
+                call_ref(c_dst, dst_stride, tmp, w, h);
+                call_new(a_dst, dst_stride, tmp, w, h);
+                if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)))
+                    fail();
+
+                bench_new(a_dst, dst_stride, tmp, w, h);
+            }
+    }
+    report("blend_v");
+}
+
+static void check_blend_h(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_32(pixel, tmp,   128 * 32,);
+    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
+    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
+
+    for (int i = 0; i < 128 * 32; i++)
+        tmp[i] = rand() & ((1 << BITDEPTH) - 1);
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
+                 int w, int h);
+
+    for (int w = 2; w <= 128; w <<= 1) {
+        const ptrdiff_t dst_stride = w * sizeof(pixel);
+        if (check_func(c->blend_h, "blend_h_w%d_%dbpc", w, BITDEPTH))
+            for (int h = (w == 128 ? 4 : 2); h <= 32; h <<= 1) {
+                for (int i = 0; i < w * h; i++)
+                    c_dst[i] = a_dst[i] = rand() & ((1 << BITDEPTH) - 1);
+
+                call_ref(c_dst, dst_stride, tmp, w, h);
+                call_new(a_dst, dst_stride, tmp, w, h);
+                if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)))
+                    fail();
+
+                bench_new(a_dst, dst_stride, tmp, w, h);
+            }
+    }
+    report("blend_h");
+}
+
 static void check_warp8x8(Dav1dMCDSPContext *const c) {
     ALIGN_STK_32(pixel, src_buf, 15 * 15,);
     ALIGN_STK_32(pixel, c_dst,    8 *  8,);
@@ -430,6 +485,8 @@
     check_mask(&c);
     check_w_mask(&c);
     check_blend(&c);
+    check_blend_v(&c);
+    check_blend_h(&c);
     check_warp8x8(&c);
     check_warp8x8t(&c);
     check_emuedge(&c);