shithub: dav1d

ref: 5173de3032572e7aabdfacf0c0fac2a9a65fab8c
parent: 50e876c60efd251ab7a5af36b0d39572dcce627c
author: Henrik Gramner <gramner@twoorioles.com>
date: Sun Sep 13 14:10:07 EDT 2020

x86: Add misc mc asm tweaks
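
For reference, the put_bilin/prep_bilin paths touched below implement the
bilinear filter spelled out in the in-file formula comments, e.g.

    ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4

A minimal scalar C sketch of that math (helper names are illustrative
only, not dav1d API):

    #include <stdint.h>

    /* horizontal pass: ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 */
    uint8_t bilin_h_px(const uint8_t *src, int x, int mx)
    {
        return (uint8_t)(((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4);
    }

    /* hv pass: h0/h1 are the horizontal outputs of two consecutive rows,
     * still carrying 4 fractional bits, hence the total shift by 8:
     * (16 * h0 + my * (h1 - h0) + 128) >> 8 */
    uint8_t bilin_hv_px(int h0, int h1, int my)
    {
        return (uint8_t)((16 * h0 + my * (h1 - h0) + 128) >> 8);
    }

In the SIMD hv paths the trailing "+ 8) >> 4" rounding is now expressed
with pavgw against the new pw_15 constant followed by psrlw by 4, in
place of the previous pmulhrsw against pw_2048.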

--- a/src/x86/mc_avx2.asm
+++ b/src/x86/mc_avx2.asm
@@ -59,8 +59,8 @@
 subpel_v_shuf4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
 subpel_s_shuf2: db  0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
 subpel_s_shuf8: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
-bilin_h_shuf4:  db  1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
-bilin_h_shuf8:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+bilin_h_shuf4:  db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
+bilin_h_shuf8:  db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
 bilin_v_shuf4:  db  4,  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7
 deint_shuf4:    db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
 blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
@@ -76,6 +76,7 @@
 
 pb_64:   times 4 db 64
 pw_m256: times 2 dw -256
+pw_15:   times 2 dw 15
 pw_32:   times 2 dw 32
 pw_34:   times 2 dw 34
 pw_258:  times 2 dw 258
@@ -201,10 +202,9 @@
 SECTION .text
 
 INIT_XMM avx2
-DECLARE_REG_TMP 4, 6, 7
 cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
     movifnidn          mxyd, r6m ; mx
-    lea                  t2, [put_avx2]
+    lea                  r7, [put_avx2]
     tzcnt                wd, wm
     movifnidn            hd, hm
     test               mxyd, mxyd
@@ -213,35 +213,35 @@
     test               mxyd, mxyd
     jnz .v
 .put:
-    movzx                wd, word [t2+wq*2+table_offset(put,)]
-    add                  wq, t2
+    movzx                wd, word [r7+wq*2+table_offset(put,)]
+    add                  wq, r7
     jmp                  wq
 .put_w2:
-    movzx               t0d, word [srcq+ssq*0]
-    movzx               t1d, word [srcq+ssq*1]
+    movzx               r6d, word [srcq+ssq*0]
+    movzx               r7d, word [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    mov        [dstq+dsq*0], t0w
-    mov        [dstq+dsq*1], t1w
+    mov        [dstq+dsq*0], r6w
+    mov        [dstq+dsq*1], r7w
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .put_w2
     RET
 .put_w4:
-    mov                 t0d, [srcq+ssq*0]
-    mov                 t1d, [srcq+ssq*1]
+    mov                 r6d, [srcq+ssq*0]
+    mov                 r7d, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    mov        [dstq+dsq*0], t0d
-    mov        [dstq+dsq*1], t1d
+    mov        [dstq+dsq*0], r6d
+    mov        [dstq+dsq*1], r7d
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .put_w4
     RET
 .put_w8:
-    mov                  t0, [srcq+ssq*0]
-    mov                  t1, [srcq+ssq*1]
+    mov                  r6, [srcq+ssq*0]
+    mov                  r7, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    mov        [dstq+dsq*0], t0
-    mov        [dstq+dsq*1], t1
+    mov        [dstq+dsq*0], r6
+    mov        [dstq+dsq*1], r7
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .put_w8
@@ -298,17 +298,17 @@
 .h:
     ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
     ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
-    imul               mxyd, 0xff01
+    imul               mxyd, 255
     vbroadcasti128       m4, [bilin_h_shuf8]
-    add                mxyd, 16 << 8
+    add                mxyd, 16
     movd                xm5, mxyd
     mov                mxyd, r7m ; my
     vpbroadcastw         m5, xm5
     test               mxyd, mxyd
     jnz .hv
-    movzx                wd, word [t2+wq*2+table_offset(put, _bilin_h)]
+    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_h)]
     vpbroadcastd         m3, [pw_2048]
-    add                  wq, t2
+    add                  wq, r7
     jmp                  wq
 .h_w2:
     movd                xm0, [srcq+ssq*0]
@@ -419,10 +419,10 @@
     jg .h_w64
     RET
 .h_w128:
-    mov                  t1, -32*3
+    mov                  r6, -32*3
 .h_w128_loop:
-    movu                 m0, [srcq+t1+32*3+8*0]
-    movu                 m1, [srcq+t1+32*3+8*1]
+    movu                 m0, [srcq+r6+32*3+8*0]
+    movu                 m1, [srcq+r6+32*3+8*1]
     pshufb               m0, m4
     pshufb               m1, m4
     pmaddubsw            m0, m5
@@ -430,8 +430,8 @@
     pmulhrsw             m0, m3
     pmulhrsw             m1, m3
     packuswb             m0, m1
-    mova     [dstq+t1+32*3], m0
-    add                  t1, 32
+    mova     [dstq+r6+32*3], m0
+    add                  r6, 32
     jle .h_w128_loop
     add                srcq, ssq
     add                dstq, dsq
@@ -439,11 +439,11 @@
     jg .h_w128
     RET
 .v:
-    movzx                wd, word [t2+wq*2+table_offset(put, _bilin_v)]
-    imul               mxyd, 0xff01
+    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+    imul               mxyd, 255
     vpbroadcastd         m5, [pw_2048]
-    add                mxyd, 16 << 8
-    add                  wq, t2
+    add                mxyd, 16
+    add                  wq, r7
     movd                xm4, mxyd
     vpbroadcastw         m4, xm4
     jmp                  wq
@@ -454,7 +454,7 @@
     lea                srcq,      [srcq+ssq*2]
     pinsrw              xm0, xm1, [srcq+ssq*0], 0 ; 2 1
     pshuflw             xm1, xm1, q2301           ; 1 0
-    punpcklbw           xm1, xm0, xm1
+    punpcklbw           xm1, xm0
     pmaddubsw           xm1, xm4
     pmulhrsw            xm1, xm5
     packuswb            xm1, xm1
@@ -467,11 +467,11 @@
 .v_w4:
     movd                xm0, [srcq+ssq*0]
 .v_w4_loop:
-    vpbroadcastd        xm1, [srcq+ssq*1]
+    vpbroadcastd        xm2, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    vpblendd            xm2, xm1, xm0, 0x01 ; 0 1
+    vpblendd            xm1, xm2, xm0, 0x01 ; 0 1
     vpbroadcastd        xm0, [srcq+ssq*0]
-    vpblendd            xm1, xm0, 0x02      ; 1 2
+    vpblendd            xm2, xm0, 0x02      ; 1 2
     punpcklbw           xm1, xm2
     pmaddubsw           xm1, xm4
     pmulhrsw            xm1, xm5
@@ -485,11 +485,11 @@
 .v_w8:
     movq                xm0, [srcq+ssq*0]
 .v_w8_loop:
-    movq                xm3, [srcq+ssq*1]
+    movq                xm2, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    punpcklbw           xm1, xm3, xm0
+    punpcklbw           xm1, xm0, xm2
     movq                xm0, [srcq+ssq*0]
-    punpcklbw           xm2, xm0, xm3
+    punpcklbw           xm2, xm0
     pmaddubsw           xm1, xm4
     pmaddubsw           xm2, xm4
     pmulhrsw            xm1, xm5
@@ -504,11 +504,11 @@
 .v_w16:
     movu                xm0, [srcq+ssq*0]
 .v_w16_loop:
-    vbroadcasti128       m2, [srcq+ssq*1]
+    vbroadcasti128       m3, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    vpblendd             m3, m2, m0, 0x0f ; 0 1
+    vpblendd             m2, m3, m0, 0x0f ; 0 1
     vbroadcasti128       m0, [srcq+ssq*0]
-    vpblendd             m2, m0, 0xf0     ; 1 2
+    vpblendd             m3, m0, 0xf0     ; 1 2
     punpcklbw            m1, m2, m3
     punpckhbw            m2, m3
     pmaddubsw            m1, m4
@@ -528,8 +528,8 @@
 %%loop:
     movu                 m3, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    punpcklbw            m1, m3, m0
-    punpckhbw            m2, m3, m0
+    punpcklbw            m1, m0, m3
+    punpckhbw            m2, m0, m3
     movu                 m0, [srcq+ssq*0]
     pmaddubsw            m1, m4
     pmaddubsw            m2, m4
@@ -536,15 +536,15 @@
     pmulhrsw             m1, m5
     pmulhrsw             m2, m5
     packuswb             m1, m2
-    mova       [dstq+dsq*0], m1
-    punpcklbw            m1, m0, m3
-    punpckhbw            m2, m0, m3
-    pmaddubsw            m1, m4
+    punpcklbw            m2, m3, m0
+    punpckhbw            m3, m0
     pmaddubsw            m2, m4
-    pmulhrsw             m1, m5
+    pmaddubsw            m3, m4
     pmulhrsw             m2, m5
-    packuswb             m1, m2
-    mova       [dstq+dsq*1], m1
+    pmulhrsw             m3, m5
+    packuswb             m2, m3
+    mova       [dstq+dsq*0], m1
+    mova       [dstq+dsq*1], m2
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg %%loop
@@ -557,8 +557,8 @@
 .v_w64_loop:
     add                srcq, ssq
     movu                 m3, [srcq+32*0]
-    punpcklbw            m2, m3, m0
-    punpckhbw            m0, m3, m0
+    punpcklbw            m2, m0, m3
+    punpckhbw            m0, m3
     pmaddubsw            m2, m4
     pmaddubsw            m0, m4
     pmulhrsw             m2, m5
@@ -567,8 +567,8 @@
     mova                 m0, m3
     movu                 m3, [srcq+32*1]
     mova        [dstq+32*0], m2
-    punpcklbw            m2, m3, m1
-    punpckhbw            m1, m3, m1
+    punpcklbw            m2, m1, m3
+    punpckhbw            m1, m3
     pmaddubsw            m2, m4
     pmaddubsw            m1, m4
     pmulhrsw             m2, m5
@@ -581,28 +581,29 @@
     jg .v_w64_loop
     RET
 .v_w128:
-    mov                  t0, dstq
-    mov                  t1, srcq
-    lea                 t2d, [hq+(3<<8)]
+    lea                 r6d, [hq+(3<<8)]
+    mov                  r4, srcq
+    mov                  r7, dstq
 .v_w128_loop:
     PUT_BILIN_V_W32
-    movzx                hd, t2b
-    add                  t0, 32
-    add                  t1, 32
-    mov                dstq, t0
-    mov                srcq, t1
-    sub                 t2d, 1<<8
+    add                  r4, 32
+    add                  r7, 32
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
+    sub                 r6d, 1<<8
     jg .v_w128_loop
     RET
 .hv:
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
     ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
-    movzx                wd, word [t2+wq*2+table_offset(put, _bilin_hv)]
+    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
     WIN64_SPILL_XMM       8
     shl                mxyd, 11 ; can't shift by 12 due to signed overflow
-    vpbroadcastd         m7, [pw_2048]
+    vpbroadcastd         m7, [pw_15]
     movd                xm6, mxyd
-    add                  wq, t2
+    add                  wq, r7
+    paddb                m5, m5
     vpbroadcastw         m6, xm6
     jmp                  wq
 .hv_w2:
@@ -618,10 +619,10 @@
     shufps              xm2, xm0, xm1, q1032 ; 0 _ 1 _
     mova                xm0, xm1
     psubw               xm1, xm2
-    paddw               xm1, xm1
     pmulhw              xm1, xm6
+    pavgw               xm2, xm7
     paddw               xm1, xm2
-    pmulhrsw            xm1, xm7
+    psrlw               xm1, 4
     packuswb            xm1, xm1
     pextrw     [dstq+dsq*0], xm1, 0
     pextrw     [dstq+dsq*1], xm1, 2
@@ -643,10 +644,10 @@
     shufps              xm2, xm0, xm1, q1032 ; 0 1
     mova                xm0, xm1
     psubw               xm1, xm2
-    paddw               xm1, xm1
     pmulhw              xm1, xm6
+    pavgw               xm2, xm7
     paddw               xm1, xm2
-    pmulhrsw            xm1, xm7
+    psrlw               xm1, 4
     packuswb            xm1, xm1
     movd       [dstq+dsq*0], xm1
     pextrd     [dstq+dsq*1], xm1, 1
@@ -667,10 +668,10 @@
     vperm2i128           m2, m0, m1, 0x21 ; 0 1
     mova                 m0, m1
     psubw                m1, m2
-    paddw                m1, m1
     pmulhw               m1, m6
+    pavgw                m2, m7
     paddw                m1, m2
-    pmulhrsw             m1, m7
+    psrlw                m1, 4
     vextracti128        xm2, m1, 1
     packuswb            xm1, xm2
     movq       [dstq+dsq*0], xm1
@@ -694,16 +695,16 @@
     pshufb               m3, m4
     pmaddubsw            m2, m5
     psubw                m1, m2, m0
-    paddw                m1, m1
     pmulhw               m1, m6
+    pavgw                m0, m7
     paddw                m1, m0
     pmaddubsw            m0, m3, m5
     psubw                m3, m0, m2
-    paddw                m3, m3
     pmulhw               m3, m6
+    pavgw                m2, m7
     paddw                m3, m2
-    pmulhrsw             m1, m7
-    pmulhrsw             m3, m7
+    psrlw                m1, 4
+    psrlw                m3, 4
     packuswb             m1, m3
     vpermq               m1, m1, q3120
     mova         [dstq+dsq*0], xm1
@@ -712,19 +713,21 @@
     sub                  hd, 2
     jg .hv_w16_loop
     RET
+.hv_w128:
+    lea                 r6d, [hq+(3<<16)]
+    jmp .hv_w32_start
+.hv_w64:
+    lea                 r6d, [hq+(1<<16)]
+.hv_w32_start:
+    mov                  r4, srcq
+    mov                  r7, dstq
 .hv_w32:
-    xor                 t2d, t2d
-.hv_w32gt:
-    mov                  t0, dstq
-    mov                  t1, srcq
 %if WIN64
     movaps              r4m, xmm8
 %endif
 .hv_w32_loop0:
     movu                 m0, [srcq+8*0]
-    vinserti128          m0, [srcq+8*2], 1
     movu                 m1, [srcq+8*1]
-    vinserti128          m1, [srcq+8*3], 1
     pshufb               m0, m4
     pshufb               m1, m4
     pmaddubsw            m0, m5
@@ -731,53 +734,44 @@
     pmaddubsw            m1, m5
 .hv_w32_loop:
     add                srcq, ssq
-    movu                xm2, [srcq+8*1]
-    vinserti128          m2, [srcq+8*3], 1
+    movu                 m2, [srcq+8*0]
+    movu                 m3, [srcq+8*1]
     pshufb               m2, m4
+    pshufb               m3, m4
     pmaddubsw            m2, m5
-    psubw                m3, m2, m1
-    paddw                m3, m3
-    pmulhw               m3, m6
-    paddw                m3, m1
-    mova                 m1, m2
-    pmulhrsw             m8, m3, m7
-    movu                xm2, [srcq+8*0]
-    vinserti128          m2, [srcq+8*2], 1
-    pshufb               m2, m4
-    pmaddubsw            m2, m5
-    psubw                m3, m2, m0
-    paddw                m3, m3
-    pmulhw               m3, m6
-    paddw                m3, m0
+    pmaddubsw            m3, m5
+    psubw                m8, m2, m0
+    pmulhw               m8, m6
+    pavgw                m0, m7
+    paddw                m8, m0
     mova                 m0, m2
-    pmulhrsw             m3, m7
-    packuswb             m3, m8
-    mova             [dstq], m3
+    psubw                m2, m3, m1
+    pmulhw               m2, m6
+    pavgw                m1, m7
+    paddw                m2, m1
+    mova                 m1, m3
+    psrlw                m8, 4
+    psrlw                m2, 4
+    packuswb             m8, m2
+    mova             [dstq], m8
     add                dstq, dsq
     dec                  hd
     jg .hv_w32_loop
-    movzx                hd, t2b
-    add                  t0, 32
-    add                  t1, 32
-    mov                dstq, t0
-    mov                srcq, t1
-    sub                 t2d, 1<<8
+    add                  r4, 32
+    add                  r7, 32
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
+    sub                 r6d, 1<<16
     jg .hv_w32_loop0
 %if WIN64
     movaps             xmm8, r4m
 %endif
     RET
-.hv_w64:
-    lea                 t2d, [hq+(1<<8)]
-    jmp .hv_w32gt
-.hv_w128:
-    lea                 t2d, [hq+(3<<8)]
-    jmp .hv_w32gt
 
-DECLARE_REG_TMP 3, 5, 6
 cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     movifnidn          mxyd, r5m ; mx
-    lea                  t2, [prep%+SUFFIX]
+    lea                  r6, [prep%+SUFFIX]
     tzcnt                wd, wm
     movifnidn            hd, hm
     test               mxyd, mxyd
@@ -786,8 +780,8 @@
     test               mxyd, mxyd
     jnz .v
 .prep:
-    movzx                wd, word [t2+wq*2+table_offset(prep,)]
-    add                  wq, t2
+    movzx                wd, word [r6+wq*2+table_offset(prep,)]
+    add                  wq, r6
     lea            stride3q, [strideq*3]
     jmp                  wq
 .prep_w4:
@@ -906,16 +900,16 @@
 .h:
     ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
     ; = (16 - mx) * src[x] + mx * src[x + 1]
-    imul               mxyd, 0xff01
+    imul               mxyd, 255
     vbroadcasti128       m4, [bilin_h_shuf8]
-    add                mxyd, 16 << 8
+    add                mxyd, 16
     movd                xm5, mxyd
     mov                mxyd, r6m ; my
     vpbroadcastw         m5, xm5
     test               mxyd, mxyd
     jnz .hv
-    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
-    add                  wq, t2
+    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+    add                  wq, r6
     lea            stride3q, [strideq*3]
     jmp                  wq
 .h_w4:
@@ -1079,10 +1073,10 @@
     RET
 .v:
     WIN64_SPILL_XMM       7
-    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
-    imul               mxyd, 0xff01
-    add                mxyd, 16 << 8
-    add                  wq, t2
+    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+    imul               mxyd, 255
+    add                mxyd, 16
+    add                  wq, r6
     lea            stride3q, [strideq*3]
     movd                xm6, mxyd
     vpbroadcastw         m6, xm6
@@ -1100,9 +1094,9 @@
     vpblendd             m2, m1, m0, 0xa0 ; 0 2 2 4
     vpblendd             m1, m3, 0xaa     ; 0 1 2 3
     vpblendd             m2, m3, 0x55     ; 1 2 3 4
-    punpcklbw            m2, m1
-    pmaddubsw            m2, m6
-    mova             [tmpq], m2
+    punpcklbw            m1, m2
+    pmaddubsw            m1, m6
+    mova             [tmpq], m1
     add                tmpq, 32
     sub                  hd, 4
     jg .v_w4_loop
@@ -1116,15 +1110,15 @@
     lea                srcq, [srcq+strideq*4]
     vpblendd             m1, m0, 0x03     ; 0 2 2 2
     vpbroadcastq         m0, [srcq+strideq*0]
-    vpblendd             m3, m2, 0x33     ; 1 3 1 3
-    vpblendd             m2, m1, m3, 0x0f ; 1 3 2 2
-    vpblendd             m1, m3, 0xf0     ; 0 2 1 3
-    vpblendd             m2, m0, 0xc0     ; 1 3 2 4
-    punpcklbw            m3, m2, m1
-    punpckhbw            m2, m1
-    pmaddubsw            m3, m6
+    vpblendd             m2, m3, 0xcc     ; 1 3 1 3
+    vpblendd             m3, m2, m1, 0xf0 ; 1 3 2 2
+    vpblendd             m2, m1, 0x0f     ; 0 2 1 3
+    vpblendd             m3, m0, 0xc0     ; 1 3 2 4
+    punpcklbw            m1, m2, m3
+    punpckhbw            m2, m3
+    pmaddubsw            m1, m6
     pmaddubsw            m2, m6
-    mova        [tmpq+32*0], m3
+    mova        [tmpq+32*0], m1
     mova        [tmpq+32*1], m2
     add                tmpq, 32*2
     sub                  hd, 4
@@ -1133,25 +1127,25 @@
 .v_w16:
     vbroadcasti128       m0, [srcq+strideq*0]
 .v_w16_loop:
-    vbroadcasti128       m1, [srcq+strideq*2]
-    vbroadcasti128       m2, [srcq+strideq*1]
+    vbroadcasti128       m1, [srcq+strideq*1]
+    vbroadcasti128       m2, [srcq+strideq*2]
     vbroadcasti128       m3, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
-    shufpd               m4, m0, m1, 0x0c ; 0 2  ; 0l2l 0h2h
+    shufpd               m4, m0, m2, 0x0c ; 0 2
     vbroadcasti128       m0, [srcq+strideq*0]
-    shufpd               m2, m2, m3, 0x0c ; 1 3  ; 1l3l 1h3h
-    shufpd               m1, m1, m0, 0x0c ; 2 4  ; 2l4l 2h4h
-    punpcklbw            m3, m2, m4
+    shufpd               m1, m3, 0x0c     ; 1 3
+    shufpd               m2, m0, 0x0c     ; 2 4
+    punpcklbw            m3, m4, m1
     punpcklbw            m5, m1, m2
+    punpckhbw            m4, m1
     punpckhbw            m1, m2
-    punpckhbw            m2, m4
     pmaddubsw            m3, m6
     pmaddubsw            m5, m6
-    pmaddubsw            m2, m6
+    pmaddubsw            m4, m6
     pmaddubsw            m1, m6
     mova        [tmpq+32*0], m3
     mova        [tmpq+32*1], m5
-    mova        [tmpq+32*2], m2
+    mova        [tmpq+32*2], m4
     mova        [tmpq+32*3], m1
     add                tmpq, 32*4
     sub                  hd, 4
@@ -1164,32 +1158,32 @@
     vpermq               m2, [srcq+strideq*2], q3120
     vpermq               m3, [srcq+stride3q ], q3120
     lea                srcq, [srcq+strideq*4]
-    punpcklbw            m4, m1, m0
-    punpckhbw            m5, m1, m0
+    punpcklbw            m4, m0, m1
+    punpckhbw            m5, m0, m1
     vpermq               m0, [srcq+strideq*0], q3120
     pmaddubsw            m4, m6
     pmaddubsw            m5, m6
     mova        [tmpq+32*0], m4
     mova        [tmpq+32*1], m5
-    punpcklbw            m4, m2, m1
-    punpckhbw            m5, m2, m1
+    punpcklbw            m4, m1, m2
+    punpckhbw            m1, m2
     pmaddubsw            m4, m6
+    pmaddubsw            m1, m6
+    punpcklbw            m5, m2, m3
+    punpckhbw            m2, m3
     pmaddubsw            m5, m6
+    pmaddubsw            m2, m6
     mova        [tmpq+32*2], m4
-    mova        [tmpq+32*3], m5
+    mova        [tmpq+32*3], m1
     add                tmpq, 32*8
-    punpcklbw            m4, m3, m2
-    punpckhbw            m5, m3, m2
-    punpcklbw            m1, m0, m3
-    punpckhbw            m2, m0, m3
-    pmaddubsw            m4, m6
-    pmaddubsw            m5, m6
+    punpcklbw            m1, m3, m0
+    punpckhbw            m3, m0
     pmaddubsw            m1, m6
-    pmaddubsw            m2, m6
-    mova        [tmpq-32*4], m4
-    mova        [tmpq-32*3], m5
+    pmaddubsw            m3, m6
+    mova        [tmpq-32*4], m5
+    mova        [tmpq-32*3], m2
     mova        [tmpq-32*2], m1
-    mova        [tmpq-32*1], m2
+    mova        [tmpq-32*1], m3
     sub                  hd, 4
     jg .v_w32_loop
     RET
@@ -1200,14 +1194,14 @@
     vpermq               m2, [srcq+strideq*1+32*0], q3120
     vpermq               m3, [srcq+strideq*1+32*1], q3120
     lea                srcq, [srcq+strideq*2]
-    punpcklbw            m4, m2, m0
-    punpckhbw            m5, m2, m0
+    punpcklbw            m4, m0, m2
+    punpckhbw            m0, m2
     pmaddubsw            m4, m6
-    pmaddubsw            m5, m6
+    pmaddubsw            m0, m6
     mova        [tmpq+32*0], m4
-    mova        [tmpq+32*1], m5
-    punpcklbw            m4, m3, m1
-    punpckhbw            m5, m3, m1
+    mova        [tmpq+32*1], m0
+    punpcklbw            m4, m1, m3
+    punpckhbw            m5, m1, m3
     vpermq               m0, [srcq+strideq*0+32*0], q3120
     vpermq               m1, [srcq+strideq*0+32*1], q3120
     pmaddubsw            m4, m6
@@ -1215,52 +1209,52 @@
     mova        [tmpq+32*2], m4
     mova        [tmpq+32*3], m5
     add                tmpq, 32*8
-    punpcklbw            m4, m0, m2
-    punpckhbw            m5, m0, m2
-    punpcklbw            m2, m1, m3
-    punpckhbw            m3, m1, m3
+    punpcklbw            m4, m2, m0
+    punpckhbw            m2, m0
+    punpcklbw            m5, m3, m1
+    punpckhbw            m3, m1
     pmaddubsw            m4, m6
-    pmaddubsw            m5, m6
     pmaddubsw            m2, m6
+    pmaddubsw            m5, m6
     pmaddubsw            m3, m6
     mova        [tmpq-32*4], m4
-    mova        [tmpq-32*3], m5
-    mova        [tmpq-32*2], m2
+    mova        [tmpq-32*3], m2
+    mova        [tmpq-32*2], m5
     mova        [tmpq-32*1], m3
     sub                  hd, 2
     jg .v_w64_loop
     RET
 .v_w128:
-    mov                  t0, tmpq
-    mov                  t1, srcq
-    lea                 t2d, [hq+(3<<8)]
+    lea                 r6d, [hq+(3<<8)]
+    mov                  r3, srcq
+    mov                  r5, tmpq
 .v_w128_loop0:
     vpermq               m0, [srcq+strideq*0], q3120
 .v_w128_loop:
     vpermq               m1, [srcq+strideq*1], q3120
     lea                srcq, [srcq+strideq*2]
-    punpcklbw            m2, m1, m0
-    punpckhbw            m3, m1, m0
+    punpcklbw            m2, m0, m1
+    punpckhbw            m3, m0, m1
     vpermq               m0, [srcq+strideq*0], q3120
-    punpcklbw            m4, m0, m1
-    punpckhbw            m5, m0, m1
     pmaddubsw            m2, m6
     pmaddubsw            m3, m6
+    punpcklbw            m4, m1, m0
+    punpckhbw            m1, m0
     pmaddubsw            m4, m6
-    pmaddubsw            m5, m6
+    pmaddubsw            m1, m6
     mova        [tmpq+32*0], m2
     mova        [tmpq+32*1], m3
     mova        [tmpq+32*8], m4
-    mova        [tmpq+32*9], m5
+    mova        [tmpq+32*9], m1
     add                tmpq, 32*16
     sub                  hd, 2
     jg .v_w128_loop
-    movzx                hd, t2b
-    add                  t0, 64
-    add                  t1, 32
-    mov                tmpq, t0
-    mov                srcq, t1
-    sub                 t2d, 1<<8
+    add                  r3, 32
+    add                  r5, 64
+    movzx                hd, r6b
+    mov                srcq, r3
+    mov                tmpq, r5
+    sub                 r6d, 1<<8
     jg .v_w128_loop0
     RET
 .hv:
@@ -1268,11 +1262,11 @@
     ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
     %assign stack_offset stack_offset - stack_size_padded
     WIN64_SPILL_XMM       7
-    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
     shl                mxyd, 11
     movd                xm6, mxyd
     vpbroadcastw         m6, xm6
-    add                  wq, t2
+    add                  wq, r6
     lea            stride3q, [strideq*3]
     jmp                  wq
 .hv_w4:
@@ -1388,10 +1382,19 @@
     dec                  hd
     jg .hv_w32_loop
     RET
+.hv_w128:
+    lea                 r3d, [hq+(7<<8)]
+    mov                 r6d, 256
+    jmp .hv_w64_start
 .hv_w64:
-    mov                  t0, tmpq
-    mov                  t1, srcq
-    lea                 t2d, [hq+(3<<8)]
+    lea                 r3d, [hq+(3<<8)]
+    mov                 r6d, 128
+.hv_w64_start:
+%if WIN64
+    PUSH                 r7
+%endif
+    mov                  r5, srcq
+    mov                  r7, tmpq
 .hv_w64_loop0:
     movu                xm0, [srcq+strideq*0+8*0]
     vinserti128          m0, [srcq+strideq*0+8*1], 1
@@ -1413,57 +1416,22 @@
     psubw                m2, m0, m1
     pmulhrsw             m2, m6
     paddw                m2, m1
-    mova        [tmpq+32*0], m3
-    add                tmpq, 32*8
-    mova        [tmpq-32*4], m2
+    mova        [tmpq+r6*0], m3
+    mova        [tmpq+r6*1], m2
+    lea                tmpq, [tmpq+r6*2]
     sub                  hd, 2
     jg .hv_w64_loop
-    movzx                hd, t2b
-    add                  t0, 32
-    add                  t1, 16
-    mov                tmpq, t0
-    mov                srcq, t1
-    sub                 t2d, 1<<8
+    add                  r5, 16
+    add                  r7, 32
+    movzx                hd, r3b
+    mov                srcq, r5
+    mov                tmpq, r7
+    sub                 r3d, 1<<8
     jg .hv_w64_loop0
+%if WIN64
+    POP                  r7
+%endif
     RET
-.hv_w128:
-    mov                  t0, tmpq
-    mov                  t1, srcq
-    lea                 t2d, [hq+(7<<8)]
-.hv_w128_loop0:
-    movu                xm0, [srcq+strideq*0+8*0]
-    vinserti128          m0, [srcq+strideq*0+8*1], 1
-    pshufb               m0, m4
-    pmaddubsw            m0, m5
-.hv_w128_loop:
-    movu                xm1, [srcq+strideq*1+8*0]
-    vinserti128          m1, [srcq+strideq*1+8*1], 1
-    lea                srcq, [srcq+strideq*2]
-    movu                xm2, [srcq+strideq*0+8*0]
-    vinserti128          m2, [srcq+strideq*0+8*1], 1
-    pshufb               m1, m4
-    pshufb               m2, m4
-    pmaddubsw            m1, m5
-    psubw                m3, m1, m0
-    pmulhrsw             m3, m6
-    paddw                m3, m0
-    pmaddubsw            m0, m2, m5
-    psubw                m2, m0, m1
-    pmulhrsw             m2, m6
-    paddw                m2, m1
-    mova        [tmpq+32*0], m3
-    mova        [tmpq+32*8], m2
-    add                tmpq, 32*16
-    sub                  hd, 2
-    jg .hv_w128_loop
-    movzx                hd, t2b
-    add                  t0, 32
-    add                  t1, 16
-    mov                tmpq, t0
-    mov                srcq, t1
-    sub                 t2d, 1<<8
-    jg .hv_w128_loop0
-    RET
 
 ; int8_t subpel_filters[5][15][8]
 %assign FILTER_REGULAR (0*15 << 16) | 3*15
@@ -1676,12 +1644,12 @@
     movd                xm2, [srcq+ssq*0]
     pinsrw              xm2, [srcq+ssq*1], 2
     pinsrw              xm2, [srcq+ssq*2], 4
-    pinsrw              xm2, [srcq+ss3q ], 6 ; 0 1 2 3
-    lea                srcq, [srcq+ssq*4]
-    movd                xm3, [srcq+ssq*0]
-    vpbroadcastd        xm1, [srcq+ssq*1]
-    vpbroadcastd        xm0, [srcq+ssq*2]
     add                srcq, ss3q
+    pinsrw              xm2, [srcq+ssq*0], 6 ; 0 1 2 3
+    movd                xm3, [srcq+ssq*1]
+    vpbroadcastd        xm1, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpbroadcastd        xm0, [srcq+ssq*0]
     vpblendd            xm3, xm1, 0x02       ; 4 5
     vpblendd            xm1, xm0, 0x02       ; 5 6
     palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
@@ -1696,10 +1664,10 @@
     mova                xm2, xm3
     pmaddubsw           xm3, xm10            ; a2 b2
     paddw               xm5, xm3
-    vpbroadcastd        xm4, [srcq+ssq*0]
-    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
-    vpbroadcastd        xm0, [srcq+ssq*1]
+    vpbroadcastd        xm4, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
+    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
+    vpbroadcastd        xm0, [srcq+ssq*0]
     vpblendd            xm4, xm0, 0x02       ; 7 8
     punpcklbw           xm3, xm4             ; 67 78
     pmaddubsw           xm4, xm3, xm11       ; a3 b3
@@ -1716,12 +1684,12 @@
     movd                xm2, [srcq+ssq*0]
     pinsrd              xm2, [srcq+ssq*1], 1
     pinsrd              xm2, [srcq+ssq*2], 2
-    pinsrd              xm2, [srcq+ss3q ], 3 ; 0 1 2 3
-    lea                srcq, [srcq+ssq*4]
-    movd                xm3, [srcq+ssq*0]
-    vpbroadcastd        xm1, [srcq+ssq*1]
-    vpbroadcastd        xm0, [srcq+ssq*2]
     add                srcq, ss3q
+    pinsrd              xm2, [srcq+ssq*0], 3 ; 0 1 2 3
+    movd                xm3, [srcq+ssq*1]
+    vpbroadcastd        xm1, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpbroadcastd        xm0, [srcq+ssq*0]
     vpblendd            xm3, xm1, 0x02       ; 4 5
     vpblendd            xm1, xm0, 0x02       ; 5 6
     palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
@@ -1736,10 +1704,10 @@
     mova                xm2, xm3
     pmaddubsw           xm3, xm10            ; a2 b2
     paddw               xm5, xm3
-    vpbroadcastd        xm4, [srcq+ssq*0]
-    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
-    vpbroadcastd        xm0, [srcq+ssq*1]
+    vpbroadcastd        xm4, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
+    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
+    vpbroadcastd        xm0, [srcq+ssq*0]
     vpblendd            xm4, xm0, 0x02       ; 7 8
     punpcklbw           xm3, xm4             ; 67 78
     pmaddubsw           xm4, xm3, xm11       ; a3 b3
@@ -1756,12 +1724,12 @@
     movq                xm1, [srcq+ssq*0]
     vpbroadcastq         m4, [srcq+ssq*1]
     vpbroadcastq         m2, [srcq+ssq*2]
-    vpbroadcastq         m5, [srcq+ss3q ]
-    lea                srcq, [srcq+ssq*4]
-    vpbroadcastq         m3, [srcq+ssq*0]
-    vpbroadcastq         m6, [srcq+ssq*1]
-    vpbroadcastq         m0, [srcq+ssq*2]
     add                srcq, ss3q
+    vpbroadcastq         m5, [srcq+ssq*0]
+    vpbroadcastq         m3, [srcq+ssq*1]
+    vpbroadcastq         m6, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpbroadcastq         m0, [srcq+ssq*0]
     vpblendd             m1, m4, 0x30
     vpblendd             m4, m2, 0x30
     punpcklbw            m1, m4      ; 01 12
@@ -1772,6 +1740,8 @@
     vpblendd             m6, m0, 0x30
     punpcklbw            m3, m6      ; 45 56
 .v_w8_loop:
+    vpbroadcastq         m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
     pmaddubsw            m5, m1, m8  ; a0 b0
     mova                 m1, m2
     pmaddubsw            m2, m9      ; a1 b1
@@ -1779,10 +1749,8 @@
     mova                 m2, m3
     pmaddubsw            m3, m10     ; a2 b2
     paddw                m5, m3
-    vpbroadcastq         m4, [srcq+ssq*0]
     vpblendd             m3, m0, m4, 0x30
-    vpbroadcastq         m0, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
+    vpbroadcastq         m0, [srcq+ssq*0]
     vpblendd             m4, m0, 0x30
     punpcklbw            m3, m4      ; 67 78
     pmaddubsw            m4, m3, m11 ; a3 b3
@@ -1800,30 +1768,28 @@
 .v_w32:
 .v_w64:
 .v_w128:
-    lea                 r6d, [wq-16]
-    mov                  r4, dstq
-    mov                  r7, srcq
-    shl                 r6d, 4
-    mov                 r6b, hb
+    lea                 r6d, [wq*8-128]
+    mov                  r4, srcq
+    mov                  r7, dstq
+    lea                 r6d, [hq+r6*2]
 .v_w16_loop0:
     vbroadcasti128       m4, [srcq+ssq*0]
     vbroadcasti128       m5, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
-    vbroadcasti128       m0, [srcq+ssq*1]
-    vbroadcasti128       m6, [srcq+ssq*0]
-    lea                srcq, [srcq+ssq*2]
-    vbroadcasti128       m1, [srcq+ssq*0]
-    vbroadcasti128       m2, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
+    vbroadcasti128       m6, [srcq+ssq*2]
+    add                srcq, ss3q
+    vbroadcasti128       m0, [srcq+ssq*0]
+    vbroadcasti128       m1, [srcq+ssq*1]
+    vbroadcasti128       m2, [srcq+ssq*2]
+    add                srcq, ss3q
     vbroadcasti128       m3, [srcq+ssq*0]
-    shufpd               m4, m4, m0, 0x0c
-    shufpd               m5, m5, m1, 0x0c
+    shufpd               m4, m0, 0x0c
+    shufpd               m5, m1, 0x0c
     punpcklbw            m1, m4, m5 ; 01
     punpckhbw            m4, m5     ; 34
-    shufpd               m6, m6, m2, 0x0c
+    shufpd               m6, m2, 0x0c
     punpcklbw            m2, m5, m6 ; 12
     punpckhbw            m5, m6     ; 45
-    shufpd               m0, m0, m3, 0x0c
+    shufpd               m0, m3, 0x0c
     punpcklbw            m3, m6, m0 ; 23
     punpckhbw            m6, m0     ; 56
 .v_w16_loop:
@@ -1861,11 +1827,11 @@
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .v_w16_loop
-    movzx                hd, r6b
     add                  r4, 16
     add                  r7, 16
-    mov                dstq, r4
-    mov                srcq, r7
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
     sub                 r6d, 1<<8
     jg .v_w16_loop0
     RET
@@ -1898,12 +1864,12 @@
     movq                xm2, [srcq+ssq*0]
     movhps              xm2, [srcq+ssq*1]
     movq                xm0, [srcq+ssq*2]
-    movhps              xm0, [srcq+ss3q ]
-    lea                srcq, [srcq+ssq*4]
-    vpbroadcastq         m3, [srcq+ssq*0]
-    vpbroadcastq         m4, [srcq+ssq*1]
-    vpbroadcastq         m1, [srcq+ssq*2]
     add                srcq, ss3q
+    movhps              xm0, [srcq+ssq*0]
+    vpbroadcastq         m3, [srcq+ssq*1]
+    vpbroadcastq         m4, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpbroadcastq         m1, [srcq+ssq*0]
     vpblendd             m2, m3, 0x30
     vpblendd             m0, m1, 0x30
     vpblendd             m2, m4, 0xc0
@@ -1920,6 +1886,11 @@
     pshufd              xm0, xm3, q2121
     punpcklwd           xm3, xm0       ; 45 56
 .hv_w2_loop:
+    movq                xm4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movhps              xm4, [srcq+ssq*0]
+    pshufb              xm4, xm6
+    pmaddubsw           xm4, xm7
     pmaddwd             xm5, xm1, xm10 ; a0 b0
     mova                xm1, xm2
     pmaddwd             xm2, xm11      ; a1 b1
@@ -1926,14 +1897,9 @@
     paddd               xm5, xm2
     mova                xm2, xm3
     pmaddwd             xm3, xm12      ; a2 b2
-    paddd               xm5, xm3
-    movq                xm4, [srcq+ssq*0]
-    movhps              xm4, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
-    pshufb              xm4, xm6
-    pmaddubsw           xm4, xm7
     phaddw              xm4, xm4
     pmulhrsw            xm4, xm8
+    paddd               xm5, xm3
     palignr             xm3, xm4, xm0, 12
     mova                xm0, xm4
     punpcklwd           xm3, xm0       ; 67 78
@@ -1954,13 +1920,13 @@
     vpbroadcastq         m2, [srcq+ssq*0]
     vpbroadcastq         m4, [srcq+ssq*1]
     vpbroadcastq         m0, [srcq+ssq*2]
-    vpbroadcastq         m5, [srcq+ss3q ]
-    lea                srcq, [srcq+ssq*4]
-    vpbroadcastq         m3, [srcq+ssq*0]
+    add                srcq, ss3q
+    vpbroadcastq         m5, [srcq+ssq*0]
+    vpbroadcastq         m3, [srcq+ssq*1]
     vpblendd             m2, m4, 0xcc ; 0 1
-    vpbroadcastq         m4, [srcq+ssq*1]
-    vpbroadcastq         m1, [srcq+ssq*2]
+    vpbroadcastq         m4, [srcq+ssq*2]
     add                srcq, ss3q
+    vpbroadcastq         m1, [srcq+ssq*0]
     vpblendd             m0, m5, 0xcc ; 2 3
     vpblendd             m3, m4, 0xcc ; 4 5
     pshufb               m2, m6
@@ -1981,6 +1947,8 @@
     pshufd               m0, m3, q2121
     punpcklwd            m3, m0       ; 45 56
 .hv_w4_loop:
+    vpbroadcastq         m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
     pmaddwd              m5, m1, m10  ; a0 b0
     mova                 m1, m2
     pmaddwd              m2, m11      ; a1 b1
@@ -1988,9 +1956,7 @@
     mova                 m2, m3
     pmaddwd              m3, m12      ; a2 b2
     paddd                m5, m3
-    vpbroadcastq         m4, [srcq+ssq*0]
-    vpbroadcastq         m3, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
+    vpbroadcastq         m3, [srcq+ssq*0]
     vpblendd             m4, m3, 0xcc ; 7 8
     pshufb               m4, m6
     pmaddubsw            m4, m7
@@ -2031,25 +1997,23 @@
     pshufd              m13, m0, q1111
     pshufd              m14, m0, q2222
     pshufd              m15, m0, q3333
-    lea                 r6d, [wq-8]
-    mov                  r4, dstq
-    mov                  r7, srcq
-    shl                 r6d, 5
-    mov                 r6b, hb
+    lea                 r6d, [wq*8-64]
+    mov                  r4, srcq
+    mov                  r7, dstq
+    lea                 r6d, [hq+r6*4]
 .hv_w8_loop0:
     vbroadcasti128       m7, [subpel_h_shufA]
-    vbroadcasti128       m8, [subpel_h_shufB]
-    vbroadcasti128       m9, [subpel_h_shufC]
     movu                xm4, [srcq+ssq*0]
+    vbroadcasti128       m8, [subpel_h_shufB]
     movu                xm5, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
-    movu                xm6, [srcq+ssq*0]
-    vbroadcasti128       m0, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
+    vbroadcasti128       m9, [subpel_h_shufC]
+    movu                xm6, [srcq+ssq*2]
+    add                srcq, ss3q
+    vbroadcasti128       m0, [srcq+ssq*0]
     vpblendd             m4, m0, 0xf0        ; 0 3
-    vinserti128          m5, [srcq+ssq*0], 1 ; 1 4
-    vinserti128          m6, [srcq+ssq*1], 1 ; 2 5
-    lea                srcq, [srcq+ssq*2]
+    vinserti128          m5, [srcq+ssq*1], 1 ; 1 4
+    vinserti128          m6, [srcq+ssq*2], 1 ; 2 5
+    add                srcq, ss3q
     vinserti128          m0, [srcq+ssq*0], 1 ; 3 6
 %macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
     pshufb               %3, %1, %6
@@ -2130,11 +2094,11 @@
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .hv_w8_loop
-    movzx                hd, r6b
     add                  r4, 8
     add                  r7, 8
-    mov                dstq, r4
-    mov                srcq, r7
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
     sub                 r6d, 1<<8
     jg .hv_w8_loop0
     RET
@@ -2153,48 +2117,6 @@
     pmulhrsw             m0, m4
 %endmacro
 
-%macro PREP_8TAP_V_W4 5 ; round, weights
-    movd                xm0, [srcq+strideq*0]
-    vpbroadcastd         m1, [srcq+strideq*2]
-    vpbroadcastd        xm2, [srcq+strideq*1]
-    vpbroadcastd         m3, [srcq+stride3q ]
-    lea                srcq, [srcq+strideq*4]
-    vpblendd             m1, m0, 0x01     ; 0 2 2 _   2 _ _ _
-    vpblendd             m3, m2, 0x03     ; 1 1 3 3   3 3 _ _
-    vpbroadcastd         m0, [srcq+strideq*0]
-    vpbroadcastd         m2, [srcq+strideq*1]
-    vpblendd             m1, m0, 0x68     ; 0 2 2 4   2 4 4 _
-    vpbroadcastd         m0, [srcq+strideq*2]
-    vbroadcasti128       m5, [deint_shuf4]
-    vpblendd             m3, m2, 0xc0     ; 1 1 3 3   3 3 5 5
-    vpblendd             m2, m3, m1, 0x55 ; 0 1 2 3   2 3 4 5
-    vpblendd             m3, m1, 0xaa     ; 1 2 3 4   3 4 5 _
-    punpcklbw            m1, m2, m3       ; 01  12    23  34
-    vpblendd             m3, m0, 0x80     ; 1 2 3 4   3 4 5 6
-    punpckhbw            m2, m3           ; 23  34    45  56
-.v_w4_loop:
-    pinsrd              xm0, [srcq+stride3q ], 1
-    lea                srcq, [srcq+strideq*4]
-    vpbroadcastd         m3, [srcq+strideq*0]
-    vpbroadcastd         m4, [srcq+strideq*1]
-    vpblendd             m3, m4, 0x20     ; _ _ 8 _   8 9 _ _
-    vpblendd             m3, m0, 0x03     ; 6 7 8 _   8 9 _ _
-    vpbroadcastd         m0, [srcq+strideq*2]
-    vpblendd             m3, m0, 0x40     ; 6 7 8 _   8 9 a _
-    pshufb               m3, m5           ; 67  78    89  9a
-    pmaddubsw            m4, m1, m%2
-    vperm2i128           m1, m2, m3, 0x21 ; 45  56    67  78
-    pmaddubsw            m2, m%3
-    paddw                m4, m2
-    mova                 m2, m3
-    pmaddubsw            m3, m%5
-    paddw                m3, m4
-    pmaddubsw            m4, m1, m%4
-    paddw                m3, m4
-    pmulhrsw             m3, m%1
-    mova             [tmpq], m3
-%endmacro
-
 %if WIN64
 DECLARE_REG_TMP 6, 4
 %else
@@ -2347,7 +2269,45 @@
     jg .v_w16
     je .v_w8
 .v_w4:
-    PREP_8TAP_V_W4 7, 8, 9, 10, 11
+    movd                xm0, [srcq+strideq*0]
+    vpbroadcastd         m1, [srcq+strideq*2]
+    vpbroadcastd        xm2, [srcq+strideq*1]
+    add                srcq, stride3q
+    vpbroadcastd         m3, [srcq+strideq*0]
+    vpblendd             m1, m0, 0x01     ; 0 2 2 _   2 _ _ _
+    vpblendd             m3, m2, 0x03     ; 1 1 3 3   3 3 _ _
+    vpbroadcastd         m0, [srcq+strideq*1]
+    vpbroadcastd         m2, [srcq+strideq*2]
+    vpblendd             m1, m0, 0x68     ; 0 2 2 4   2 4 4 _
+    vpbroadcastd         m0, [srcq+stride3q ]
+    vbroadcasti128       m5, [deint_shuf4]
+    vpblendd             m3, m2, 0xc0     ; 1 1 3 3   3 3 5 5
+    vpblendd             m2, m3, m1, 0x55 ; 0 1 2 3   2 3 4 5
+    vpblendd             m3, m1, 0xaa     ; 1 2 3 4   3 4 5 _
+    punpcklbw            m1, m2, m3       ; 01  12    23  34
+    vpblendd             m3, m0, 0x80     ; 1 2 3 4   3 4 5 6
+    punpckhbw            m2, m3           ; 23  34    45  56
+.v_w4_loop:
+    lea                srcq, [srcq+strideq*4]
+    pinsrd              xm0, [srcq+strideq*0], 1
+    vpbroadcastd         m3, [srcq+strideq*1]
+    vpbroadcastd         m4, [srcq+strideq*2]
+    vpblendd             m3, m0, 0x03     ; 6 7 8 _   8 _ _ _
+    vpbroadcastd         m0, [srcq+stride3q ]
+    vpblendd             m3, m4, 0x20     ; 6 7 8 _   8 9 _ _
+    vpblendd             m3, m0, 0x40     ; 6 7 8 _   8 9 a _
+    pshufb               m3, m5           ; 67  78    89  9a
+    pmaddubsw            m4, m1, m8
+    vperm2i128           m1, m2, m3, 0x21 ; 45  56    67  78
+    pmaddubsw            m2, m9
+    paddw                m4, m2
+    mova                 m2, m3
+    pmaddubsw            m3, m11
+    paddw                m3, m4
+    pmaddubsw            m4, m1, m10
+    paddw                m3, m4
+    pmulhrsw             m3, m7
+    mova             [tmpq], m3
     add                tmpq, 32
     sub                  hd, 4
     jg .v_w4_loop
@@ -2406,11 +2366,10 @@
     jg .v_w8_loop
     RET
 .v_w16:
-    lea                 r6d, [wq-16]
-    mov                  r5, tmpq
-    mov                  r7, srcq
-    shl                 r6d, 4
-    mov                 r6b, hb
+    add                  wd, wd
+    mov                  r5, srcq
+    mov                  r7, tmpq
+    lea                 r6d, [hq+wq*8-256]
 .v_w16_loop0:
     vbroadcasti128       m4, [srcq+strideq*0]
     vbroadcasti128       m5, [srcq+strideq*1]
@@ -2461,15 +2420,15 @@
     pmulhrsw            m14, m7
     pmulhrsw            m15, m7
     mova        [tmpq+wq*0], m14
-    mova        [tmpq+wq*2], m15
-    lea                tmpq, [tmpq+wq*4]
+    mova        [tmpq+wq*1], m15
+    lea                tmpq, [tmpq+wq*2]
     sub                  hd, 2
     jg .v_w16_loop
+    add                  r5, 16
+    add                  r7, 32
     movzx                hd, r6b
-    add                  r5, 32
-    add                  r7, 16
-    mov                tmpq, r5
-    mov                srcq, r7
+    mov                srcq, r5
+    mov                tmpq, r7
     sub                 r6d, 1<<8
     jg .v_w16_loop0
     RET
@@ -2557,8 +2516,8 @@
     vpbroadcastq         m2, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
     paddd                m6, m4
-    paddd                m5, m3
     vpbroadcastq         m4, [srcq+strideq*0]
+    paddd                m5, m3
     vpbroadcastq         m3, [srcq+strideq*1]
     vpblendd             m2, m4, 0xcc
     vpbroadcastq         m4, [srcq+strideq*2]
@@ -2591,18 +2550,17 @@
     jg .hv_w4_loop
     RET
 .hv_w8:
-    lea                 r6d, [wq-8]
-    mov                  r5, tmpq
-    mov                  r7, srcq
-    shl                 r6d, 5
-    mov                 r6b, hb
+    lea                 r6d, [wq*8-64]
+    mov                  r5, srcq
+    mov                  r7, tmpq
+    lea                 r6d, [hq+r6*4]
 .hv_w8_loop0:
     vbroadcasti128       m7, [subpel_h_shufA]
-    vbroadcasti128       m8, [subpel_h_shufB]
-    vbroadcasti128       m9, [subpel_h_shufC]
     movu                xm4, [srcq+strideq*0]
+    vbroadcasti128       m8, [subpel_h_shufB]
     movu                xm5, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
+    vbroadcasti128       m9, [subpel_h_shufC]
     movu                xm6, [srcq+strideq*0]
     vbroadcasti128       m0, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
@@ -2676,11 +2634,11 @@
     lea                tmpq, [tmpq+wq*4]
     sub                  hd, 2
     jg .hv_w8_loop
+    add                  r5, 8
+    add                  r7, 16
     movzx                hd, r6b
-    add                  r5, 16
-    add                  r7, 8
-    mov                tmpq, r5
-    mov                srcq, r7
+    mov                srcq, r5
+    mov                tmpq, r7
     sub                 r6d, 1<<8
     jg .hv_w8_loop0
     RET
--- a/src/x86/mc_sse.asm
+++ b/src/x86/mc_sse.asm
@@ -57,8 +57,8 @@
 subpel_h_shufC: db 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 subpel_s_shuf2: db 0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
 subpel_s_shuf8: db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
-bilin_h_shuf4:  db 1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
-bilin_h_shuf8:  db 1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+bilin_h_shuf4:  db 0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
+bilin_h_shuf8:  db 0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
 unpckw:         db 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
 
 pb_8x0_8x8: times 8 db 0
@@ -77,6 +77,7 @@
 pw_1:     times 8 dw 1
 pw_2:     times 8 dw 2
 pw_8:     times 8 dw 8
+pw_15:    times 8 dw 15
 pw_26:    times 8 dw 26
 pw_34:    times 8 dw 34
 pw_512:   times 8 dw 512
@@ -220,16 +221,18 @@
  DECLARE_REG_TMP 7
  %define base 0
 %endif
-;
+
 %macro RESTORE_DSQ_32 1
  %if ARCH_X86_32
    mov                  %1, dsm ; restore dsq
  %endif
 %endmacro
-;
-cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
+
+cglobal put_bilin, 1, 8, 0, dst, ds, src, ss, w, h, mxy
     movifnidn          mxyd, r6m ; mx
     LEA                  t0, put_ssse3
+    movifnidn          srcq, srcmp
+    movifnidn           ssq, ssmp
     tzcnt                wd, wm
     mov                  hd, hm
     test               mxyd, mxyd
@@ -335,20 +338,19 @@
 .h:
     ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
     ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
-    imul               mxyd, 0xff01
+    imul               mxyd, 0x00ff00ff
     mova                 m4, [base+bilin_h_shuf8]
     mova                 m0, [base+bilin_h_shuf4]
-    add                mxyd, 16 << 8
+    add                mxyd, 0x00100010
     movd                 m5, mxyd
     mov                mxyd, r7m ; my
-    pshuflw              m5, m5, q0000
-    punpcklqdq           m5, m5
+    pshufd               m5, m5, q0000
     test               mxyd, mxyd
     jnz .hv
     movzx                wd, word [t0+wq*2+table_offset(put, _bilin_h)]
     mova                 m3, [base+pw_2048]
     add                  wq, t0
-    RESTORE_DSQ_32       t0
+    movifnidn           dsq, dsmp
     jmp                  wq
 .h_w2:
     pshufd               m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
@@ -485,14 +487,13 @@
     RET
 .v:
     movzx                wd, word [t0+wq*2+table_offset(put, _bilin_v)]
-    imul               mxyd, 0xff01
+    imul               mxyd, 0x00ff00ff
     mova                 m5, [base+pw_2048]
-    add                mxyd, 16 << 8
+    add                mxyd, 0x00100010
     add                  wq, t0
     movd                 m4, mxyd
-    pshuflw              m4, m4, q0000
-    punpcklqdq           m4, m4
-    RESTORE_DSQ_32       t0
+    pshufd               m4, m4, q0000
+    movifnidn           dsq, dsmp
     jmp                  wq
 .v_w2:
     movd                 m0, [srcq+ssq*0]
@@ -499,9 +500,9 @@
 .v_w2_loop:
     pinsrw               m0, [srcq+ssq*1], 1 ; 0 1
     lea                srcq, [srcq+ssq*2]
-    pshuflw              m2, m0, q2301
+    pshuflw              m1, m0, q2301
     pinsrw               m0, [srcq+ssq*0], 0 ; 2 1
-    punpcklbw            m1, m0, m2
+    punpcklbw            m1, m0
     pmaddubsw            m1, m4
     pmulhrsw             m1, m5
     packuswb             m1, m1
@@ -516,11 +517,12 @@
 .v_w4:
     movd                 m0, [srcq+ssq*0]
 .v_w4_loop:
-    movd                 m1, [srcq+ssq*1]
+    movd                 m2, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    punpckldq            m2, m0, m1 ; 0 1
+    mova                 m1, m0
     movd                 m0, [srcq+ssq*0]
-    punpckldq            m1, m0  ; 1 2
+    punpckldq            m1, m2 ; 0 1
+    punpckldq            m2, m0 ; 1 2
     punpcklbw            m1, m2
     pmaddubsw            m1, m4
     pmulhrsw             m1, m5
@@ -536,11 +538,12 @@
 .v_w8:
     movq                 m0, [srcq+ssq*0]
 .v_w8_loop:
-    movq                 m3, [srcq+ssq*1]
+    movq                 m2, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    punpcklbw            m1, m3, m0
+    mova                 m1, m0
     movq                 m0, [srcq+ssq*0]
-    punpcklbw            m2, m0, m3
+    punpcklbw            m1, m2
+    punpcklbw            m2, m0
     pmaddubsw            m1, m4
     pmaddubsw            m2, m4
     pmulhrsw             m1, m5
@@ -552,66 +555,69 @@
     sub                  hd, 2
     jg .v_w8_loop
     RET
-    ;
 %macro PUT_BILIN_V_W16 0
     movu                 m0, [srcq+ssq*0]
 %%loop:
     movu                 m3, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    punpcklbw            m1, m3, m0
-    punpckhbw            m2, m3, m0
+    mova                 m1, m0
+    mova                 m2, m0
     movu                 m0, [srcq+ssq*0]
+    punpcklbw            m1, m3
+    punpckhbw            m2, m3
     pmaddubsw            m1, m4
     pmaddubsw            m2, m4
     pmulhrsw             m1, m5
     pmulhrsw             m2, m5
     packuswb             m1, m2
-    mova       [dstq+dsq*0], m1
-    punpcklbw            m1, m0, m3
-    punpckhbw            m2, m0, m3
-    pmaddubsw            m1, m4
+    punpcklbw            m2, m3, m0
+    punpckhbw            m3, m0
     pmaddubsw            m2, m4
-    pmulhrsw             m1, m5
+    pmaddubsw            m3, m4
     pmulhrsw             m2, m5
-    packuswb             m1, m2
-    mova       [dstq+dsq*1], m1
+    pmulhrsw             m3, m5
+    packuswb             m2, m3
+    mova       [dstq+dsq*0], m1
+    mova       [dstq+dsq*1], m2
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg %%loop
 %endmacro
-    ;
 .v_w16:
     PUT_BILIN_V_W16
     RET
+.v_w128:
+    lea                 r6d, [hq+(7<<16)]
+    jmp .v_w16gt
+.v_w64:
+    lea                 r6d, [hq+(3<<16)]
+    jmp .v_w16gt
+.v_w32:
+    lea                 r6d, [hq+(1<<16)]
 .v_w16gt:
-    mov                  r4, dstq
-    mov                  r6, srcq
+    mov                  r4, srcq
+%if ARCH_X86_64
+    mov                  r7, dstq
+%endif
 .v_w16gt_loop:
-%if ARCH_X86_32
-    mov                bakm, t0q
-    RESTORE_DSQ_32       t0
     PUT_BILIN_V_W16
-    mov                 t0q, bakm
+%if ARCH_X86_64
+    add                  r4, 16
+    add                  r7, 16
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
 %else
-    PUT_BILIN_V_W16
+    mov                dstq, dstmp
+    add                  r4, 16
+    movzx                hd, r6w
+    add                dstq, 16
+    mov                srcq, r4
+    mov               dstmp, dstq
 %endif
-    mov                  hw, t0w
-    add                  r4, mmsize
-    add                  r6, mmsize
-    mov                dstq, r4
-    mov                srcq, r6
-    sub                 t0d, 1<<16
+    sub                 r6d, 1<<16
     jg .v_w16gt
     RET
-.v_w32:
-    lea                 t0d, [hq+(1<<16)]
-    jmp .v_w16gt
-.v_w64:
-    lea                 t0d, [hq+(3<<16)]
-    jmp .v_w16gt
-.v_w128:
-    lea                 t0d, [hq+(7<<16)]
-    jmp .v_w16gt
 .hv:
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
     ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
@@ -618,32 +624,33 @@
     movzx                wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
     WIN64_SPILL_XMM       8
     shl                mxyd, 11 ; can't shift by 12 due to signed overflow
-    mova                 m7, [base+pw_2048]
+    mova                 m7, [base+pw_15]
     movd                 m6, mxyd
     add                  wq, t0
     pshuflw              m6, m6, q0000
+    paddb                m5, m5
     punpcklqdq           m6, m6
     jmp                  wq
 .hv_w2:
     RESTORE_DSQ_32       t0
     movd                 m0, [srcq+ssq*0]
-    pshufd               m0, m0, q0000      ; src[x - src_stride]
+    punpckldq            m0, m0
     pshufb               m0, m4
     pmaddubsw            m0, m5
 .hv_w2_loop:
-    movd                 m1, [srcq+ssq*1]   ; src[x]
+    movd                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    movhps               m1, [srcq+ssq*0]   ; src[x + src_stride]
-    pshufd               m1, m1, q3120
+    movd                 m2, [srcq+ssq*0]
+    punpckldq            m1, m2
     pshufb               m1, m4
     pmaddubsw            m1, m5             ; 1 _ 2 _
     shufps               m2, m0, m1, q1032  ; 0 _ 1 _
     mova                 m0, m1
-    psubw                m1, m2   ; src[x + src_stride] - src[x]
-    paddw                m1, m1
-    pmulhw               m1, m6   ; (my * (src[x + src_stride] - src[x])
-    paddw                m1, m2   ; src[x] + (my * (src[x + src_stride] - src[x])
-    pmulhrsw             m1, m7
+    psubw                m1, m2   ; 2 * (src[x + src_stride] - src[x])
+    pmulhw               m1, m6   ; (my * (src[x + src_stride] - src[x])) >> 4
+    pavgw                m2, m7   ; src[x] + 8
+    paddw                m1, m2   ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8
+    psrlw                m1, 4
     packuswb             m1, m1
 %if ARCH_X86_64
     movq                 r6, m1
@@ -660,8 +667,8 @@
     RET
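
A note on the new rounding in the loop above: the pw_2048 pmulhrsw is gone; instead the horizontal coefficients are doubled (the paddb m5, m5 in the setup) and pavgw against pw_15 halves that doubled intermediate while folding in the +8 bias. A sketch of the identity this relies on, as I read the code:

    #include <stdint.h>

    /* pavgw computes (a + b + 1) >> 1 on unsigned words. With b == 15 and an
     * even a -- the horizontal intermediate is even because its filter
     * coefficients were doubled -- this is exactly a / 2 + 8, i.e. the
     * "src[x] + 8" term annotated on the pavgw above. */
    static uint16_t pavgw_scalar(uint16_t a, uint16_t b)
    {
        return (uint16_t)((a + b + 1) >> 1);
    }

    /* for any even a: pavgw_scalar(a, 15) == a / 2 + 8 */
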
 .hv_w4:
     mova                 m4, [base+bilin_h_shuf4]
-    RESTORE_DSQ_32       t0
     movddup             xm0, [srcq+ssq*0]
+    movifnidn           dsq, dsmp
     pshufb               m0, m4
     pmaddubsw            m0, m5
 .hv_w4_loop:
@@ -669,14 +676,14 @@
     lea                srcq, [srcq+ssq*2]
     movhps               m1, [srcq+ssq*0]
     pshufb               m1, m4
-    pmaddubsw            m1, m5           ; 1 2
+    pmaddubsw            m1, m5            ; 1 2
     shufps               m2, m0, m1, q1032 ; 0 1
     mova                 m0, m1
     psubw                m1, m2
-    paddw                m1, m1
     pmulhw               m1, m6
+    pavgw                m2, m7
     paddw                m1, m2
-    pmulhrsw             m1, m7
+    psrlw                m1, 4
     packuswb             m1, m1
     movd       [dstq+dsq*0], m1
     psrlq                m1, 32
@@ -686,28 +693,28 @@
     jg .hv_w4_loop
     RET
 .hv_w8:
-    RESTORE_DSQ_32       t0
-    movu                 m0, [srcq+ssq*0+8*0]
+    movu                 m0, [srcq+ssq*0]
+    movifnidn           dsq, dsmp
     pshufb               m0, m4
     pmaddubsw            m0, m5
 .hv_w8_loop:
-    movu                 m2, [srcq+ssq*1+8*0]
+    movu                 m2, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
     pshufb               m2, m4
     pmaddubsw            m2, m5
     psubw                m1, m2, m0
-    paddw                m1, m1
     pmulhw               m1, m6
+    pavgw                m0, m7
     paddw                m1, m0
-    movu                 m0, [srcq+ssq*0+8*0]
+    movu                 m0, [srcq+ssq*0]
     pshufb               m0, m4
     pmaddubsw            m0, m5
     psubw                m3, m0, m2
-    paddw                m3, m3
     pmulhw               m3, m6
+    pavgw                m2, m7
     paddw                m3, m2
-    pmulhrsw             m1, m7
-    pmulhrsw             m3, m7
+    psrlw                m1, 4
+    psrlw                m3, 4
     packuswb             m1, m3
     movq       [dstq+dsq*0], m1
     movhps     [dstq+dsq*1], m1
@@ -715,27 +722,34 @@
     sub                  hd, 2
     jg .hv_w8_loop
     RET
+.hv_w128:
+    lea                 r6d, [hq+(7<<16)]
+    jmp .hv_w16_start
+.hv_w64:
+    lea                 r6d, [hq+(3<<16)]
+    jmp .hv_w16_start
+.hv_w32:
+    lea                 r6d, [hq+(1<<16)]
+.hv_w16_start:
+    mov                  r4, srcq
+%if ARCH_X86_32
+    %define m8 [dstq]
+%else
+    mov                  r7, dstq
+%endif
 .hv_w16:
-    xor                 t0d, t0d
-.hv_w16gt:
-    mov                  r4, dstq
-    mov                  r6, srcq
- %if WIN64
-    movaps              r4m, xmm8
- %endif
+    movifnidn           dsq, dsmp
+%if WIN64
+    movaps              r4m, m8
+%endif
 .hv_w16_loop0:
-    movu                 m0,     [srcq+8*0]
-    movu                 m1,     [srcq+8*1]
+    movu                 m0, [srcq+8*0]
+    movu                 m1, [srcq+8*1]
     pshufb               m0, m4
     pshufb               m1, m4
     pmaddubsw            m0, m5
     pmaddubsw            m1, m5
 .hv_w16_loop:
-%if ARCH_X86_32
- %define m0tmp [dstq]
-%else
- %define m0tmp m8
-%endif
     add                srcq, ssq
     movu                 m2, [srcq+8*0]
     movu                 m3, [srcq+8*1]
@@ -743,62 +757,51 @@
     pshufb               m3, m4
     pmaddubsw            m2, m5
     pmaddubsw            m3, m5
-    mova              m0tmp, m2
+    mova                 m8, m2
     psubw                m2, m0
-    paddw                m2, m2
     pmulhw               m2, m6
+    pavgw                m0, m7
     paddw                m2, m0
     mova                 m0, m3
     psubw                m3, m1
-    paddw                m3, m3
     pmulhw               m3, m6
+    pavgw                m1, m7
     paddw                m3, m1
     mova                 m1, m0
-    mova                 m0, m0tmp
-    pmulhrsw             m2, m7
-    pmulhrsw             m3, m7
+    mova                 m0, m8
+    psrlw                m2, 4
+    psrlw                m3, 4
     packuswb             m2, m3
     mova             [dstq], m2
     add                dstq, dsmp
     dec                  hd
     jg .hv_w16_loop
-    movzx                hd, t0w
-    add                  r4, mmsize
-    add                  r6, mmsize
-    mov                dstq, r4
-    mov                srcq, r6
-    sub                 t0d, 1<<16
-    jg .hv_w16_loop0
- %if WIN64
-    movaps             xmm8, r4m
- %endif
+%if ARCH_X86_32
+    mov                dstq, dstm
+    add                  r4, 16
+    movzx                hd, r6w
+    add                dstq, 16
+    mov                srcq, r4
+    mov                dstm, dstq
+%else
+    add                  r4, 16
+    add                  r7, 16
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
+%endif
+    sub                 r6d, 1<<16
+    jg .hv_w16_loop0
+%if WIN64
+    movaps               m8, r4m
+%endif
     RET
-.hv_w32:
-    lea                 t0d, [hq+(1<<16)]
-    jmp .hv_w16gt
-.hv_w64:
-    lea                 t0d, [hq+(3<<16)]
-    jmp .hv_w16gt
-.hv_w128:
-    lea                 t0d, [hq+(7<<16)]
-    jmp .hv_w16gt
 
-%macro PSHUFB_0X1X 1-2 ; dst[, src]
- %if cpuflag(ssse3)
-    pshufb               %1, %2
- %else
-    punpcklbw            %1, %1
-    psraw                %1, 8
-    pshufd               %1, %1, q0000
- %endif
-%endmacro
-
 %macro PSHUFB_BILIN_H8 2 ; dst, src
  %if cpuflag(ssse3)
     pshufb               %1, %2
  %else
-    mova                 %2, %1
-    psrldq               %1, 1
+    psrldq               %2, %1, 1
     punpcklbw            %1, %2
  %endif
 %endmacro
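
Both branches of PSHUFB_BILIN_H8 above produce the same operand layout, as far as I can tell: overlapping source byte pairs ready for pmaddubsw against the packed {16-mx, mx} weights (the SSE2 branch builds them with a one-byte shift plus punpcklbw). A scalar model of that layout (a sketch; the helper name is mine):

    #include <stdint.h>

    /* The byte order fed to pmaddubsw by PSHUFB_BILIN_H8: {src[x], src[x+1]}
     * pairs, so each output word becomes (16-mx)*src[x] + mx*src[x+1]. */
    static void bilin_h_pairs8(const uint8_t *src, uint8_t out[16])
    {
        for (int x = 0; x < 8; x++) {
            out[2 * x + 0] = src[x];
            out[2 * x + 1] = src[x + 1];
        }
    }
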
@@ -807,8 +810,7 @@
  %if cpuflag(ssse3)
     pshufb               %1, %2
  %else
-    mova                 %2, %1
-    psrldq               %1, 1
+    psrldq               %2, %1, 1
     punpckhbw            %3, %1, %2
     punpcklbw            %1, %2
     punpcklqdq           %1, %3
@@ -845,17 +847,15 @@
 %endmacro
 
 %macro PREP_BILIN 0
-
-DECLARE_REG_TMP 3, 5, 6
 %if ARCH_X86_32
- %define base        t2-prep%+SUFFIX
+    %define base r6-prep%+SUFFIX
 %else
- %define base        0
+    %define base 0
 %endif
 
 cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     movifnidn          mxyd, r5m ; mx
-    LEA                  t2, prep%+SUFFIX
+    LEA                  r6, prep%+SUFFIX
     tzcnt                wd, wm
     movifnidn            hd, hm
     test               mxyd, mxyd
@@ -865,11 +865,12 @@
     jnz .v
 .prep:
 %if notcpuflag(ssse3)
-    add                  t2, prep_ssse3 - prep_sse2
+    add                  r6, prep_ssse3 - prep_sse2
     jmp prep_ssse3
 %else
-    movzx                wd, word [t2+wq*2+table_offset(prep,)]
-    add                  wq, t2
+    movzx                wd, word [r6+wq*2+table_offset(prep,)]
+    pxor                 m4, m4
+    add                  wq, r6
     lea            stride3q, [strideq*3]
     jmp                  wq
 .prep_w4:
@@ -877,17 +878,16 @@
     movd                 m1, [srcq+strideq*1]
     movd                 m2, [srcq+strideq*2]
     movd                 m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
     punpckldq            m0, m1
     punpckldq            m2, m3
-    lea                srcq, [srcq+strideq*4]
-    pxor                 m1, m1
-    punpcklbw            m0, m1
-    punpcklbw            m2, m1
+    punpcklbw            m0, m4
+    punpcklbw            m2, m4
     psllw                m0, 4
     psllw                m2, 4
-    mova    [tmpq+mmsize*0], m0
-    mova    [tmpq+mmsize*1], m2
-    add                tmpq, 32
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m2
+    add                tmpq, 16*2
     sub                  hd, 4
     jg .prep_w4
     RET
@@ -897,7 +897,6 @@
     movq                 m2, [srcq+strideq*2]
     movq                 m3, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
-    pxor                 m4, m4
     punpcklbw            m0, m4
     punpcklbw            m1, m4
     punpcklbw            m2, m4
@@ -915,16 +914,13 @@
     jg .prep_w8
     RET
 .prep_w16:
-    movq                 m0, [srcq+strideq*0+8*0]
-    movq                 m1, [srcq+strideq*0+8*1]
-    movq                 m2, [srcq+strideq*1+8*0]
-    movq                 m3, [srcq+strideq*1+8*1]
+    movu                 m1, [srcq+strideq*0]
+    movu                 m3, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
-    pxor                 m4, m4
-    punpcklbw            m0, m4
-    punpcklbw            m1, m4
-    punpcklbw            m2, m4
-    punpcklbw            m3, m4
+    punpcklbw            m0, m1, m4
+    punpckhbw            m1, m4
+    punpcklbw            m2, m3, m4
+    punpckhbw            m3, m4
     psllw                m0, 4
     psllw                m1, 4
     psllw                m2, 4
@@ -937,27 +933,25 @@
     sub                  hd, 2
     jg .prep_w16
     RET
-.prep_w32:
-    mov                 t2d, 1
-    jmp .prep_w32_vloop
-.prep_w64:
-    mov                 t2d, 2
-    jmp .prep_w32_vloop
 .prep_w128:
-    mov                 t2d, 4
+    mov                  r3, -128
+    jmp .prep_w32_start
+.prep_w64:
+    mov                  r3, -64
+    jmp .prep_w32_start
+.prep_w32:
+    mov                  r3, -32
+.prep_w32_start:
+    sub                srcq, r3
 .prep_w32_vloop:
-    mov                 t1q, srcq
-    mov                 r3d, t2d
+    mov                  r6, r3
 .prep_w32_hloop:
-    movq                 m0, [t1q+8*0]
-    movq                 m1, [t1q+8*1]
-    movq                 m2, [t1q+8*2]
-    movq                 m3, [t1q+8*3]
-    pxor                 m4, m4
-    punpcklbw            m0, m4
-    punpcklbw            m1, m4
-    punpcklbw            m2, m4
-    punpcklbw            m3, m4
+    movu                 m1, [srcq+r6+16*0]
+    movu                 m3, [srcq+r6+16*1]
+    punpcklbw            m0, m1, m4
+    punpckhbw            m1, m4
+    punpcklbw            m2, m3, m4
+    punpckhbw            m3, m4
     psllw                m0, 4
     psllw                m1, 4
     psllw                m2, 4
@@ -967,10 +961,9 @@
     mova        [tmpq+16*2], m2
     mova        [tmpq+16*3], m3
     add                tmpq, 16*4
-    add                 t1q, 32
-    dec                 r3d
-    jg .prep_w32_hloop
-    lea                srcq, [srcq+strideq]
+    add                  r6, 32
+    jl .prep_w32_hloop
+    add                srcq, strideq
     dec                  hd
     jg .prep_w32_vloop
     RET
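
The rewritten wide prep loop above uses a negative column offset: srcq is pre-advanced past the row and r6 counts from -width up to 0, so a single add both steps the column and sets the flags for the loop branch. A scalar sketch of the pattern (assumptions: the plain prep copy path, pixels widened and shifted left by 4 as in the code above):

    #include <stdint.h>
    #include <stddef.h>

    /* Negative-offset form of .prep_w32_vloop/.prep_w32_hloop above. */
    static void prep_copy_wide(int16_t *tmp, const uint8_t *src,
                               ptrdiff_t stride, int w, int h)
    {
        src += w;                                /* sub srcq, r3 (r3 == -w)  */
        for (int y = 0; y < h; y++) {
            for (ptrdiff_t x = -w; x < 0; x++)   /* add r6, 32 / jl in SIMD  */
                *tmp++ = (int16_t)(src[x] << 4); /* punpcklbw + psllw 4      */
            src += stride;                       /* add srcq, strideq        */
        }
    }
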
@@ -978,40 +971,31 @@
 .h:
     ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
     ; = (16 - mx) * src[x] + mx * src[x + 1]
-    imul               mxyd, 0xff01
 %if cpuflag(ssse3)
+    imul               mxyd, 0x00ff00ff
     mova                 m4, [base+bilin_h_shuf8]
+    add                mxyd, 0x00100010
+%else
+    imul               mxyd, 0xffff
+    add                mxyd, 16
 %endif
-    add                mxyd, 16 << 8
     movd                 m5, mxyd
     mov                mxyd, r6m ; my
-%if cpuflag(ssse3)
-    pshuflw              m5, m5, q0000
-    punpcklqdq           m5, m5
-%else
-    PSHUFB_0X1X          m5
-%endif
+    pshufd               m5, m5, q0000
     test               mxyd, mxyd
     jnz .hv
-%if ARCH_X86_32
-    mov                  t1, t2 ; save base reg for w4
-%endif
-    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
 %if notcpuflag(ssse3)
     WIN64_SPILL_XMM 8
     pxor                 m6, m6
 %endif
-    add                  wq, t2
-    lea            stride3q, [strideq*3]
+    add                  wq, r6
     jmp                  wq
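
The ssse3 branch of .h above packs both bilinear weights into a single 32-bit immediate: mx * 0x00ff00ff + 0x00100010 leaves the byte pair {16 - mx, mx} in each 16-bit half, which pshufd then broadcasts for pmaddubsw; the SSE2 branch builds the equivalent {16 - mx, mx} word pair. A small self-check of that packing (a sketch; the function name is mine):

    #include <stdint.h>
    #include <assert.h>

    static void check_bilin_h_coef(void)
    {
        for (uint32_t mx = 1; mx < 16; mx++) {   /* bilinear mx is 1..15 here */
            uint32_t c = mx * 0x00ff00ffu + 0x00100010u;
            assert((c & 0xff) == 16 - mx);       /* weight for src[x]         */
            assert(((c >> 8) & 0xff) == mx);     /* weight for src[x + 1]     */
            assert((c >> 16) == (c & 0xffff));   /* same pair in both halves  */
        }
    }
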
 .h_w4:
 %if cpuflag(ssse3)
- %if ARCH_X86_32
-    mova                 m4, [t1-prep_ssse3+bilin_h_shuf4]
- %else
-    mova                 m4, [bilin_h_shuf4]
- %endif
+    mova                 m4, [base+bilin_h_shuf4]
 %endif
+    lea            stride3q, [strideq*3]
 .h_w4_loop:
     movq                 m0, [srcq+strideq*0]
     movhps               m0, [srcq+strideq*1]
@@ -1029,6 +1013,8 @@
     jg .h_w4_loop
     RET
 .h_w8:
+    lea            stride3q, [strideq*3]
+.h_w8_loop:
     movu                 m0, [srcq+strideq*0]
     movu                 m1, [srcq+strideq*1]
     movu                 m2, [srcq+strideq*2]
@@ -1048,7 +1034,7 @@
     mova        [tmpq+16*3], m3
     add                tmpq, 16*4
     sub                  hd, 4
-    jg .h_w8
+    jg .h_w8_loop
     RET
 .h_w16:
     movu                 m0, [srcq+strideq*0+8*0]
@@ -1072,22 +1058,23 @@
     sub                  hd, 2
     jg .h_w16
     RET
-.h_w32:
-    mov                 t2d, 1 << 0
-    jmp .h_w32_vloop
-.h_w64:
-    mov                 t2d, 1 << 1
-    jmp .h_w32_vloop
 .h_w128:
-    mov                 t2d, 1 << 3
+    mov                  r3, -128
+    jmp .h_w32_start
+.h_w64:
+    mov                  r3, -64
+    jmp .h_w32_start
+.h_w32:
+    mov                  r3, -32
+.h_w32_start:
+    sub                srcq, r3
 .h_w32_vloop:
-    mov                 t1q, srcq
-    mov                 r3d, t2d
+    mov                  r6, r3
 .h_w32_hloop:
-    movu                 m0, [t1q+8*0]
-    movu                 m1, [t1q+8*1]
-    movu                 m2, [t1q+8*2]
-    movu                 m3, [t1q+8*3]
+    movu                 m0, [srcq+r6+8*0]
+    movu                 m1, [srcq+r6+8*1]
+    movu                 m2, [srcq+r6+8*2]
+    movu                 m3, [srcq+r6+8*3]
     PSHUFB_BILIN_H8      m0, m4
     PSHUFB_BILIN_H8      m1, m4
     PSHUFB_BILIN_H8      m2, m4
@@ -1101,11 +1088,10 @@
     mova        [tmpq+16*2], m2
     mova        [tmpq+16*3], m3
     add                tmpq, 16*4
-    add                 t1q, 32
-    shr                 r3d, 1
-    jnz .h_w32_hloop
-    lea                srcq, [srcq+strideq]
-    sub                  hd, 1
+    add                  r6, 32
+    jl .h_w32_hloop
+    add                srcq, strideq
+    dec                  hd
     jg .h_w32_vloop
     RET
 .v:
@@ -1113,19 +1099,19 @@
  %assign stack_offset stack_offset - stack_size_padded
     WIN64_SPILL_XMM 8
 %endif
-    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
-    imul               mxyd, 0xff01
-    add                mxyd, 16 << 8
-    add                  wq, t2
-    lea            stride3q, [strideq*3]
-    movd                 m5, mxyd
+    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
 %if cpuflag(ssse3)
-    pshuflw              m5, m5, q0000
-    punpcklqdq           m5, m5
+    imul               mxyd, 0x00ff00ff
+    add                mxyd, 0x00100010
 %else
-    PSHUFB_0X1X          m5
+    imul               mxyd, 0xffff
     pxor                 m6, m6
+    add                mxyd, 16
 %endif
+    add                  wq, r6
+    lea            stride3q, [strideq*3]
+    movd                 m5, mxyd
+    pshufd               m5, m5, q0000
     jmp                  wq
 .v_w4:
     movd                 m0, [srcq+strideq*0]
@@ -1134,20 +1120,18 @@
     movd                 m2, [srcq+strideq*2]
     movd                 m3, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
-    punpcklwd            m0, m1  ; 0 1 _ _
-    punpcklwd            m1, m2  ; 1 2 _ _
-    punpcklbw            m1, m0
-    PMADDUBSW            m1, m5, m6, m7, 0
-    pshufd               m1, m1, q3120
-    mova        [tmpq+16*0], m1
+    punpckldq            m0, m1
+    punpckldq            m1, m2
+    punpcklbw            m0, m1 ; 01 12
+    PMADDUBSW            m0, m5, m6, m7, 0
+    mova        [tmpq+16*0], m0
     movd                 m0, [srcq+strideq*0]
-    punpcklwd            m2, m3  ; 2 3 _ _
-    punpcklwd            m3, m0  ; 3 4 _ _
-    punpcklbw            m3, m2
-    PMADDUBSW            m3, m5, m6, m7, 0
-    pshufd               m3, m3, q3120
-    mova        [tmpq+16*1], m3
-    add                tmpq, 32
+    punpckldq            m2, m3
+    punpckldq            m3, m0
+    punpcklbw            m2, m3 ; 23 34
+    PMADDUBSW            m2, m5, m6, m7, 0
+    mova        [tmpq+16*1], m2
+    add                tmpq, 16*2
     sub                  hd, 4
     jg .v_w4_loop
     RET
@@ -1154,26 +1138,23 @@
 .v_w8:
     movq                 m0, [srcq+strideq*0]
 .v_w8_loop:
-    movq                 m1, [srcq+strideq*2]
-    movq                 m2, [srcq+strideq*1]
+    movq                 m1, [srcq+strideq*1]
+    movq                 m2, [srcq+strideq*2]
     movq                 m3, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
-    shufpd               m4, m0, m1, 0x0c       ; 0 2
+    punpcklbw            m0, m1 ; 01
+    punpcklbw            m1, m2 ; 12
+    PMADDUBSW            m0, m5, m6, m7, 0
+    PMADDUBSW            m1, m5, m6, m7, 0
+    mova        [tmpq+16*0], m0
     movq                 m0, [srcq+strideq*0]
-    shufpd               m2, m3, 0x0c           ; 1 3
-    shufpd               m1, m0, 0x0c           ; 2 4
-    punpcklbw            m3, m2, m4
+    punpcklbw            m2, m3 ; 23
+    punpcklbw            m3, m0 ; 34
+    PMADDUBSW            m2, m5, m6, m7, 0
+    mova        [tmpq+16*1], m1
     PMADDUBSW            m3, m5, m6, m7, 0
-    mova        [tmpq+16*0], m3
-    punpckhbw            m3, m2, m4
-    PMADDUBSW            m3, m5, m6, m7, 0
-    mova        [tmpq+16*2], m3
-    punpcklbw            m3, m1, m2
-    punpckhbw            m1, m2
-    PMADDUBSW            m3, m5, m6, m7, 0
-    PMADDUBSW            m1, m5, m6, m7, 0
-    mova        [tmpq+16*1], m3
-    mova        [tmpq+16*3], m1
+    mova        [tmpq+16*2], m2
+    mova        [tmpq+16*3], m3
     add                tmpq, 16*4
     sub                  hd, 4
     jg .v_w8_loop
@@ -1183,48 +1164,48 @@
 .v_w16_loop:
     movu                 m1, [srcq+strideq*1]
     movu                 m2, [srcq+strideq*2]
-    punpcklbw            m3, m1, m0
-    punpckhbw            m4, m1, m0
-    PMADDUBSW            m3, m5, m6, m7, 0
-    PMADDUBSW            m4, m5, m6, m7, 0
-    mova        [tmpq+16*0], m3
-    mova        [tmpq+16*1], m4
-    punpcklbw            m3, m2, m1
-    punpckhbw            m4, m2, m1
-    PMADDUBSW            m3, m5, m6, m7, 0
-    PMADDUBSW            m4, m5, m6, m7, 0
-    mova        [tmpq+16*2], m3
-    mova        [tmpq+16*3], m4
     movu                 m3, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
+    punpcklbw            m4, m0, m1
+    punpckhbw            m0, m1
+    PMADDUBSW            m4, m5, m6, m7, 0
+    PMADDUBSW            m0, m5, m6, m7, 0
+    mova        [tmpq+16*0], m4
+    punpcklbw            m4, m1, m2
+    punpckhbw            m1, m2
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq+16*1], m0
     movu                 m0, [srcq+strideq*0]
-    add                tmpq, 16*8
-    punpcklbw            m1, m3, m2
-    punpckhbw            m4, m3, m2
     PMADDUBSW            m1, m5, m6, m7, 0
+    mova        [tmpq+16*2], m4
+    punpcklbw            m4, m2, m3
+    punpckhbw            m2, m3
     PMADDUBSW            m4, m5, m6, m7, 0
-    mova        [tmpq-16*4], m1
-    mova        [tmpq-16*3], m4
-    punpcklbw            m1, m0, m3
-    punpckhbw            m2, m0, m3
-    PMADDUBSW            m1, m5, m6, m7, 0
+    mova        [tmpq+16*3], m1
     PMADDUBSW            m2, m5, m6, m7, 0
-    mova        [tmpq-16*2], m1
-    mova        [tmpq-16*1], m2
+    mova        [tmpq+16*4], m4
+    punpcklbw            m4, m3, m0
+    punpckhbw            m3, m0
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq+16*5], m2
+    PMADDUBSW            m3, m5, m6, m7, 0
+    mova        [tmpq+16*6], m4
+    mova        [tmpq+16*7], m3
+    add                tmpq, 16*8
     sub                  hd, 4
     jg .v_w16_loop
     RET
-.v_w32:
-    lea                 t2d, [hq+(0<<16)]
-    mov                 t0d, 64
+.v_w128:
+    lea                 r3d, [hq+(3<<8)]
+    mov                 r6d, 256
     jmp .v_w32_start
 .v_w64:
-    lea                 t2d, [hq+(1<<16)]
-    mov                 t0d, 128
+    lea                 r3d, [hq+(1<<8)]
+    mov                 r6d, 128
     jmp .v_w32_start
-.v_w128:
-    lea                 t2d, [hq+(3<<16)]
-    mov                 t0d, 256
+.v_w32:
+    xor                 r3d, r3d
+    mov                 r6d, 64
 .v_w32_start:
 %if ARCH_X86_64
  %if WIN64
@@ -1232,7 +1213,7 @@
  %endif
     mov                  r7, tmpq
 %endif
-    mov                  t1, srcq
+    mov                  r5, srcq
 .v_w32_hloop:
     movu                 m0, [srcq+strideq*0+16*0]
     movu                 m1, [srcq+strideq*0+16*1]
@@ -1240,48 +1221,48 @@
     movu                 m2, [srcq+strideq*1+16*0]
     movu                 m3, [srcq+strideq*1+16*1]
     lea                srcq, [srcq+strideq*2]
-    punpcklbw            m4, m2, m0
+    punpcklbw            m4, m0, m2
+    punpckhbw            m0, m2
     PMADDUBSW            m4, m5, m6, m7, 0
+    PMADDUBSW            m0, m5, m6, m7, 0
     mova        [tmpq+16*0], m4
-    punpckhbw            m4, m2, m0
+    mova        [tmpq+16*1], m0
+    movu                 m0, [srcq+strideq*0+16*0]
+    punpcklbw            m4, m1, m3
+    punpckhbw            m1, m3
     PMADDUBSW            m4, m5, m6, m7, 0
-    mova        [tmpq+16*1], m4
-    punpcklbw            m4, m3, m1
-    PMADDUBSW            m4, m5, m6, m7, 0
+    PMADDUBSW            m1, m5, m6, m7, 0
     mova        [tmpq+16*2], m4
-    punpckhbw            m4, m3, m1
-    PMADDUBSW            m4, m5, m6, m7, 0
-    mova        [tmpq+16*3], m4
-    add                tmpq, t0q
-    movu                 m0, [srcq+strideq*0+16*0]
+    mova        [tmpq+16*3], m1
     movu                 m1, [srcq+strideq*0+16*1]
-    punpcklbw            m4, m0, m2
+    add                tmpq, r6
+    punpcklbw            m4, m2, m0
+    punpckhbw            m2, m0
     PMADDUBSW            m4, m5, m6, m7, 0
+    PMADDUBSW            m2, m5, m6, m7, 0
     mova        [tmpq+16*0], m4
-    punpckhbw            m4, m0, m2
+    mova        [tmpq+16*1], m2
+    punpcklbw            m4, m3, m1
+    punpckhbw            m3, m1
     PMADDUBSW            m4, m5, m6, m7, 0
-    mova        [tmpq+16*1], m4
-    punpcklbw            m4, m1, m3
-    PMADDUBSW            m4, m5, m6, m7, 0
+    PMADDUBSW            m3, m5, m6, m7, 0
     mova        [tmpq+16*2], m4
-    punpckhbw            m4, m1, m3
-    PMADDUBSW            m4, m5, m6, m7, 0
-    mova        [tmpq+16*3], m4
-    add                tmpq, t0q
+    mova        [tmpq+16*3], m3
+    add                tmpq, r6
     sub                  hd, 2
     jg .v_w32_vloop
-    movzx                hd, t2w
-    add                  t1, 32
-    mov                srcq, t1
+    add                  r5, 32
+    movzx                hd, r3b
+    mov                srcq, r5
 %if ARCH_X86_64
-    add                  r7, 2*16*2
+    add                  r7, 16*4
     mov                tmpq, r7
 %else
     mov                tmpq, tmpmp
-    add                tmpq, 2*16*2
+    add                tmpq, 16*4
     mov               tmpmp, tmpq
 %endif
-    sub                 t2d, 1<<16
+    sub                 r3d, 1<<8
     jg .v_w32_hloop
 %if WIN64
     POP                  r7
@@ -1290,71 +1271,56 @@
 .hv:
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
     ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
+    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
 %assign stack_offset stack_offset - stack_size_padded
 %if cpuflag(ssse3)
+    imul               mxyd, 0x08000800
     WIN64_SPILL_XMM 8
 %else
-    WIN64_SPILL_XMM 10
-%endif
-    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
-%if cpuflag(ssse3)
-    shl                mxyd, 11
-%else
+    or                 mxyd, 1<<16
+    WIN64_SPILL_XMM 9
  %if ARCH_X86_64
-    mova                 m8, [pw_8]
+    mova                 m8, [base+pw_8]
  %else
-  %define m8 [t1-prep_sse2+pw_8]
+  %define                m8  [base+pw_8]
  %endif
     pxor                 m7, m7
 %endif
     movd                 m6, mxyd
-    add                  wq, t2
-    pshuflw              m6, m6, q0000
-%if cpuflag(ssse3)
-    punpcklqdq           m6, m6
-%elif ARCH_X86_64
-    psrlw                m0, m8, 3
-    punpcklwd            m6, m0
-%else
-    punpcklwd            m6, [base+pw_1]
-%endif
-%if ARCH_X86_32
-    mov                  t1, t2 ; save base reg for w4
-%endif
-    lea            stride3q, [strideq*3]
+    add                  wq, r6
+    pshufd               m6, m6, q0000
     jmp                  wq
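
For the .hv setup above: the identity in the two comment lines at the top of .hv is exact for floor shifts, and the 0x08000800 multiply broadcasts my << 11 into both halves so that, on the ssse3 path, pmulhrsw applied to the row difference yields the ((my * diff) + 8) >> 4 term directly. A scalar sketch of my reading, where the inputs are the horizontally filtered, 4-bit-upshifted intermediates:

    /* prep .hv in the rewritten form from the comments above; with floor
     * shifts it equals (16 * cur + my * diff + 8) >> 4 bit-exactly. */
    static int prep_bilin_hv(int cur, int below, int my)
    {
        int diff = below - cur;              /* src[x + src_stride] - src[x]   */
        return cur + ((my * diff + 8) >> 4); /* pmulhrsw(diff, my << 11) + cur */
    }
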
 .hv_w4:
 %if cpuflag(ssse3)
- %if ARCH_X86_32
-    mova                 m4, [t1-prep_ssse3+bilin_h_shuf4]
- %else
-    mova                 m4, [bilin_h_shuf4]
- %endif
-%endif
+    mova                 m4, [base+bilin_h_shuf4]
+    movddup              m0, [srcq+strideq*0]
+%else
     movhps               m0, [srcq+strideq*0]
+%endif
+    lea                  r3, [strideq*3]
     PSHUFB_BILIN_H4      m0, m4, m3
     PMADDUBSW            m0, m5, m7, m4, 0 ; _ 0
 .hv_w4_loop:
     movq                 m1, [srcq+strideq*1]
     movhps               m1, [srcq+strideq*2]
-    movq                 m2, [srcq+stride3q ]
+    movq                 m2, [srcq+r3       ]
     lea                srcq, [srcq+strideq*4]
     movhps               m2, [srcq+strideq*0]
     PSHUFB_BILIN_H4      m1, m4, m3
     PSHUFB_BILIN_H4      m2, m4, m3
     PMADDUBSW            m1, m5, m7, m4, 0 ; 1 2
-    shufpd               m3, m0, m1, 0x01  ; 0 1
-    mova                 m0, m2
-    PMADDUBSW            m0, m5, m7, m4, 0 ; 3 4
-    shufpd               m2, m1, m0, 0x01  ; 2 3
-    psubw                m1, m3
+    PMADDUBSW            m2, m5, m7, m4, 0 ; 3 4
+    shufpd               m0, m1, 0x01      ; 0 1
+    shufpd               m3, m1, m2, 0x01  ; 2 3
+    psubw                m1, m0
     PMULHRSW             m1, m6, m4, m8, 4
-    paddw                m1, m3
-    psubw                m3, m0, m2
-    PMULHRSW             m3, m6, m4, m8, 4
-    paddw                m3, m2
+    paddw                m1, m0
+    mova                 m0, m2
+    psubw                m2, m3
+    PMULHRSW             m2, m6, m4, m8, 4
+    paddw                m2, m3
     mova        [tmpq+16*0], m1
-    mova        [tmpq+16*1], m3
+    mova        [tmpq+16*1], m2
     add                tmpq, 32
     sub                  hd, 4
     jg .hv_w4_loop
@@ -1365,7 +1331,8 @@
     PMADDUBSW            m0, m5, m7, m4, 0 ; 0
 .hv_w8_loop:
     movu                 m1, [srcq+strideq*1]
-    movu                 m2, [srcq+strideq*2]
+    lea                srcq, [srcq+strideq*2]
+    movu                 m2, [srcq+strideq*0]
     PSHUFB_BILIN_H8      m1, m4
     PSHUFB_BILIN_H8      m2, m4
     PMADDUBSW            m1, m5, m7, m4, 0 ; 1
@@ -1373,68 +1340,40 @@
     psubw                m3, m1, m0
     PMULHRSW             m3, m6, m4, m8, 4
     paddw                m3, m0
-%if notcpuflag(ssse3) && ARCH_X86_64
-    SWAP                 m9, m7
-%endif
-    psubw                m7, m2, m1
-    PMULHRSW             m7, m6, m4, m8, 4
-    paddw                m7, m1
+    mova                 m0, m2
+    psubw                m2, m1
+    PMULHRSW             m2, m6, m4, m8, 4
+    paddw                m2, m1
     mova        [tmpq+16*0], m3
-    mova        [tmpq+16*1], m7
-%if notcpuflag(ssse3) && ARCH_X86_64
-    SWAP                 m7, m9
-%endif
-    movu                 m1, [srcq+stride3q ]
-    lea                srcq, [srcq+strideq*4]
-    movu                 m0, [srcq+strideq*0]
-    PSHUFB_BILIN_H8      m1, m4
-    PSHUFB_BILIN_H8      m0, m4
-    PMADDUBSW            m1, m5, m7, m4, ARCH_X86_32 ; 3
-    PMADDUBSW            m0, m5, m7, m4, 0           ; 4
-    psubw                m3, m1, m2
-    PMULHRSW             m3, m6, m4, m8, 4
-    paddw                m3, m2
-%if notcpuflag(ssse3) && ARCH_X86_64
-    SWAP                 m9, m7
-%endif
-    psubw                m7, m0, m1
-    PMULHRSW             m7, m6, m4, m8, 4
-    paddw                m7, m1
-    mova        [tmpq+16*2], m3
-    mova        [tmpq+16*3], m7
-%if notcpuflag(ssse3)
- %if ARCH_X86_64
-    SWAP                 m7, m9
- %else
-    pxor                 m7, m7
- %endif
-%endif
-    add                tmpq, 16*4
-    sub                  hd, 4
+    mova        [tmpq+16*1], m2
+    add                tmpq, 16*2
+    sub                  hd, 2
     jg .hv_w8_loop
     RET
-.hv_w16:
-    mov                 t2d, hd
-    mov                 t0d, 32
+.hv_w128:
+    lea                 r3d, [hq+(7<<8)]
+    mov                 r5d, 256
     jmp .hv_w16_start
-.hv_w32:
-    lea                 t2d, [hq+(1<<16)]
-    mov                 t0d, 64
-    jmp .hv_w16_start
 .hv_w64:
-    lea                 t2d, [hq+(3<<16)]
-    mov                 t0d, 128
+    lea                 r3d, [hq+(3<<8)]
+    mov                 r5d, 128
     jmp .hv_w16_start
-.hv_w128:
-    lea                 t2d, [hq+(7<<16)]
-    mov                 t0d, 256
+.hv_w32:
+    lea                 r3d, [hq+(1<<8)]
+    mov                 r5d, 64
+    jmp .hv_w16_start
+.hv_w16:
+    xor                 r3d, r3d
+    mov                 r5d, 32
 .hv_w16_start:
+%if ARCH_X86_64 || cpuflag(ssse3)
+    mov                  r6, srcq
+%endif
 %if ARCH_X86_64
  %if WIN64
     PUSH                 r7
  %endif
     mov                  r7, tmpq
-    mov                  r5, srcq
 %endif
 .hv_w16_hloop:
     movu                 m0, [srcq+strideq*0+8*0]
@@ -1459,7 +1398,7 @@
     PMULHRSW             m0, m6, m4, m8, 4
     paddw                m0, m1
     mova        [tmpq+16*1], m0
-    add                tmpq, t0q
+    add                tmpq, r5
     movu                 m0, [srcq+strideq*0+8*0]
     PSHUFB_BILIN_H8      m0, m4
     PMADDUBSW            m0, m5, m7, m4, 0 ; 2a
@@ -1474,24 +1413,30 @@
     PMULHRSW             m2, m6, m4, m8, 4
     paddw                m2, m3
     mova        [tmpq+16*1], m2
-    add                tmpq, t0q
+    add                tmpq, r5
     sub                  hd, 2
     jg .hv_w16_vloop
-    movzx                hd, t2w
+    movzx                hd, r3b
 %if ARCH_X86_64
-    add                  r5, 16
+    add                  r6, 16
     add                  r7, 2*16
-    mov                srcq, r5
+    mov                srcq, r6
     mov                tmpq, r7
+%elif cpuflag(ssse3)
+    mov                tmpq, tmpm
+    add                  r6, 16
+    add                tmpq, 2*16
+    mov                srcq, r6
+    mov                tmpm, tmpq
 %else
-    mov                srcq, srcmp
-    mov                tmpq, tmpmp
+    mov                srcq, srcm
+    mov                tmpq, tmpm
     add                srcq, 16
     add                tmpq, 2*16
-    mov               srcmp, srcq
-    mov               tmpmp, tmpq
+    mov                srcm, srcq
+    mov                tmpm, tmpq
 %endif
-    sub                 t2d, 1<<16
+    sub                 r3d, 1<<8
     jg .hv_w16_hloop
 %if WIN64
     POP                  r7
@@ -1538,13 +1483,9 @@
 %if ARCH_X86_32
  %define base_reg r1
  %define base base_reg-put_ssse3
- %define W32_RESTORE_DSQ mov dsq, dsm
- %define W32_RESTORE_SSQ mov ssq, ssm
 %else
  %define base_reg r8
  %define base 0
- %define W32_RESTORE_DSQ
- %define W32_RESTORE_SSQ
 %endif
 
 cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
@@ -1575,10 +1516,9 @@
     add                  wq, base_reg
 ; put_bilin mangling jump
 %assign stack_offset org_stack_offset
-%if ARCH_X86_32
-    mov                 dsq, dsm
-    mov                 ssq, ssm
-%elif WIN64
+    movifnidn           dsq, dsmp
+    movifnidn           ssq, ssmp
+%if WIN64
     pop                  r8
 %endif
     lea                  r6, [ssq*3]
@@ -1590,7 +1530,7 @@
     test                myd, 0xf00
 %endif
     jnz .hv
-    W32_RESTORE_SSQ
+    movifnidn           ssq, ssmp
     WIN64_SPILL_XMM      12
     cmp                  wd, 4
     jl .h_w2
@@ -1604,11 +1544,10 @@
     shr                 mxd, 16
     sub                srcq, 3
     movzx                wd, word [base_reg+wq*2+table_offset(put, _8tap_h)]
-    movd                 m5, [base_reg+mxq*8+subpel_filters-put_ssse3+0]
-    pshufd               m5, m5, q0000
-    movd                 m6, [base_reg+mxq*8+subpel_filters-put_ssse3+4]
-    pshufd               m6, m6, q0000
+    movq                 m6, [base_reg+mxq*8+subpel_filters-put_ssse3]
     mova                 m7, [base+pw_34] ; 2 + (8 << 2)
+    pshufd               m5, m6, q0000
+    pshufd               m6, m6, q1111
     add                  wq, base_reg
     jmp                  wq
 .h_w2:
@@ -1620,9 +1559,9 @@
     dec                srcq
     mova                 m4, [base+subpel_h_shuf4]
     movd                 m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
-    pshufd               m3, m3, q0000
     mova                 m5, [base+pw_34] ; 2 + (8 << 2)
-    W32_RESTORE_DSQ
+    pshufd               m3, m3, q0000
+    movifnidn           dsq, dsmp
 .h_w2_loop:
     movq                 m0, [srcq+ssq*0]
     movhps               m0, [srcq+ssq*1]
@@ -1633,10 +1572,10 @@
     paddw                m0, m5 ; pw34
     psraw                m0, 6
     packuswb             m0, m0
-    movd                r4d, m0
-    mov        [dstq+dsq*0], r4w
-    shr                 r4d, 16
-    mov        [dstq+dsq*1], r4w
+    movd                r6d, m0
+    mov        [dstq+dsq*0], r6w
+    shr                 r6d, 16
+    mov        [dstq+dsq*1], r6w
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .h_w2_loop
@@ -1649,10 +1588,10 @@
 %endif
     dec                srcq
     movd                 m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
-    pshufd               m3, m3, q0000
-    mova                 m5, [base+pw_34] ; 2 + (8 << 2)
     mova                 m6, [base+subpel_h_shufA]
-    W32_RESTORE_DSQ
+    mova                 m5, [base+pw_34] ; 2 + (8 << 2)
+    pshufd               m3, m3, q0000
+    movifnidn           dsq, dsmp
 .h_w4_loop:
     movq                 m0, [srcq+ssq*0] ; 1
     movq                 m1, [srcq+ssq*1] ; 2
@@ -1672,7 +1611,6 @@
     sub                  hd, 2
     jg .h_w4_loop
     RET
-    ;
 %macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
  %if ARCH_X86_32
     pshufb              %2, %1, [base+subpel_h_shufB]
@@ -1693,18 +1631,17 @@
     paddw               %1, m7     ; pw34
     psraw               %1, 6
 %endmacro
-    ;
 .h_w8:
-    movu                 m0,     [srcq+ssq*0]
-    movu                 m1,     [srcq+ssq*1]
-    PUT_8TAP_H           m0, m2, m3, m4
+    movu                 m0, [srcq+ssq*0]
+    movu                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
+    PUT_8TAP_H           m0, m2, m3, m4
     PUT_8TAP_H           m1, m2, m3, m4
     packuswb             m0, m1
 %if ARCH_X86_32
-    movq       [dstq      ], m0
+    movq             [dstq], m0
     add                dstq, dsm
-    movhps     [dstq      ], m0
+    movhps           [dstq], m0
     add                dstq, dsm
 %else
     movq       [dstq+dsq*0], m0
@@ -1714,22 +1651,23 @@
     sub                  hd, 2
     jg .h_w8
     RET
-.h_w16:
-    xor                 r6d, r6d
-    jmp .h_start
-.h_w32:
-    mov                  r6, -16*1
-    jmp .h_start
-.h_w64:
-    mov                  r6, -16*3
-    jmp .h_start
 .h_w128:
-    mov                  r6, -16*7
-.h_start:
-    sub                srcq, r6
-    sub                dstq, r6
-    mov                  r4, r6
-.h_loop:
+    mov                  r4, -16*7
+    jmp .h_w16_start
+.h_w64:
+    mov                  r4, -16*3
+    jmp .h_w16_start
+.h_w32:
+    mov                  r4, -16*1
+    jmp .h_w16_start
+.h_w16:
+    xor                 r4d, r4d
+.h_w16_start:
+    sub                srcq, r4
+    sub                dstq, r4
+.h_w16_loop_v:
+    mov                  r6, r4
+.h_w16_loop_h:
     movu                 m0, [srcq+r6+8*0]
     movu                 m1, [srcq+r6+8*1]
     PUT_8TAP_H           m0, m2, m3, m4
@@ -1736,17 +1674,12 @@
     PUT_8TAP_H           m1, m2, m3, m4
     packuswb             m0, m1
     mova          [dstq+r6], m0
-    add                  r6, mmsize
-    jle .h_loop
+    add                  r6, 16
+    jle .h_w16_loop_h
     add                srcq, ssq
-%if ARCH_X86_32
-    add                dstq, dsm
-%else
-    add                dstq, dsq
-%endif
-    mov                  r6, r4
+    add                dstq, dsmp
     dec                  hd
-    jg .h_loop
+    jg .h_w16_loop_v
     RET
 .v:
 %if ARCH_X86_32
@@ -1754,7 +1687,7 @@
     shr                 ssd, 16
     cmp                  hd, 6
     cmovs               ssd, mxd
-    lea                 ssq, [base_reg+ssq*8+subpel_filters-put_ssse3]
+    movq                 m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
 %else
  %assign stack_offset org_stack_offset
     WIN64_SPILL_XMM      16
@@ -1762,12 +1695,12 @@
     shr                 myd, 16
     cmp                  hd, 6
     cmovs               myd, mxd
-    lea                 myq, [base_reg+myq*8+subpel_filters-put_ssse3]
+    movq                 m0, [base_reg+myq*8+subpel_filters-put_ssse3]
 %endif
     tzcnt               r6d, wd
     movzx               r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)]
+    punpcklwd            m0, m0
     mova                 m7, [base+pw_512]
-    psrlw                m2, m7, 1 ; 0x0100
     add                  r6, base_reg
 %if ARCH_X86_32
  %define            subpel0  [rsp+mmsize*0]
@@ -1775,20 +1708,16 @@
  %define            subpel2  [rsp+mmsize*2]
  %define            subpel3  [rsp+mmsize*3]
 %assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
-    ALLOC_STACK   -mmsize*4
+    ALLOC_STACK       -16*4
 %assign regs_used 7
-    movd                 m0, [ssq+0]
-    pshufb               m0, m2
-    mova            subpel0, m0
-    movd                 m0, [ssq+2]
-    pshufb               m0, m2
-    mova            subpel1, m0
-    movd                 m0, [ssq+4]
-    pshufb               m0, m2
-    mova            subpel2, m0
-    movd                 m0, [ssq+6]
-    pshufb               m0, m2
-    mova            subpel3, m0
+    pshufd               m1, m0, q0000
+    mova            subpel0, m1
+    pshufd               m1, m0, q1111
+    mova            subpel1, m1
+    pshufd               m1, m0, q2222
+    mova            subpel2, m1
+    pshufd               m1, m0, q3333
+    mova            subpel3, m1
     mov                 ssq, [rstk+stack_offset+gprsize*4]
     lea                 ssq, [ssq*3]
     sub                srcq, ssq
@@ -1799,47 +1728,46 @@
  %define            subpel1  m9
  %define            subpel2  m10
  %define            subpel3  m11
-    movd            subpel0, [myq+0]
-    pshufb          subpel0, m2
-    movd            subpel1, [myq+2]
-    pshufb          subpel1, m2
-    movd            subpel2, [myq+4]
-    pshufb          subpel2, m2
-    movd            subpel3, [myq+6]
-    pshufb          subpel3, m2
     lea                ss3q, [ssq*3]
+    pshufd               m8, m0, q0000
     sub                srcq, ss3q
+    pshufd               m9, m0, q1111
+    pshufd              m10, m0, q2222
+    pshufd              m11, m0, q3333
 %endif
     jmp                  r6
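
The vertical filter setup above now loads all eight taps with one movq: punpcklwd m0, m0 duplicates each adjacent tap pair, and the four pshufd broadcasts give subpel0..subpel3, each holding one {f[2n], f[2n+1]} pair across every lane for pmaddubsw against interleaved row pairs. A scalar model of that layout (a sketch; the helper name is mine):

    #include <stdint.h>

    static void broadcast_tap_pairs(const int8_t f[8], int8_t subpel[4][16])
    {
        for (int n = 0; n < 4; n++)          /* subpel0 .. subpel3 */
            for (int i = 0; i < 16; i += 2) {
                subpel[n][i + 0] = f[2 * n + 0];
                subpel[n][i + 1] = f[2 * n + 1];
            }
    }
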
 .v_w2:
-    movd                 m2, [srcq+ssq*0]    ; 0
-    pinsrw               m2, [srcq+ssq*1], 2 ; 0 1
-    pinsrw               m2, [srcq+ssq*2], 4 ; 0 1 2
+    movd                 m1, [srcq+ssq*0]
+    movd                 m0, [srcq+ssq*1]
 %if ARCH_X86_32
     lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
-    pinsrw               m2, [srcq+ssq*0], 6 ; 0 1 2 3
-    add                srcq, ssq
-%else
-    pinsrw               m2, [srcq+ss3q ], 6 ; 0 1 2 3
-    lea                srcq, [srcq+ssq*4]
-%endif
-    movd                 m3, [srcq+ssq*0]    ; 4
-    movd                 m1, [srcq+ssq*1]    ; 5
-    movd                 m0, [srcq+ssq*2]    ; 6
-%if ARCH_X86_32
+    movd                 m2, [srcq+ssq*0]
+    movd                 m5, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
+    movd                 m3, [srcq+ssq*0]
+    movd                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
 %else
+    movd                 m2, [srcq+ssq*2]
     add                srcq, ss3q
+    movd                 m5, [srcq+ssq*0]
+    movd                 m3, [srcq+ssq*1]
+    movd                 m4, [srcq+ssq*2]
+    add                srcq, ss3q
 %endif
-    punpckldq            m3, m1              ; 4 5 _ _
-    punpckldq            m1, m0              ; 5 6 _ _
-    palignr              m4, m3, m2, 4       ; 1 2 3 4
-    punpcklbw            m3, m1              ; 45 56
-    punpcklbw            m1, m2, m4          ; 01 12
-    punpckhbw            m2, m4              ; 23 34
+    punpcklwd            m1, m0           ; 0 1
+    punpcklwd            m0, m2           ; 1 2
+    punpcklbw            m1, m0           ; 01 12
+    movd                 m0, [srcq+ssq*0]
+    punpcklwd            m2, m5           ; 2 3
+    punpcklwd            m5, m3           ; 3 4
+    punpcklwd            m3, m4           ; 4 5
+    punpcklwd            m4, m0           ; 5 6
+    punpcklbw            m2, m5           ; 23 34
+    punpcklbw            m3, m4           ; 45 56
 .v_w2_loop:
+    movd                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
     pmaddubsw            m5, m1, subpel0     ; a0 b0
     mova                 m1, m2
     pmaddubsw            m2, subpel1         ; a1 b1
@@ -1847,17 +1775,14 @@
     mova                 m2, m3
     pmaddubsw            m3, subpel2         ; a2 b2
     paddw                m5, m3
-    movd                 m4, [srcq+ssq*0]    ; 7
-    punpckldq            m3, m0, m4          ; 6 7 _ _
-    movd                 m0, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
-    punpckldq            m4, m0              ; 7 8 _ _
+    punpcklwd            m3, m0, m4          ; 6 7
+    movd                 m0, [srcq+ssq*0]
+    punpcklwd            m4, m0              ; 7 8
     punpcklbw            m3, m4              ; 67 78
     pmaddubsw            m4, m3, subpel3     ; a3 b3
     paddw                m5, m4
     pmulhrsw             m5, m7
     packuswb             m5, m5
-    pshuflw              m5, m5, q2020
     movd                r6d, m5
     mov        [dstq+dsq*0], r6w
     shr                 r6d, 16
@@ -1873,51 +1798,46 @@
 .v_w32:
 .v_w64:
 .v_w128:
-%endif ; ARCH_X86_32
-    lea                 r6d, [wq - 4] ; horizontal loop
-    mov                  r4, dstq
-%if ARCH_X86_32
-%if STACK_ALIGNMENT < mmsize
- %define               srcm [rsp+mmsize*4+gprsize]
+    shl                  wd, 14
+%if STACK_ALIGNMENT < 16
+ %define               dstm [rsp+mmsize*4+gprsize]
+    mov                dstm, dstq
 %endif
-    mov                srcm, srcq
-%else
-    mov                  r7, srcq
-%endif
-    shl                 r6d, (16 - 2)  ; (wq / 4) << 16
-    mov                 r6w, hw
+    lea                 r6d, [hq+wq-(1<<16)]
+    mov                  r4, srcq
 .v_w4_loop0:
-    movd                 m2, [srcq+ssq*0] ; 0
-    movhps               m2, [srcq+ssq*2] ; 0 _ 2
-    movd                 m3, [srcq+ssq*1] ; 1
-%if ARCH_X86_32
-    lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
-    movhps               m3, [srcq+ssq*0] ; 1 _ 3
-    lea                srcq, [srcq+ssq*1]
-%else
-    movhps               m3, [srcq+ss3q ] ; 1 _ 3
-    lea                srcq, [srcq+ssq*4]
 %endif
-    pshufd               m2, m2, q2020    ; 0 2 0 2
-    pshufd               m3, m3, q2020    ; 1 3 1 3
-    punpckldq            m2, m3           ; 0 1 2 3
-    movd                 m3, [srcq+ssq*0] ; 4
-    movd                 m1, [srcq+ssq*1] ; 5
-    movd                 m0, [srcq+ssq*2] ; 6
+    movd                 m1, [srcq+ssq*0]
+    movd                 m0, [srcq+ssq*1]
 %if ARCH_X86_32
     lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
+    movd                 m2, [srcq+ssq*0]
+    movd                 m5, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movd                 m3, [srcq+ssq*0]
+    movd                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
 %else
+    movd                 m2, [srcq+ssq*2]
     add                srcq, ss3q
+    movd                 m5, [srcq+ssq*0]
+    movd                 m3, [srcq+ssq*1]
+    movd                 m4, [srcq+ssq*2]
+    add                srcq, ss3q
 %endif
-    punpckldq            m3, m1           ; 4 5 _ _
-    punpckldq            m1, m0           ; 5 6 _ _
-    palignr              m4, m3, m2, 4    ; 1 2 3 4
-    punpcklbw            m3, m1           ; 45 56
-    punpcklbw            m1, m2, m4       ; 01 12
-    punpckhbw            m2, m4           ; 23 34
+    punpckldq            m1, m0           ; 0 1
+    punpckldq            m0, m2           ; 1 2
+    punpcklbw            m1, m0           ; 01 12
+    movd                 m0, [srcq+ssq*0]
+    punpckldq            m2, m5           ; 2 3
+    punpckldq            m5, m3           ; 3 4
+    punpckldq            m3, m4           ; 4 5
+    punpckldq            m4, m0           ; 5 6
+    punpcklbw            m2, m5           ; 23 34
+    punpcklbw            m3, m4           ; 45 56
 .v_w4_loop:
+    movd                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
     pmaddubsw            m5, m1, subpel0  ; a0 b0
     mova                 m1, m2
     pmaddubsw            m2, subpel1      ; a1 b1
@@ -1925,10 +1845,8 @@
     mova                 m2, m3
     pmaddubsw            m3, subpel2      ; a2 b2
     paddw                m5, m3
-    movd                 m4, [srcq+ssq*0]
     punpckldq            m3, m0, m4       ; 6 7 _ _
-    movd                 m0, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
+    movd                 m0, [srcq+ssq*0]
     punpckldq            m4, m0           ; 7 8 _ _
     punpcklbw            m3, m4           ; 67 78
     pmaddubsw            m4, m3, subpel3  ; a3 b3
@@ -1936,24 +1854,21 @@
     pmulhrsw             m5, m7
     packuswb             m5, m5
     movd       [dstq+dsq*0], m5
-    pshufd               m5, m5, q0101
+    psrlq                m5, 32
     movd       [dstq+dsq*1], m5
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .v_w4_loop
-    mov                  hw, r6w ; reset vertical loop
-    add                  r4, 4
-    mov                dstq, r4
 %if ARCH_X86_32
-    mov                srcq, srcm
-    add                srcq, 4
-    mov                srcm, srcq
-%else
-    add                  r7, 4
-    mov                srcq, r7
-%endif
-    sub                 r6d, 1<<16 ; horizontal--
+    mov                dstq, dstm
+    add                  r4, 4
+    movzx                hd, r6w
+    add                dstq, 4
+    mov                srcq, r4
+    mov                dstm, dstq
+    sub                 r6d, 1<<16
     jg .v_w4_loop0
+%endif
     RET
 %if ARCH_X86_64
 .v_w8:
@@ -1961,56 +1876,51 @@
 .v_w32:
 .v_w64:
 .v_w128:
-    lea                 r6d, [wq - 8] ; horizontal loop
-    mov                  r4, dstq
-    mov                  r7, srcq
-    shl                 r6d, 8 - 3; (wq / 8) << 8
-    mov                 r6b, hb
+    lea                 r6d, [wq*8-64]
+    mov                  r4, srcq
+    mov                  r7, dstq
+    lea                 r6d, [hq+r6*4]
 .v_w8_loop0:
-    movq                 m4, [srcq+ssq*0]   ; 0
-    movq                 m5, [srcq+ssq*1]   ; 1
-    lea                srcq, [srcq+ssq*2]
-    movq                 m6, [srcq+ssq*0]   ; 2
-    movq                 m0, [srcq+ssq*1]   ; 3
-    lea                srcq, [srcq+ssq*2]
-    movq                 m1, [srcq+ssq*0]   ; 4
-    movq                 m2, [srcq+ssq*1]   ; 5
-    lea                srcq, [srcq+ssq*2]   ;
-    movq                 m3, [srcq+ssq*0]   ; 6
-    shufpd               m4, m0, 0x0c
-    shufpd               m5, m1, 0x0c
-    punpcklbw            m1, m4, m5 ; 01
-    punpckhbw            m4, m5     ; 34
-    shufpd               m6, m2, 0x0c
-    punpcklbw            m2, m5, m6 ; 12
-    punpckhbw            m5, m6     ; 45
-    shufpd               m0, m3, 0x0c
-    punpcklbw            m3, m6, m0 ; 23
-    punpckhbw            m6, m0     ; 56
+    movq                 m1, [srcq+ssq*0]
+    movq                 m2, [srcq+ssq*1]
+    movq                 m3, [srcq+ssq*2]
+    add                srcq, ss3q
+    movq                 m4, [srcq+ssq*0]
+    movq                 m5, [srcq+ssq*1]
+    movq                 m6, [srcq+ssq*2]
+    add                srcq, ss3q
+    movq                 m0, [srcq+ssq*0]
+    punpcklbw            m1, m2 ; 01
+    punpcklbw            m2, m3 ; 12
+    punpcklbw            m3, m4 ; 23
+    punpcklbw            m4, m5 ; 34
+    punpcklbw            m5, m6 ; 45
+    punpcklbw            m6, m0 ; 56
 .v_w8_loop:
-    movq                m12, [srcq+ssq*1]   ; 8
+    movq                m13, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    movq                m13, [srcq+ssq*0]   ; 9
     pmaddubsw           m14, m1, subpel0 ; a0
-    pmaddubsw           m15, m2, subpel0 ; b0
     mova                 m1, m3
+    pmaddubsw           m15, m2, subpel0 ; b0
     mova                 m2, m4
     pmaddubsw            m3, subpel1 ; a1
+    mova                m12, m0
     pmaddubsw            m4, subpel1 ; b1
+    movq                 m0, [srcq+ssq*0]
     paddw               m14, m3
     paddw               m15, m4
     mova                 m3, m5
-    mova                 m4, m6
     pmaddubsw            m5, subpel2 ; a2
+    mova                 m4, m6
     pmaddubsw            m6, subpel2 ; b2
+    punpcklbw           m12, m13     ; 67
+    punpcklbw           m13, m0      ; 78
     paddw               m14, m5
+    mova                 m5, m12
+    pmaddubsw           m12, subpel3 ; a3
     paddw               m15, m6
-    shufpd               m6, m0, m12, 0x0d
-    shufpd               m0, m12, m13, 0x0c
-    punpcklbw            m5, m6, m0  ; 67
-    punpckhbw            m6, m0      ; 78
-    pmaddubsw           m12, m5, subpel3 ; a3
-    pmaddubsw           m13, m6, subpel3 ; b3
+    mova                 m6, m13
+    pmaddubsw           m13, subpel3 ; b3
     paddw               m14, m12
     paddw               m15, m13
     pmulhrsw            m14, m7
@@ -2021,12 +1931,12 @@
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .v_w8_loop
-    movzx                hd, r6b ; reset vertical loop
     add                  r4, 8
     add                  r7, 8
-    mov                dstq, r4
-    mov                srcq, r7
-    sub                 r6d, 1<<8 ; horizontal--
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
+    sub                 r6d, 1<<8
     jg .v_w8_loop0
     RET
 %endif ;ARCH_X86_64
@@ -2051,7 +1961,7 @@
     cmp                  hd, 6
     cmovs               ssd, mxd
     movq                 m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
-    W32_RESTORE_SSQ
+    mov                 ssq, ssmp
     lea                  r6, [ssq*3]
     sub                srcq, r6
  %define           base_reg  r6
@@ -2064,7 +1974,6 @@
  %define           subpelv1  [rsp+mmsize*1]
  %define           subpelv2  [rsp+mmsize*2]
  %define           subpelv3  [rsp+mmsize*3]
-    punpcklqdq           m0, m0
     punpcklbw            m0, m0
     psraw                m0, 8 ; sign-extend
     pshufd               m6, m0, q0000
@@ -2088,7 +1997,6 @@
  %define           subpelv1  m11
  %define           subpelv2  m12
  %define           subpelv3  m13
-    punpcklqdq           m0, m0
     punpcklbw            m0, m0
     psraw                m0, 8 ; sign-extend
     mova                 m8, [base+pw_8192]
@@ -2103,22 +2011,21 @@
     je .hv_w4
 .hv_w2:
     mova                 m6, [base+subpel_h_shuf4]
-    ;
     movq                 m2, [srcq+ssq*0]     ; 0
     movhps               m2, [srcq+ssq*1]     ; 0 _ 1
-    movq                 m0, [srcq+ssq*2]     ; 2
 %if ARCH_X86_32
  %define           w8192reg  [base+pw_8192]
  %define            d512reg  [base+pd_512]
     lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
-    movhps               m0, [srcq+ssq*0]     ; 2 _ 3
-    lea                srcq, [srcq+ssq*1]
+    movq                 m0, [srcq+ssq*0]     ; 2
+    movhps               m0, [srcq+ssq*1]     ; 2 _ 3
+    lea                srcq, [srcq+ssq*2]
 %else
  %define           w8192reg  m8
  %define            d512reg  m9
-    movhps               m0, [srcq+ss3q ]     ; 2 _ 3
-    lea                srcq, [srcq+ssq*4]
+    movq                 m0, [srcq+ssq*2]     ; 2
+    add                srcq, ss3q
+    movhps               m0, [srcq+ssq*0]     ; 2 _ 3
 %endif
     pshufb               m2, m6 ; 0 ~ 1 ~
     pshufb               m0, m6 ; 2 ~ 3 ~
@@ -2126,16 +2033,16 @@
     pmaddubsw            m0, m7 ; subpel_filters
     phaddw               m2, m0 ; 0 1 2 3
     pmulhrsw             m2, w8192reg
-    ;
+%if ARCH_X86_32
     movq                 m3, [srcq+ssq*0]     ; 4
     movhps               m3, [srcq+ssq*1]     ; 4 _ 5
-    movq                 m0, [srcq+ssq*2]     ; 6
-%if ARCH_X86_32
     lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
 %else
+    movq                 m3, [srcq+ssq*1]     ; 4
+    movhps               m3, [srcq+ssq*2]     ; 4 _ 5
     add                srcq, ss3q
 %endif
+    movq                 m0, [srcq+ssq*0]     ; 6
     pshufb               m3, m6 ; 4 ~ 5 ~
     pshufb               m0, m6 ; 6 ~
     pmaddubsw            m3, m7 ; subpel_filters
@@ -2142,7 +2049,6 @@
     pmaddubsw            m0, m7 ; subpel_filters
     phaddw               m3, m0 ; 4 5 6 _
     pmulhrsw             m3, w8192reg
-    ;
     palignr              m4, m3, m2, 4; V        1 2 3 4
     punpcklwd            m1, m2, m4   ; V 01 12    0 1 1 2
     punpckhwd            m2, m4       ; V 23 34    2 3 3 4
@@ -2149,6 +2055,11 @@
     pshufd               m0, m3, q2121; V          5 6 5 6
     punpcklwd            m3, m0       ; V 45 56    4 5 5 6
 .hv_w2_loop:
+    movq                 m4, [srcq+ssq*1] ; V 7
+    lea                srcq, [srcq+ssq*2] ; V
+    movhps               m4, [srcq+ssq*0] ; V 7 8
+    pshufb               m4, m6
+    pmaddubsw            m4, m7
     pmaddwd              m5, m1, subpelv0; V a0 b0
     mova                 m1, m2       ; V
     pmaddwd              m2, subpelv1 ; V a1 b1
@@ -2155,14 +2066,9 @@
     paddd                m5, m2       ; V
     mova                 m2, m3       ; V
     pmaddwd              m3, subpelv2 ; a2 b2
-    paddd                m5, m3       ; V
-    movq                 m4, [srcq+ssq*0] ; V 7
-    movhps               m4, [srcq+ssq*1] ; V 7 8
-    lea                srcq, [srcq+ssq*2] ; V
-    pshufb               m4, m6
-    pmaddubsw            m4, m7
     phaddw               m4, m4
     pmulhrsw             m4, w8192reg
+    paddd                m5, m3       ; V
     palignr              m3, m4, m0, 12
     mova                 m0, m4
     punpcklwd            m3, m0           ; V 67 78
@@ -2182,7 +2088,6 @@
     RET
 %undef w8192reg
 %undef d512reg
-    ;
 .hv_w4:
 %define hv4_line_0_0 4
 %define hv4_line_0_1 5
@@ -2194,7 +2099,6 @@
 %define hv4_line_1_1 11
 %define hv4_line_1_2 12
 %define hv4_line_1_3 13
-    ;
 %macro SAVELINE_W4 3
     mova     [rsp+mmsize*hv4_line_%3_%2], %1
 %endmacro
@@ -2201,7 +2105,6 @@
 %macro RESTORELINE_W4 3
     mova     %1, [rsp+mmsize*hv4_line_%3_%2]
 %endmacro
-    ;
 %if ARCH_X86_32
  %define           w8192reg  [base+pw_8192]
  %define            d512reg  [base+pd_512]
@@ -2213,13 +2116,13 @@
     mova                 m6, [base+subpel_h_shuf4]
     movq                 m5, [srcq+ssq*0]   ; 0 _ _ _
     movhps               m5, [srcq+ssq*1]   ; 0 _ 1 _
-    movq                 m4, [srcq+ssq*2]   ; 2 _ _ _
 %if ARCH_X86_32
     lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
-    movhps               m4, [srcq+ssq*0]   ; 2 _ 3 _
-    add                srcq, ssq
+    movq                 m4, [srcq+ssq*0]   ; 2 _ _ _
+    movhps               m4, [srcq+ssq*1]   ; 2 _ 3 _
+    lea                srcq, [srcq+ssq*2]
 %else
+    movq                 m4, [srcq+ssq*2]   ; 2 _ _ _
     movhps               m4, [srcq+ss3q ]   ; 2 _ 3 _
     lea                srcq, [srcq+ssq*4]
 %endif
@@ -2243,7 +2146,14 @@
     mova                 m6, [base+subpel_h_shuf4]
     movq                 m5, [srcq+ssq*0]   ; 4 _ _ _
     movhps               m5, [srcq+ssq*1]   ; 4 _ 5 _
+%if ARCH_X86_32
+    lea                srcq, [srcq+ssq*2]
+    movq                 m4, [srcq+ssq*0]   ; 6 _ _ _
+    add                srcq, ssq
+%else
     movq                 m4, [srcq+ssq*2]   ; 6 _ _ _
+    add                srcq, ss3q
+%endif
     pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
     pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
     pmaddubsw            m3, m7 ;H subpel_filters
@@ -2259,13 +2169,6 @@
     pmaddubsw            m0, m7 ;H subpel_filters
     phaddw               m3, m0 ;H 4 5 6 7
     pmulhrsw             m3, w8192reg ;H pw_8192
-    ;
-%if ARCH_X86_32
-    lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
-%else
-    add                srcq, ss3q
-%endif
     ;process high
     palignr              m4, m3, m2, 4;V 1 2 3 4
     punpcklwd            m1, m2, m4  ; V 01 12
@@ -2293,7 +2196,6 @@
     mova                 m2, m3
     pmaddwd              m3, subpelv2; V a2 b2
     paddd                m5, m3
-    ;
     mova                 m6, [base+subpel_h_shuf4]
     movq                 m4, [srcq+ssq*0] ; 7
     movhps               m4, [srcq+ssq*1] ; 7 _ 8 _
@@ -2325,10 +2227,10 @@
     mova                 m2, m3
     pmaddwd              m3, subpelv2; V a2 b2
     paddd                m5, m3
-    ;
     mova                 m6, [base+subpel_h_shuf4+16]
     movq                 m4, [srcq+ssq*0] ; 7
     movhps               m4, [srcq+ssq*1] ; 7 _ 8 _
+    lea                srcq, [srcq+ssq*2]
     pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
     pmaddubsw            m4, m7 ;H subpel_filters
     phaddw               m4, m4 ;H                7 8 7 8
@@ -2340,12 +2242,10 @@
     paddd                m5, d512reg ; pd_512
     paddd                m5, m4
     psrad                m4, m5, 10
-    ;
     RESTORELINE_W4       m5, 5, 0
     packssdw             m5, m4 ; d -> w
     packuswb             m5, m5 ; w -> b
     pshuflw              m5, m5, q3120
-    lea                srcq, [srcq+ssq*2]
     movd       [dstq+dsq*0], m5
     psrlq                m5, 32
     movd       [dstq+dsq*1], m5
@@ -2365,7 +2265,6 @@
 %undef subpelv1
 %undef subpelv2
 %undef subpelv3
-    ;
 .hv_w8:
     %assign stack_offset org_stack_offset
 %define hv8_line_1 0
@@ -2400,7 +2299,7 @@
     mov                 ssq, ssmp
     ALLOC_STACK  -mmsize*13
 %if STACK_ALIGNMENT < 16
- %define               srcm  [rsp+mmsize*13+gprsize*1]
+ %define               dstm  [rsp+mmsize*13+gprsize*1]
  %define                dsm  [rsp+mmsize*13+gprsize*2]
     mov                  r6, [rstk+stack_offset+gprsize*2]
     mov                 dsm, r6
@@ -2420,10 +2319,10 @@
     mova           subpelv2, m4
     mova           subpelv3, m5
     lea                  r6, [ssq*3]
+    mov                dstm, dstq
     sub                srcq, r6
-    mov                srcm, srcq
 %else
-    ALLOC_STACK    mmsize*5, 16
+    ALLOC_STACK        16*5, 16
  %define           subpelh0  m10
  %define           subpelh1  m11
  %define           subpelv0  m12
@@ -2440,7 +2339,6 @@
     movq                 m1, [base_reg+myq*8+subpel_filters-put_ssse3]
     pshufd         subpelh0, m0, q0000
     pshufd         subpelh1, m0, q1111
-    punpcklqdq           m1, m1
     punpcklbw            m1, m1
     psraw                m1, 8 ; sign-extend
     pshufd         subpelv0, m1, q0000
@@ -2448,18 +2346,18 @@
     pshufd         subpelv2, m1, q2222
     pshufd         subpelv3, m1, q3333
     lea                ss3q, [ssq*3]
+    mov                  r7, dstq
     sub                srcq, ss3q
-    mov                  r7, srcq
 %endif
-    lea                 r6d, [wq-4]
-    mov                  r4, dstq
-    shl                 r6d, (16 - 2)
-    mov                 r6w, hw
+    shl                  wd, 14
+    lea                 r6d, [hq+wq-(1<<16)]
+    mov                  r4, srcq
 .hv_w8_loop0:
     movu                 m4, [srcq+ssq*0] ; 0 = _ _
     movu                 m5, [srcq+ssq*1] ; 1 = _ _
+%if ARCH_X86_32
     lea                srcq, [srcq+ssq*2]
-    ;
+%endif
 %macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
  %if ARCH_X86_32
     pshufb               %3, %1, [base+subpel_h_shufB]
@@ -2478,7 +2376,6 @@
     paddw                %1, %3      ; A0+C4
     phaddw               %1, %2
 %endmacro
-    ;
 %if ARCH_X86_64
     mova                 m7, [base+subpel_h_shufA]
     mova                 m8, [base+subpel_h_shufB]
@@ -2486,12 +2383,17 @@
 %endif
     HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
     HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
+%if ARCH_X86_32
     movu                 m6, [srcq+ssq*0] ; 2 = _ _
     movu                 m0, [srcq+ssq*1] ; 3 = _ _
     lea                srcq, [srcq+ssq*2]
+%else
+    movu                 m6, [srcq+ssq*2] ; 2 = _ _
+    add                srcq, ss3q
+    movu                 m0, [srcq+ssq*0] ; 3 = _ _
+%endif
     HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
     HV_H_W8              m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
-    ;
     mova                 m7, [base+pw_8192]
     pmulhrsw             m4, m7 ; H pw_8192
     pmulhrsw             m5, m7 ; H pw_8192
@@ -2503,11 +2405,16 @@
     SAVELINE_W8           1, m1
     SAVELINE_W8           2, m2
     SAVELINE_W8           3, m3
-    ;
     mova                 m7, [base+subpel_h_shufA]
+%if ARCH_X86_32
     movu                 m4, [srcq+ssq*0]       ; 4 = _ _
     movu                 m5, [srcq+ssq*1]       ; 5 = _ _
     lea                srcq, [srcq+ssq*2]
+%else
+    movu                 m4, [srcq+ssq*1]       ; 4 = _ _
+    movu                 m5, [srcq+ssq*2]       ; 5 = _ _
+    add                srcq, ss3q
+%endif
     movu                 m6, [srcq+ssq*0]       ; 6 = _ _
     HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
     HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
@@ -2519,7 +2426,6 @@
     punpcklwd            m4, m0, m1  ; 3 4 ~
     punpcklwd            m5, m1, m2  ; 4 5 ~
     punpcklwd            m6, m2, m3  ; 5 6 ~
-    ;
     SAVELINE_W8           6, m3
     RESTORELINE_W8        1, m1
     RESTORELINE_W8        2, m2
@@ -2603,16 +2509,19 @@
     RESTORELINE_W8        4, m4
     jmp .hv_w8_loop
 .hv_w8_outer:
-    movzx                hd, r6w
-    add                  r4, 4
-    mov                dstq, r4
 %if ARCH_X86_32
-    mov                srcq, srcm
-    add                srcq, 4
-    mov                srcm, srcq
+    mov                dstq, dstm
+    add                  r4, 4
+    movzx                hd, r6w
+    add                dstq, 4
+    mov                srcq, r4
+    mov                dstm, dstq
 %else
+    add                  r4, 4
     add                  r7, 4
-    mov                srcq, r7
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
 %endif
     sub                 r6d, 1<<16
     jg .hv_w8_loop0
@@ -2836,7 +2745,7 @@
     add                 mxd, t0d ; 8tap_h, mx, 4tap_h
     imul                myd, mym, 0x010101
     add                 myd, t1d ; 8tap_v, my, 4tap_v
-    movsxd               wq, wm
+    mov                  wd, wm
     movifnidn          srcd, srcm
     movifnidn            hd, hm
     test                mxd, 0xf00
@@ -2846,6 +2755,7 @@
     LEA            base_reg, prep_ssse3
     tzcnt                wd, wd
     movzx                wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
+    pxor                 m4, m4
     add                  wq, base_reg
     movifnidn       strided, stridem
     lea                  r6, [strideq*3]
@@ -2885,16 +2795,13 @@
     shr                 mxd, 16
     sub                srcq, 3
     movzx                wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
-    movd                 m5, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+0]
-    pshufd               m5, m5, q0000
-    movd                 m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+4]
-    pshufd               m6, m6, q0000
+    movq                 m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
 %if cpuflag(ssse3)
     mova                 m7, [base+pw_8192]
+    pshufd               m5, m6, q0000
+    pshufd               m6, m6, q1111
 %else
-    punpcklbw            m5, m5
     punpcklbw            m6, m6
-    psraw                m5, 8
     psraw                m6, 8
  %if ARCH_X86_64
     mova                 m7, [pw_2]
@@ -2902,6 +2809,8 @@
  %else
   %define m15 m4
  %endif
+    pshufd               m5, m6, q1010
+    punpckhqdq           m6, m6
 %endif
     add                  wq, base_reg
     jmp                  wq
@@ -2913,10 +2822,10 @@
 %endif
     dec                srcq
     movd                 m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
-    pshufd               m4, m4, q0000
 %if cpuflag(ssse3)
     mova                 m6, [base+pw_8192]
     mova                 m5, [base+subpel_h_shufA]
+    pshufd               m4, m4, q0000
 %else
     mova                 m6, [base+pw_2]
  %if ARCH_X86_64
@@ -2926,6 +2835,7 @@
  %endif
     punpcklbw            m4, m4
     psraw                m4, 8
+    punpcklqdq           m4, m4
 %endif
 %if ARCH_X86_64
     lea            stride3q, [strideq*3]
@@ -3089,11 +2999,14 @@
     shr                 myd, 16
     cmp                  hd, 6
     cmovs               myd, mxd
-    lea                 myq, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+    movq                 m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
 %if cpuflag(ssse3)
     mova                 m2, [base+pw_512]
-    psrlw                m2, m2, 1 ; 0x0100
     mova                 m7, [base+pw_8192]
+    punpcklwd            m0, m0
+%else
+    punpcklbw            m0, m0
+    psraw                m0, 8
 %endif
 %if ARCH_X86_32
  %define            subpel0  [rsp+mmsize*0]
@@ -3107,20 +3020,16 @@
     ALLOC_STACK   -mmsize*5
  %endif
 %assign regs_used 7
-    movd                 m0, [myq+0]
-    PSHUFB_0X1X          m0, m2
-    mova            subpel0, m0
-    movd                 m0, [myq+2]
-    PSHUFB_0X1X          m0, m2
-    mova            subpel1, m0
-    movd                 m0, [myq+4]
-    PSHUFB_0X1X          m0, m2
-    mova            subpel2, m0
-    movd                 m0, [myq+6]
-    PSHUFB_0X1X          m0, m2
-    mova            subpel3, m0
     mov             strideq, [rstk+stack_offset+gprsize*3]
+    pshufd               m1, m0, q0000
+    mova            subpel0, m1
+    pshufd               m1, m0, q1111
+    mova            subpel1, m1
     lea                  r5, [strideq*3]
+    pshufd               m1, m0, q2222
+    mova            subpel2, m1
+    pshufd               m1, m0, q3333
+    mova            subpel3, m1
     sub                srcq, r5
 %else
  %define            subpel0  m8
@@ -3127,15 +3036,11 @@
  %define            subpel1  m9
  %define            subpel2  m10
  %define            subpel3  m11
-    movd            subpel0, [myq+0]
-    PSHUFB_0X1X     subpel0, m2
-    movd            subpel1, [myq+2]
-    PSHUFB_0X1X     subpel1, m2
-    movd            subpel2, [myq+4]
-    PSHUFB_0X1X     subpel2, m2
-    movd            subpel3, [myq+6]
-    PSHUFB_0X1X     subpel3, m2
+    pshufd               m8, m0, q0000
+    pshufd               m9, m0, q1111
     lea            stride3q, [strideq*3]
+    pshufd              m10, m0, q2222
+    pshufd              m11, m0, q3333
     sub                srcq, stride3q
     cmp                  wd, 8
     jns .v_w8
@@ -3159,35 +3064,34 @@
     mov                 r5w, hw
 .v_w4_loop0:
 %endif
-    movd                 m2, [srcq+strideq*0] ; 0
-    movhps               m2, [srcq+strideq*2] ; 0 _ 2
-    movd                 m3, [srcq+strideq*1] ; 1
+    movd                 m1, [srcq+strideq*0]
+    movd                 m0, [srcq+strideq*1]
 %if ARCH_X86_32
     lea                srcq, [srcq+strideq*2]
-    movhps               m3, [srcq+strideq*1] ; 1 _ 3
+    movd                 m2, [srcq+strideq*0]
+    movd                 m4, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
-%else
-    movhps               m3, [srcq+stride3q ] ; 1 _ 3
-    lea                srcq, [srcq+strideq*4]
-%endif
-    pshufd               m2, m2, q2020    ; 0 2 0 2
-    pshufd               m3, m3, q2020    ; 1 3 1 3
-    punpckldq            m2, m3           ; 0 1 2 3
-    movd                 m3, [srcq+strideq*0] ; 4
-    movd                 m1, [srcq+strideq*1] ; 5
-    movd                 m0, [srcq+strideq*2] ; 6
-%if ARCH_X86_32
+    movd                 m3, [srcq+strideq*0]
+    movd                 m5, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
-    add                srcq, strideq
 %else
+    movd                 m2, [srcq+strideq*2]
     add                srcq, stride3q
+    movd                 m4, [srcq+strideq*0]
+    movd                 m3, [srcq+strideq*1]
+    movd                 m5, [srcq+strideq*2]
+    add                srcq, stride3q
 %endif
-    punpckldq            m3, m1           ; 4 5 _ _
-    punpckldq            m1, m0           ; 5 6 _ _
-    PALIGNR              m4, m3, m2, 4    ; 1 2 3 4
-    punpcklbw            m3, m1           ; 45 56
-    punpcklbw            m1, m2, m4       ; 01 12
-    punpckhbw            m2, m4           ; 23 34
+    punpckldq            m1, m0 ; 0 1
+    punpckldq            m0, m2 ; 1 2
+    punpcklbw            m1, m0 ; 01 12
+    movd                 m0, [srcq+strideq*0]
+    punpckldq            m2, m4 ; 2 3
+    punpckldq            m4, m3 ; 3 4
+    punpckldq            m3, m5 ; 4 5
+    punpckldq            m5, m0 ; 5 6
+    punpcklbw            m2, m4 ; 23 34
+    punpcklbw            m3, m5 ; 45 56
 .v_w4_loop:
 %if ARCH_X86_32 && notcpuflag(ssse3)
     mova                 m7, subpel0
@@ -3208,11 +3112,11 @@
 %endif
     mova                 m2, m3
     PMADDUBSW            m3, subpel2, m6, m4, 0  ; a2 b2
+    movd                 m4, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
     paddw                m5, m3
-    movd                 m4, [srcq+strideq*0]
     punpckldq            m3, m0, m4       ; 6 7 _ _
-    movd                 m0, [srcq+strideq*1]
-    lea                srcq, [srcq+strideq*2]
+    movd                 m0, [srcq+strideq*0]
     punpckldq            m4, m0           ; 7 8 _ _
     punpcklbw            m3, m4           ; 67 78
 %if notcpuflag(ssse3)
@@ -3242,13 +3146,13 @@
     sub                  hd, 2
     jg .v_w4_loop
 %if ARCH_X86_32
-    mov                  hw, r5w ; reset vertical loop
-    mov                tmpq, tmpm
     mov                srcq, srcm
-    add                tmpq, 8
+    mov                tmpq, tmpm
+    movzx                hd, r5w
     add                srcq, 4
-    mov                tmpm, tmpq
+    add                tmpq, 8
     mov                srcm, srcq
+    mov                tmpm, tmpq
     sub                 r5d, 1<<16 ; horizontal--
     jg .v_w4_loop0
 %endif
@@ -3255,37 +3159,30 @@
     RET
 %if ARCH_X86_64
 .v_w8:
-    lea                 r5d, [wq - 8] ; horizontal loop
+    lea                 r6d, [wq*8-64]
+    mov                  r5, srcq
     mov                  r8, tmpq
-    mov                  r6, srcq
-    shl                 r5d, 8 - 3; (wq / 8) << 8
-    mov                 r5b, hb
+    lea                 r6d, [hq+r6*4]
 .v_w8_loop0:
-    movq                 m4, [srcq+strideq*0]
-    movq                 m5, [srcq+strideq*1]
-    lea                srcq, [srcq+strideq*2]
-    movq                 m6, [srcq+strideq*0]
-    movq                 m0, [srcq+strideq*1]
-    lea                srcq, [srcq+strideq*2]
     movq                 m1, [srcq+strideq*0]
     movq                 m2, [srcq+strideq*1]
-    lea                srcq, [srcq+strideq*2]
-    movq                 m3, [srcq+strideq*0]
-    shufpd               m4, m0, 0x0c
-    shufpd               m5, m1, 0x0c
-    punpcklbw            m1, m4, m5 ; 01
-    punpckhbw            m4, m5     ; 34
-    shufpd               m6, m2, 0x0c
-    punpcklbw            m2, m5, m6 ; 12
-    punpckhbw            m5, m6     ; 45
-    shufpd               m0, m3, 0x0c
-    punpcklbw            m3, m6, m0 ; 23
-    punpckhbw            m6, m0     ; 56
+    movq                 m3, [srcq+strideq*2]
+    add                srcq, stride3q
+    movq                 m4, [srcq+strideq*0]
+    movq                 m5, [srcq+strideq*1]
+    movq                 m6, [srcq+strideq*2]
+    add                srcq, stride3q
+    movq                 m0, [srcq+strideq*0]
+    punpcklbw            m1, m2 ; 01
+    punpcklbw            m2, m3 ; 12
+    punpcklbw            m3, m4 ; 23
+    punpcklbw            m4, m5 ; 34
+    punpcklbw            m5, m6 ; 45
+    punpcklbw            m6, m0 ; 56
 .v_w8_loop:
-%if cpuflag(ssse3)
-    movq                m12, [srcq+strideq*1]
+    movq                m13, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
-    movq                m13, [srcq+strideq*0]
+%if cpuflag(ssse3)
     pmaddubsw           m14, m1, subpel0 ; a0
     pmaddubsw           m15, m2, subpel0 ; b0
     mova                 m1, m3
@@ -3298,64 +3195,59 @@
     mova                 m4, m6
     pmaddubsw            m5, subpel2 ; a2
     pmaddubsw            m6, subpel2 ; b2
+    punpcklbw           m12, m0, m13 ; 67
+    movq                 m0, [srcq+strideq*0]
+    punpcklbw           m13, m0      ; 78
     paddw               m14, m5
+    mova                 m5, m12
+    pmaddubsw           m12, subpel3 ; a3
     paddw               m15, m6
-    shufpd               m6, m0, m12, 0x0d
-    shufpd               m0, m12, m13, 0x0c
-    punpcklbw            m5, m6, m0  ; 67
-    punpckhbw            m6, m0      ; 78
-    pmaddubsw           m12, m5, subpel3 ; a3
-    pmaddubsw           m13, m6, subpel3 ; b3
+    mova                 m6, m13
+    pmaddubsw           m13, subpel3 ; b3
     paddw               m14, m12
     paddw               m15, m13
     pmulhrsw            m14, m7
     pmulhrsw            m15, m7
-    movu        [tmpq+wq*0], m14
-    movu        [tmpq+wq*2], m15
 %else
     mova                m14, m1
     PMADDUBSW           m14, subpel0, m7, m12, 1 ; a0
+    mova                m15, m2
+    PMADDUBSW           m15, subpel0, m7, m12, 0 ; b0
     mova                 m1, m3
     PMADDUBSW            m3, subpel1, m7, m12, 0 ; a1
+    mova                 m2, m4
+    PMADDUBSW            m4, subpel1, m7, m12, 0 ; b1
     paddw               m14, m3
     mova                 m3, m5
     PMADDUBSW            m5, subpel2, m7, m12, 0 ; a2
-    paddw               m14, m5
-    movq                m12, [srcq+strideq*1]
-    lea                srcq, [srcq+strideq*2]
-    movq                m13, [srcq+strideq*0]
-    shufpd              m15, m0, m12, 0x0d
-    shufpd               m0, m12, m13, 0x0c
-    punpcklbw            m5, m15, m0  ; 67
-    punpckhbw           m15, m0       ; 78
-    mova                m13, m5
-    PMADDUBSW           m13, subpel3, m7, m12, 0 ; a3
-    paddw               m14, m13
-    PMULHRSW_8192       m14, m14, [base+pw_2]
-    movu        [tmpq+wq*0], m14
-    mova                m14, m2
-    PMADDUBSW           m14, subpel0, m7, m12, 0 ; b0
-    mova                 m2, m4
-    PMADDUBSW            m4, subpel1, m7, m12, 0 ; b1
-    paddw               m14, m4
+    paddw               m15, m4
     mova                 m4, m6
     PMADDUBSW            m6, subpel2, m7, m12, 0 ; b2
-    paddw               m14, m6
-    mova                 m6, m15
-    PMADDUBSW           m15, subpel3, m7, m12, 0 ; b3
-    paddw               m14, m15
+    paddw               m15, m6
+    punpcklbw           m12, m0, m13 ; 67
+    movq                 m0, [srcq+strideq*0]
+    punpcklbw           m13, m0      ; 78
+    paddw               m14, m5
+    mova                 m5, m12
+    PMADDUBSW           m12, subpel3, m7, m6, 0  ; a3
+    paddw               m14, m12
+    mova                 m6, m13
+    PMADDUBSW           m13, subpel3, m7, m12, 0 ; b3
+    paddw               m15, m13
     PMULHRSW_8192       m14, m14, [base+pw_2]
-    movu        [tmpq+wq*2], m14
+    PMULHRSW_8192       m15, m15, [base+pw_2]
 %endif
+    movu        [tmpq+wq*0], m14
+    movu        [tmpq+wq*2], m15
     lea                tmpq, [tmpq+wq*4]
     sub                  hd, 2
     jg .v_w8_loop
-    movzx                hd, r5b ; reset vertical loop
+    add                  r5, 8
     add                  r8, 16
-    add                  r6, 8
+    movzx                hd, r6b
+    mov                srcq, r5
     mov                tmpq, r8
-    mov                srcq, r6
-    sub                 r5d, 1<<8 ; horizontal--
+    sub                 r6d, 1<<8
     jg .v_w8_loop0
     RET
 %endif ;ARCH_X86_64
@@ -3363,7 +3255,6 @@
 %undef subpel1
 %undef subpel2
 %undef subpel3
-    ;
 .hv:
     %assign stack_offset org_stack_offset
     cmp                  wd, 4
@@ -3466,13 +3357,13 @@
 %endif
     movq                 m5, [srcq+strideq*0]   ; 0 _ _ _
     movhps               m5, [srcq+strideq*1]   ; 0 _ 1 _
-    movq                 m4, [srcq+strideq*2]   ; 2 _ _ _
 %if ARCH_X86_32
     lea                srcq, [srcq+strideq*2]
-    add                srcq, strideq
-    movhps               m4, [srcq+strideq*0]   ; 2 _ 3 _
-    add                srcq, strideq
+    movq                 m4, [srcq+strideq*0]   ; 2 _ _ _
+    movhps               m4, [srcq+strideq*1]   ; 2 _ 3 _
+    lea                srcq, [srcq+strideq*2]
 %else
+    movq                 m4, [srcq+strideq*2]   ; 2 _ _ _
     movhps               m4, [srcq+stride3q ]   ; 2 _ 3 _
     lea                srcq, [srcq+strideq*4]
 %endif
@@ -3506,7 +3397,14 @@
 %endif
     movq                 m5, [srcq+strideq*0]   ; 4 _ _ _
     movhps               m5, [srcq+strideq*1]   ; 4 _ 5 _
+%if ARCH_X86_32
+    lea                srcq, [srcq+strideq*2]
+    movq                 m4, [srcq+strideq*0]   ; 6 _ _ _
+    add                srcq, strideq
+%else
     movq                 m4, [srcq+strideq*2]   ; 6 _ _ _
+    add                srcq, stride3q
+%endif
     PSHUFB_SUBPEL_H_4a   m3, m5, m6, m1, m2, 0    ;H subpel_h_shuf4 4~5~
     PSHUFB_SUBPEL_H_4a   m0, m4, m6, m1, m2, 0    ;H subpel_h_shuf4 6~6~
     PMADDUBSW            m3, m7, m1, m2, 1        ;H subpel_filters
@@ -3531,12 +3429,6 @@
     mova                 m2, [esp+mmsize*4]
  %endif
 %endif
-%if ARCH_X86_32
-    lea                srcq, [srcq+strideq*2]
-    add                srcq, strideq
-%else
-    add                srcq, stride3q
-%endif
     ;process high
     PALIGNR              m4, m3, m2, 4;V 1 2 3 4
     punpcklwd            m1, m2, m4  ; V 01 12
@@ -3572,7 +3464,6 @@
   %define m15 m3
  %endif
 %endif
-    ;
 %if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4]
 %endif
@@ -3620,7 +3511,6 @@
     mova         [esp+0xA0], m5
  %endif
 %endif
-    ;
 %if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4+16]
 %endif
@@ -3644,7 +3534,6 @@
     paddd                m5, d32reg ; pd_32
     paddd                m5, m4
     psrad                m4, m5, 6
-    ;
     RESTORELINE_W4       m5, 5, 0
     packssdw             m5, m4
     pshufd               m5, m5, q3120
@@ -3666,7 +3555,6 @@
 %undef subpelv1
 %undef subpelv2
 %undef subpelv3
-    ;
 .hv_w8:
     %assign stack_offset org_stack_offset
 %define hv8_line_1 0
@@ -3699,20 +3587,20 @@
   %define              tmpm  [rsp+mmsize*13+gprsize*1]
   %define              srcm  [rsp+mmsize*13+gprsize*2]
   %define           stridem  [rsp+mmsize*13+gprsize*3]
+    mov                tmpm, tmpq
     mov             stridem, strideq
  %endif
+ %if cpuflag(ssse3)
     pshufd               m0, m1, q0000
     pshufd               m1, m1, q1111
-    punpcklbw            m5, m5
- %if notcpuflag(ssse3)
-    punpcklbw            m0, m0
+ %else
     punpcklbw            m1, m1
- %endif
-    psraw                m5, 8
- %if notcpuflag(ssse3)
-    psraw                m0, 8
     psraw                m1, 8
+    pshufd               m0, m1, q1010
+    punpckhqdq           m1, m1
  %endif
+    punpcklbw            m5, m5
+    psraw                m5, 8
     pshufd               m2, m5, q0000
     pshufd               m3, m5, q1111
     pshufd               m4, m5, q2222
@@ -3742,38 +3630,31 @@
     cmp                  hd, 6
     cmovs               myd, mxd
     movq                 m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ %if cpuflag(ssse3)
     pshufd         subpelh0, m0, q0000
     pshufd         subpelh1, m0, q1111
-    punpcklbw            m1, m1
- %if notcpuflag(ssse3)
-    punpcklbw      subpelh0, subpelh0
-    punpcklbw      subpelh1, subpelh1
+ %else
+    punpcklbw            m0, m0
+    psraw                m0, 8
+    pshufd         subpelh0, m0, q1010
+    pshufd         subpelh1, m0, q3232
+    mova                 m7, [base+pw_2]
  %endif
+    punpcklbw            m1, m1
     psraw                m1, 8
- %if notcpuflag(ssse3)
-    psraw          subpelh0, 8
-    psraw          subpelh1, 8
- %endif
     pshufd         subpelv0, m1, q0000
     pshufd         subpelv1, m1, q1111
     pshufd         subpelv2, m1, q2222
     pshufd         subpelv3, m1, q3333
- %if notcpuflag(ssse3)
-    mova                 m7, [base+pw_2]
- %endif
     lea            stride3q, [strideq*3]
     sub                srcq, 3
     sub                srcq, stride3q
     mov                  r6, srcq
-%endif
-    lea                 r5d, [wq-4]
-%if ARCH_X86_64
     mov                  r8, tmpq
-%else
-    mov                tmpm, tmpq
 %endif
-    shl                 r5d, (16 - 2)
-    mov                 r5w, hw
+    lea                 r5d, [wq-4]
+    shl                 r5d, 14
+    add                 r5d, hd
 .hv_w8_loop0:
 %if cpuflag(ssse3)
  %if ARCH_X86_64
@@ -3791,24 +3672,24 @@
 %endif
     PREP_8TAP_HV         m4, srcq+strideq*0, m7, m0
     PREP_8TAP_HV         m5, srcq+strideq*1, m7, m0
+%if ARCH_X86_64
+    PREP_8TAP_HV         m6, srcq+strideq*2, m7, m0
+    add                srcq, stride3q
+    PREP_8TAP_HV         m0, srcq+strideq*0, m7, m9
+%else
     lea                srcq, [srcq+strideq*2]
-%if notcpuflag(ssse3)
- %if ARCH_X86_64
-    SWAP                 m9, m4
- %else
+ %if notcpuflag(ssse3)
     mova              [esp], m4
  %endif
-%endif
     PREP_8TAP_HV         m6, srcq+strideq*0, m7, m4
     PREP_8TAP_HV         m0, srcq+strideq*1, m7, m4
     lea                srcq, [srcq+strideq*2]
+%endif
 %if cpuflag(ssse3)
     mova                 m7, [base+pw_8192]
 %else
     mova                 m7, [base+pw_2]
- %if ARCH_X86_64
-    SWAP                 m4, m9
- %else
+ %if ARCH_X86_32
     mova                 m4, [esp]
  %endif
 %endif
@@ -3824,28 +3705,26 @@
     SAVELINE_W8           3, m3
 %if cpuflag(ssse3)
     mova                 m7, [base+subpel_h_shufA]
+%endif
+%if ARCH_X86_64
+    PREP_8TAP_HV         m4, srcq+strideq*1, m8, m9
+    PREP_8TAP_HV         m5, srcq+strideq*2, m8, m9
+    add                srcq, stride3q
+    PREP_8TAP_HV         m6, srcq+strideq*0, m8, m9
 %else
- %if ARCH_X86_64
-    SWAP                 m8, m7
-    SWAP                 m9, m0
- %else
+ %if notcpuflag(ssse3)
     mova         [esp+0x30], m0
  %endif
-%endif
     PREP_8TAP_HV         m4, srcq+strideq*0, m7, m0
     PREP_8TAP_HV         m5, srcq+strideq*1, m7, m0
-    PREP_8TAP_HV         m6, srcq+strideq*2, m7, m0
     lea                srcq, [srcq+strideq*2]
+    PREP_8TAP_HV         m6, srcq+strideq*0, m7, m0
+%endif
 %if cpuflag(ssse3)
     mova                 m7, [base+pw_8192]
-%else
- %if ARCH_X86_64
-    SWAP                 m0, m9
-    SWAP                 m7, m8
- %else
+%elif ARCH_X86_32
     mova                 m0, [esp+0x30]
     mova                 m7, [base+pw_2]
- %endif
 %endif
     PMULHRSW_8192        m1, m4, m7
     PMULHRSW_8192        m2, m5, m7
@@ -3902,8 +3781,8 @@
  %endif
 %endif
     PREP_8TAP_HV         m0, srcq+strideq*1, m5, m6
-    PREP_8TAP_HV         m4, srcq+strideq*2, m5, m6
     lea                srcq, [srcq+strideq*2]
+    PREP_8TAP_HV         m4, srcq+strideq*0, m5, m6
 %if cpuflag(ssse3)
     mova                 m5, [base+pw_8192]
 %else
@@ -3933,19 +3812,20 @@
     RESTORELINE_W8        4, m4
     jmp .hv_w8_loop
 .hv_w8_outer:
-    movzx                hd, r5w
 %if ARCH_X86_32
     mov                srcq, srcm
     mov                tmpq, tmpm
+    movzx                hd, r5w
     add                srcq, 4
     add                tmpq, 8
     mov                srcm, srcq
     mov                tmpm, tmpq
 %else
-    add                  r8, 8
-    mov                tmpq, r8
     add                  r6, 4
+    add                  r8, 8
+    movzx                hd, r5b
     mov                srcq, r6
+    mov                tmpq, r8
 %endif
     sub                 r5d, 1<<16
     jg .hv_w8_loop0