ref: d88abfec51e91832a64f36f90c7c61ce67a79333
parent: 14072e733465b034644dd08cfaffb3bf7ac0a310
author: Henrik Gramner <gramner@twoorioles.com>
date: Tue Sep 25 12:36:45 EDT 2018
x86: MC AVX2
--- a/meson.build
+++ b/meson.build
@@ -189,20 +189,6 @@
'src/recon.c'
)
-# Build a helper library for each bitdepth
-bitdepth_objs = []
-foreach bitdepth : dav1d_bitdepths
- bitdepth_lib = static_library(
- 'dav1d_bitdepth_@0@'.format(bitdepth),
- libdav1d_tmpl_sources, config_h_target,
- include_directories: dav1d_inc_dirs,
- c_args: ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag,
- install: false,
- build_by_default: false,
- )
- bitdepth_objs += bitdepth_lib.extract_all_objects()
-endforeach
-
entrypoints_src = files(
'src/lib.c',
'src/thread_task.c'
@@ -241,8 +227,12 @@
libdav1d_sources += files(
'src/x86/cpu.c',
)
+ libdav1d_tmpl_sources += files(
+ 'src/x86/mc_init.c',
+ )
libdav1d_sources_asm = files(
'src/x86/cpuid.asm',
+ 'src/x86/mc.asm',
)
nasm = find_program('nasm')
@@ -280,6 +270,20 @@
if host_machine.system() == 'windows'
libdav1d_sources += files('src/win32/thread.c')
endif
+
+# Build a helper library for each bitdepth
+bitdepth_objs = []
+foreach bitdepth : dav1d_bitdepths
+ bitdepth_lib = static_library(
+ 'dav1d_bitdepth_@0@'.format(bitdepth),
+ libdav1d_tmpl_sources, config_h_target,
+ include_directories: dav1d_inc_dirs,
+ c_args: ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag,
+ install: false,
+ build_by_default: false,
+ )
+ bitdepth_objs += bitdepth_lib.extract_all_objects()
+endforeach
libdav1d = library('dav1d',
libdav1d_sources, rev_target, nasm_objs,
--- a/src/mc.c
+++ b/src/mc.c
@@ -530,4 +530,8 @@
c->w_mask[2] = w_mask_420_c;
c->warp8x8 = warp_affine_8x8_c;
c->warp8x8t = warp_affine_8x8t_c;
+
+#if HAVE_ASM && ARCH_X86
+ bitfn(dav1d_mc_dsp_init_x86)(c);
+#endif
}
--- a/src/mc.h
+++ b/src/mc.h
@@ -101,4 +101,7 @@
void dav1d_mc_dsp_init_8bpc(Dav1dMCDSPContext *c);
void dav1d_mc_dsp_init_10bpc(Dav1dMCDSPContext *c);
+void dav1d_mc_dsp_init_x86_8bpc(Dav1dMCDSPContext *c);
+void dav1d_mc_dsp_init_x86_10bpc(Dav1dMCDSPContext *c);
+
#endif /* __DAV1D_SRC_MC_H__ */
--- /dev/null
+++ b/src/x86/mc.asm
@@ -0,0 +1,3035 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64 && UNIX64 ; FIXME: Windows
+
+SECTION_RODATA 32
+
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
+bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
+
+pw_8: times 2 dw 8
+pw_26: times 2 dw 26
+pw_34: times 2 dw 34
+pw_258: times 2 dw 258
+pw_512: times 2 dw 512
+pw_1024: times 2 dw 1024
+pw_2048: times 2 dw 2048
+pw_8192: times 2 dw 8192
+pd_32: dd 32
+pd_512: dd 512
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
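+; (the filter index is 1-based, so the base pointer is biased back by one 8-byte entry)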
+
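+; Jump tables store per-width entry point offsets. The table symbols are
+; biased so that indexing with tzcnt(w) selects the first entry for the
+; smallest width handled by each function.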
+%macro BIDIR_JMP_TABLE 1-7 4, 8, 16, 32, 64, 128
+ %xdefine %1_table (%%table - 2*4)
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %%table:
+ %rep 6
+ dd %%prefix %+ .w%2 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg_avx2
+BIDIR_JMP_TABLE w_avg_avx2
+BIDIR_JMP_TABLE mask_avx2
+BIDIR_JMP_TABLE w_mask_420_avx2
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put)
+%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep)
+
+BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
+
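+; %4 is a bitmask selecting which tables to emit: 1 = h, 2 = v, 4 = hv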
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+
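+; distance from a function's base label to one of its jump tables, letting a
+; single register act as both the table base and the base for the jump offsets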
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+SECTION .text
+
+INIT_XMM avx2
+DECLARE_REG_TMP 4, 6, 7
+cglobal put_bilin, 4, 8, 8, dst, ds, src, ss, w, h, mxy
+ movifnidn mxyd, r6m ; mx
+ lea t2, [put_avx2]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [t2+wq*2+table_offset(put,)]
+ add wq, t2
+ lea t1, [ssq*3]
+ lea t2, [dsq*3]
+ jmp wq
+.put_w2:
+ movzx t0d, word [srcq+ssq*0]
+ movzx t1d, word [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], t0w
+ mov [dstq+dsq*1], t1w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov t0d, [srcq+ssq*0]
+ mov t1d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], t0d
+ mov [dstq+dsq*1], t1d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movq m0, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq [dstq+dsq*0], m0
+ movq [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+t1 ]
+ lea srcq, [srcq+ssq*4]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ mova [dstq+dsq*2], m2
+ mova [dstq+t2 ], m3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .put_w16
+ RET
+INIT_YMM avx2
+.put_w32:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+t1 ]
+ lea srcq, [srcq+ssq*4]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ mova [dstq+dsq*2], m2
+ mova [dstq+t2 ], m3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+ssq*0+32*0]
+ movu m1, [srcq+ssq*0+32*1]
+ movu m2, [srcq+ssq*1+32*0]
+ movu m3, [srcq+ssq*1+32*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+32*0], m0
+ mova [dstq+dsq*0+32*1], m1
+ mova [dstq+dsq*1+32*0], m2
+ mova [dstq+dsq*1+32*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+ movu m2, [srcq+32*2]
+ movu m3, [srcq+32*3]
+ add srcq, ssq
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+ ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
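+ ; mx*0xff01 + (16 << 8) == ((16-mx) << 8) | mx, i.e. the {mx, 16-mx} byte
+ ; pair broadcast below and used as pmaddubsw coefficients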
+ imul mxyd, 0xff01
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16 << 8
+ movd xm5, mxyd
+ mov mxyd, r7m ; my
+ vpbroadcastw m5, xm5
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [t2+wq*2+table_offset(put, _bilin_h)]
+ vpbroadcastd m6, [pw_2048]
+ add wq, t2
+ jmp wq
+.h_w2:
+ movd xm0, [srcq+ssq*0]
+ pinsrd xm0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+ pmulhrsw xm0, xm6
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ mova xm4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+ pmulhrsw xm0, xm6
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pshufb xm1, xm4
+ pmaddubsw xm0, xm5
+ pmaddubsw xm1, xm5
+ pmulhrsw xm0, xm6
+ pmulhrsw xm1, xm6
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+ssq*0+8*0]
+ vinserti128 m0, m0, [srcq+ssq*1+8*0], 1
+ movu xm1, [srcq+ssq*0+8*1]
+ vinserti128 m1, m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w32
+ RET
+.h_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ movu m2, [srcq+8*4]
+ movu m3, [srcq+8*5]
+ add srcq, ssq
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ pmulhrsw m2, m6
+ pmulhrsw m3, m6
+ packuswb m0, m1
+ packuswb m2, m3
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m2
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ mov t1, -32*3
+.h_w128_loop:
+ movu m0, [srcq+t1+32*3+8*0]
+ movu m1, [srcq+t1+32*3+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ packuswb m0, m1
+ mova [dstq+t1+32*3], m0
+ add t1, 32
+ jle .h_w128_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [t2+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 0xff01
+ vpbroadcastd m7, [pw_2048]
+ add mxyd, 16 << 8
+ add wq, t2
+ movd xm6, mxyd
+ vpbroadcastw m6, xm6
+ jmp wq
+.v_w2:
+ movd xm0, [srcq+ssq*0]
+.v_w2_loop:
+ pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1
+ lea srcq, [srcq+ssq*2]
+ pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1
+ pshuflw xm1, xm1, q2301 ; 1 0
+ punpcklbw xm1, xm0, xm1
+ pmaddubsw xm1, xm6
+ pmulhrsw xm1, xm7
+ packuswb xm1, xm1
+ pextrw [dstq+dsq*0], xm1, 1
+ pextrw [dstq+dsq*1], xm1, 0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xm0, [srcq+ssq*0]
+.v_w4_loop:
+ vpbroadcastd xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm2, xm1, xm0, 0x01 ; 0 1
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm1, xm1, xm0, 0x02 ; 1 2
+ punpcklbw xm1, xm2
+ pmaddubsw xm1, xm6
+ pmulhrsw xm1, xm7
+ packuswb xm1, xm1
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm0, [srcq+ssq*0]
+.v_w8_loop:
+ vpbroadcastq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm2, xm1, xm0, 0x03 ; 0 1
+ vpbroadcastq xm0, [srcq+ssq*0]
+ vpblendd xm1, xm1, xm0, 0x0c ; 1 2
+ punpcklbw xm3, xm1, xm2
+ punpckhbw xm1, xm2
+ pmaddubsw xm3, xm6
+ pmaddubsw xm1, xm6
+ pmulhrsw xm3, xm7
+ pmulhrsw xm1, xm7
+ packuswb xm3, xm1
+ movq [dstq+dsq*0], xm3
+ movhps [dstq+dsq*1], xm3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu xm0, [srcq+ssq*0]
+.v_w16_loop:
+ vbroadcasti128 m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m3, m2, m0, 0x0f ; 0 1
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vpblendd m2, m2, m0, 0xf0 ; 1 2
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m6
+ pmaddubsw m2, m6
+ pmulhrsw m1, m7
+ pmulhrsw m2, m7
+ packuswb m1, m2
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+%macro PUT_BILIN_V_W32 0
+ movu m0, [srcq+ssq*0]
+%%loop:
+ movu m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m1, m4, m0
+ punpckhbw m3, m4, m0
+ movu m0, [srcq+ssq*0]
+ punpcklbw m2, m0, m4
+ punpckhbw m4, m0, m4
+ pmaddubsw m1, m6
+ pmaddubsw m3, m6
+ pmaddubsw m2, m6
+ pmaddubsw m4, m6
+ pmulhrsw m1, m7
+ pmulhrsw m3, m7
+ pmulhrsw m2, m7
+ pmulhrsw m4, m7
+ packuswb m1, m3
+ packuswb m2, m4
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg %%loop
+%endmacro
+ PUT_BILIN_V_W32
+ RET
+.v_w64:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+.v_w64_loop:
+ add srcq, ssq
+ movu m3, [srcq+32*0]
+ movu m4, [srcq+32*1]
+ punpcklbw m2, m3, m0
+ punpckhbw m5, m3, m0
+ pmaddubsw m2, m6
+ pmaddubsw m5, m6
+ mova m0, m3
+ pmulhrsw m2, m7
+ pmulhrsw m5, m7
+ packuswb m2, m5
+ punpcklbw m3, m4, m1
+ punpckhbw m5, m4, m1
+ pmaddubsw m3, m6
+ pmaddubsw m5, m6
+ mova m1, m4
+ pmulhrsw m3, m7
+ pmulhrsw m5, m7
+ packuswb m3, m5
+ mova [dstq+32*0], m2
+ mova [dstq+32*1], m3
+ add dstq, dsq
+ dec hd
+ jg .v_w64_loop
+ RET
+.v_w128:
+ mov t0, dstq
+ mov t1, srcq
+ lea t2d, [hq+(3<<8)]
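+ ; t2d = (3 << 8) | h: four 32-byte column strips, h reloaded from the
+ ; low byte between strips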
+.v_w128_loop:
+ PUT_BILIN_V_W32
+ mov hb, t2b
+ add t0, 32
+ add t1, 32
+ mov dstq, t0
+ mov srcq, t1
+ sub t2d, 1<<8
+ jg .v_w128_loop
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+ ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
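+ ; my is applied as (my << 11) (a shift by 12 would overflow, see below), so
+ ; the difference is doubled with paddw before pmulhw to get (d * my) >> 4;
+ ; the final pmulhrsw by 2048 performs the (x + 8) >> 4 rounding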
+ movzx wd, word [t2+wq*2+table_offset(put, _bilin_hv)]
+ shl mxyd, 11 ; can't shift by 12 due to signed overflow
+ vpbroadcastd m7, [pw_2048]
+ movd xm6, mxyd
+ add wq, t2
+ vpbroadcastw m6, xm6
+ jmp wq
+.hv_w2:
+ vpbroadcastd xm0, [srcq+ssq*0]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+.hv_w2_loop:
+ movd xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pinsrd xm1, [srcq+ssq*0], 1
+ pshufb xm1, xm4
+ pmaddubsw xm1, xm5 ; 1 _ 2 _
+ shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _
+ mova xm0, xm1
+ psubw xm1, xm2
+ paddw xm1, xm1
+ pmulhw xm1, xm6
+ paddw xm1, xm2
+ pmulhrsw xm1, xm7
+ packuswb xm1, xm1
+ pextrw [dstq+dsq*0], xm1, 0
+ pextrw [dstq+dsq*1], xm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova xm4, [bilin_h_shuf4]
+ movddup xm0, [srcq+ssq*0]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+.hv_w4_loop:
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm1, [srcq+ssq*0]
+ pshufb xm1, xm4
+ pmaddubsw xm1, xm5 ; 1 2
+ shufps xm2, xm0, xm1, q1032 ; 0 1
+ mova xm0, xm1
+ psubw xm1, xm2
+ paddw xm1, xm1
+ pmulhw xm1, xm6
+ paddw xm1, xm2
+ pmulhrsw xm1, xm7
+ packuswb xm1, xm1
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 m0, [srcq+ssq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m1, m1, [srcq+ssq*0], 1
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2
+ vperm2i128 m2, m0, m1, 0x21 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ vextracti128 xm2, m1, 1
+ packuswb xm1, xm2
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ movu m0, [srcq+ssq*0+8*0]
+ vinserti128 m0, m0, [srcq+ssq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu xm2, [srcq+ssq*1+8*0]
+ vinserti128 m2, m2, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ movu xm3, [srcq+ssq*0+8*0]
+ vinserti128 m3, m3, [srcq+ssq*0+8*1], 1
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ psubw m1, m2, m0
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m0
+ pmaddubsw m0, m3, m5
+ psubw m3, m0, m2
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m2
+ pmulhrsw m1, m7
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ vpermq m1, m1, q3120
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+%macro PUT_BILIN_HV_W32 0
+ movu m0, [srcq+8*0]
+ vinserti128 m0, m0, [srcq+8*2], 1
+ movu m1, [srcq+8*1]
+ vinserti128 m1, m1, [srcq+8*3], 1
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+%%loop:
+ add srcq, ssq
+ movu xm2, [srcq+8*1]
+ vinserti128 m2, m2, [srcq+8*3], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m3, m2, m1
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m1
+ mova m1, m2
+ pmulhrsw m8, m3, m7
+ASSERT UNIX64 ; using an additional vector register here
+ movu xm2, [srcq+8*0]
+ vinserti128 m2, m2, [srcq+8*2], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m3, m2, m0
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m0
+ mova m0, m2
+ pmulhrsw m3, m7
+ packuswb m3, m8
+ mova [dstq], m3
+ add dstq, dsq
+ dec hd
+ jg %%loop
+%endmacro
+ PUT_BILIN_HV_W32
+ RET
+.hv_w64:
+ mov t0, dstq
+ mov t1, srcq
+ lea t2d, [hq+(1<<8)]
+.hv_w64_loop:
+ PUT_BILIN_HV_W32
+ mov hb, t2b
+ add t0, 32
+ add t1, 32
+ mov dstq, t0
+ mov srcq, t1
+ sub t2d, 1<<8
+ jg .hv_w64_loop
+ RET
+.hv_w128:
+ mov t0, dstq
+ mov t1, srcq
+ lea t2d, [hq+(3<<8)]
+.hv_w128_loop:
+ PUT_BILIN_HV_W32
+ mov hb, t2b
+ add t0, 32
+ add t1, 32
+ mov dstq, t0
+ mov srcq, t1
+ sub t2d, 1<<8
+ jg .hv_w128_loop
+ RET
+
+DECLARE_REG_TMP 3, 5, 6
+cglobal prep_bilin, 3, 7, 7, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea t2, [prep_avx2]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [t2+wq*2+table_offset(prep,)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movd xm0, [srcq+strideq*0]
+ pinsrd xm0, [srcq+strideq*1], 1
+ pinsrd xm0, [srcq+strideq*2], 2
+ pinsrd xm0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, xm0
+ psllw m0, 4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ movq xm1, [srcq+strideq*2]
+ movhps xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, xm0
+ pmovzxbw m1, xm1
+ psllw m0, 4
+ psllw m1, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ pmovzxbw m0, [srcq+strideq*0]
+ pmovzxbw m1, [srcq+strideq*1]
+ pmovzxbw m2, [srcq+strideq*2]
+ pmovzxbw m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmovzxbw m0, [srcq+strideq*0+16*0]
+ pmovzxbw m1, [srcq+strideq*0+16*1]
+ pmovzxbw m2, [srcq+strideq*1+16*0]
+ pmovzxbw m3, [srcq+strideq*1+16*1]
+ lea srcq, [srcq+strideq*2]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmovzxbw m0, [srcq+16*0]
+ pmovzxbw m1, [srcq+16*1]
+ pmovzxbw m2, [srcq+16*2]
+ pmovzxbw m3, [srcq+16*3]
+ add srcq, strideq
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmovzxbw m0, [srcq+16*0]
+ pmovzxbw m1, [srcq+16*1]
+ pmovzxbw m2, [srcq+16*2]
+ pmovzxbw m3, [srcq+16*3]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ pmovzxbw m0, [srcq+16*4]
+ pmovzxbw m1, [srcq+16*5]
+ pmovzxbw m2, [srcq+16*6]
+ pmovzxbw m3, [srcq+16*7]
+ add tmpq, 32*8
+ add srcq, strideq
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
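+ ; prep stores unrounded 16-bit intermediates scaled by 16 (cf. the psllw 4
+ ; in the unfiltered .prep_w* paths) for the compound functions to consume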
+ imul mxyd, 0xff01
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16 << 8
+ movd xm5, mxyd
+ mov mxyd, r6m ; my
+ vpbroadcastw m5, xm5
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ vbroadcasti128 m4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ movq xm1, [srcq+strideq*2]
+ movhps xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m0, m0, xm1, 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*2]
+ vinserti128 m1, m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, m0, [srcq+strideq*0+8*1], 1
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, m1, [srcq+strideq*1+8*1], 1
+ movu xm2, [srcq+strideq*2+8*0]
+ vinserti128 m2, m2, [srcq+strideq*2+8*1], 1
+ movu xm3, [srcq+stride3q +8*0]
+ vinserti128 m3, m3, [srcq+stride3q +8*1], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .h_w16
+ RET
+.h_w32:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, m0, [srcq+strideq*0+8*1], 1
+ movu xm1, [srcq+strideq*0+8*2]
+ vinserti128 m1, m1, [srcq+strideq*0+8*3], 1
+ movu xm2, [srcq+strideq*1+8*0]
+ vinserti128 m2, m2, [srcq+strideq*1+8*1], 1
+ movu xm3, [srcq+strideq*1+8*2]
+ vinserti128 m3, m3, [srcq+strideq*1+8*3], 1
+ lea srcq, [srcq+strideq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, m1, [srcq+8*3], 1
+ movu xm2, [srcq+8*4]
+ vinserti128 m2, m2, [srcq+8*5], 1
+ movu xm3, [srcq+8*6]
+ vinserti128 m3, m3, [srcq+8*7], 1
+ add srcq, strideq
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, m1, [srcq+8*3], 1
+ movu xm2, [srcq+8*4]
+ vinserti128 m2, m2, [srcq+8*5], 1
+ movu xm3, [srcq+8*6]
+ vinserti128 m3, m3, [srcq+8*7], 1
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ movu xm0, [srcq+8* 8]
+ vinserti128 m0, m0, [srcq+8* 9], 1
+ movu xm1, [srcq+8*10]
+ vinserti128 m1, m1, [srcq+8*11], 1
+ movu xm2, [srcq+8*12]
+ vinserti128 m2, m2, [srcq+8*13], 1
+ movu xm3, [srcq+8*14]
+ vinserti128 m3, m3, [srcq+8*15], 1
+ add tmpq, 32*8
+ add srcq, strideq
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
+ imul mxyd, 0xff01
+ add mxyd, 16 << 8
+ add wq, t2
+ lea stride3q, [strideq*3]
+ movd xm6, mxyd
+ vpbroadcastw m6, xm6
+ jmp wq
+.v_w4:
+ movd xm0, [srcq+strideq*0]
+.v_w4_loop:
+ vpbroadcastd m1, [srcq+strideq*2]
+ vpbroadcastd xm2, [srcq+strideq*1]
+ vpbroadcastd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m1, m0, 0x05 ; 0 2 2 2
+ vpbroadcastd m0, [srcq+strideq*0]
+ vpblendd m3, m3, m2, 0x0f ; 1 1 3 3
+ vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4
+ vpblendd m1, m1, m3, 0xaa ; 0 1 2 3
+ vpblendd m2, m2, m3, 0x55 ; 1 2 3 4
+ punpcklbw m2, m1
+ pmaddubsw m2, m6
+ mova [tmpq], m2
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm0, [srcq+strideq*0]
+.v_w8_loop:
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpbroadcastq m2, [srcq+strideq*1]
+ vpbroadcastq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m1, m0, 0x03 ; 0 2 2 2
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m3, m3, m2, 0x33 ; 1 3 1 3
+ vpblendd m2, m1, m3, 0x0f ; 1 3 2 2
+ vpblendd m1, m1, m3, 0xf0 ; 0 2 1 3
+ vpblendd m2, m2, m0, 0xc0 ; 1 3 2 4
+ punpcklbw m3, m2, m1
+ punpckhbw m2, m1
+ pmaddubsw m3, m6
+ pmaddubsw m2, m6
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*1], m2
+ add tmpq, 32*2
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ vbroadcasti128 m0, [srcq+strideq*0]
+.v_w16_loop:
+ vbroadcasti128 m1, [srcq+strideq*2]
+ vbroadcasti128 m2, [srcq+strideq*1]
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ shufpd m4, m0, m1, 0x0c ; 0 2
+ vbroadcasti128 m0, [srcq+strideq*0]
+ shufpd m2, m2, m3, 0x0c ; 1 3
+ shufpd m1, m1, m0, 0x0c ; 2 4
+ punpcklbw m3, m2, m4
+ punpcklbw m5, m1, m2
+ punpckhbw m1, m2
+ punpckhbw m2, m4
+ pmaddubsw m3, m6
+ pmaddubsw m5, m6
+ pmaddubsw m2, m6
+ pmaddubsw m1, m6
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*1], m5
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m1
+ add tmpq, 32*4
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ vpermq m0, [srcq+strideq*0], q3120
+.v_w32_loop:
+ vpermq m1, [srcq+strideq*1], q3120
+ vpermq m2, [srcq+strideq*2], q3120
+ vpermq m3, [srcq+stride3q ], q3120
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m4, m1, m0
+ punpckhbw m5, m1, m0
+ vpermq m0, [srcq+strideq*0], q3120
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*0], m4
+ mova [tmpq+32*1], m5
+ punpcklbw m4, m2, m1
+ punpckhbw m5, m2, m1
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m5
+ add tmpq, 32*8
+ punpcklbw m4, m3, m2
+ punpckhbw m5, m3, m2
+ punpcklbw m1, m0, m3
+ punpckhbw m2, m0, m3
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ pmaddubsw m1, m6
+ pmaddubsw m2, m6
+ mova [tmpq-32*4], m4
+ mova [tmpq-32*3], m5
+ mova [tmpq-32*2], m1
+ mova [tmpq-32*1], m2
+ sub hd, 4
+ jg .v_w32_loop
+ RET
+.v_w64:
+ vpermq m0, [srcq+strideq*0+32*0], q3120
+ vpermq m1, [srcq+strideq*0+32*1], q3120
+.v_w64_loop:
+ vpermq m2, [srcq+strideq*1+32*0], q3120
+ vpermq m3, [srcq+strideq*1+32*1], q3120
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m2, m0
+ punpckhbw m5, m2, m0
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*0], m4
+ mova [tmpq+32*1], m5
+ punpcklbw m4, m3, m1
+ punpckhbw m5, m3, m1
+ vpermq m0, [srcq+strideq*0+32*0], q3120
+ vpermq m1, [srcq+strideq*0+32*1], q3120
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m5
+ add tmpq, 32*8
+ punpcklbw m4, m0, m2
+ punpckhbw m5, m0, m2
+ punpcklbw m2, m1, m3
+ punpckhbw m3, m1, m3
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ mova [tmpq-32*4], m4
+ mova [tmpq-32*3], m5
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ mov t0, tmpq
+ mov t1, srcq
+ lea t2d, [hq+(3<<8)]
+.v_w128_loop0:
+ vpermq m0, [srcq+strideq*0], q3120
+.v_w128_loop:
+ vpermq m1, [srcq+strideq*1], q3120
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m2, m1, m0
+ punpckhbw m3, m1, m0
+ vpermq m0, [srcq+strideq*0], q3120
+ punpcklbw m4, m0, m1
+ punpckhbw m5, m0, m1
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*0], m2
+ mova [tmpq+32*1], m3
+ mova [tmpq+32*8], m4
+ mova [tmpq+32*9], m5
+ add tmpq, 32*16
+ sub hd, 2
+ jg .v_w128_loop
+ mov hb, t2b
+ add t0, 64
+ add t1, 32
+ mov tmpq, t0
+ mov srcq, t1
+ sub t2d, 1<<8
+ jg .v_w128_loop0
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
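+ ; pmulhrsw by (my << 11) computes (x * my + 8) >> 4, matching the formula above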
+ movd xm6, mxyd
+ add wq, t2
+ lea stride3q, [strideq*3]
+ vpbroadcastw m6, xm6
+ jmp wq
+.hv_w4:
+ vbroadcasti128 m4, [bilin_h_shuf4]
+ vpbroadcastq m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w4_loop:
+ movq xm1, [srcq+strideq*1]
+ movhps xm1, [srcq+strideq*2]
+ movq xm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movhps xm2, [srcq+strideq*0]
+ vinserti128 m1, m1, xm2, 1
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2 3 4
+ vpblendd m2, m1, m0, 0xc0
+ vpermq m2, m2, q2103 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xm1, [srcq+strideq*1]
+ vinserti128 m1, m1, [srcq+strideq*2], 1
+ movu xm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m2, m2, [srcq+strideq*0], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5 ; 1 2
+ vperm2i128 m3, m0, m1, 0x21 ; 0 1
+ pmaddubsw m0, m2, m5 ; 3 4
+ vperm2i128 m2, m1, m0, 0x21 ; 2 3
+ psubw m1, m3
+ pmulhrsw m1, m6
+ paddw m1, m3
+ psubw m3, m0, m2
+ pmulhrsw m3, m6
+ paddw m3, m2
+ mova [tmpq+32*0], m1
+ mova [tmpq+32*1], m3
+ add tmpq, 32*2
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ movu m0, [srcq+strideq*0+8*0]
+ vinserti128 m0, m0, [srcq+strideq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, m2, [srcq+strideq*0+8*1], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*1], m2
+ add tmpq, 32*2
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ movu m0, [srcq+8*0]
+ vinserti128 m0, m0, [srcq+8*1], 1
+ movu m1, [srcq+8*2]
+ vinserti128 m1, m1, [srcq+8*3], 1
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w32_loop:
+ add srcq, strideq
+ movu xm2, [srcq+8*0]
+ vinserti128 m2, m2, [srcq+8*1], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m3, m2, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ mova m0, m2
+ mova [tmpq+ 0], m3
+ movu xm2, [srcq+8*2]
+ vinserti128 m2, m2, [srcq+8*3], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m3, m2, m1
+ pmulhrsw m3, m6
+ paddw m3, m1
+ mova m1, m2
+ mova [tmpq+32], m3
+ add tmpq, 32*2
+ dec hd
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ mov t0, tmpq
+ mov t1, srcq
+ lea t2d, [hq+(3<<8)]
+.hv_w64_loop0:
+ movu m0, [srcq+strideq*0+8*0]
+ vinserti128 m0, m0, [srcq+strideq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w64_loop:
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, m2, [srcq+strideq*0+8*1], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+32*0], m3
+ add tmpq, 32*8
+ mova [tmpq-32*4], m2
+ sub hd, 2
+ jg .hv_w64_loop
+ mov hb, t2b
+ add t0, 32
+ add t1, 16
+ mov tmpq, t0
+ mov srcq, t1
+ sub t2d, 1<<8
+ jg .hv_w64_loop0
+ RET
+.hv_w128:
+ mov t0, tmpq
+ mov t1, srcq
+ lea t2d, [hq+(7<<8)]
+.hv_w128_loop0:
+ movu m0, [srcq+strideq*0+8*0]
+ vinserti128 m0, m0, [srcq+strideq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w128_loop:
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, m2, [srcq+strideq*0+8*1], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*8], m2
+ add tmpq, 32*16
+ sub hd, 2
+ jg .hv_w128_loop
+ mov hb, t2b
+ add t0, 32
+ add t1, 16
+ mov tmpq, t0
+ mov srcq, t1
+ sub t2d, 1<<8
+ jg .hv_w128_loop0
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
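+; packed as (8-tap filter set << 16) | (4-tap filter set); the 4-tap sets
+; (3 and 4) are used for w <= 4 horizontally and h <= 4 vertically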
+
+DECLARE_REG_TMP 7, 8
+%macro PUT_8TAP_FN 3 ; type, type_h, type_v
+cglobal put_8tap_%1
+ mov t0d, FILTER_%2
+ mov t1d, FILTER_%3
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX)
+%endif
+%endmacro
+
+PUT_8TAP_FN regular, REGULAR, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+
+cglobal put_8tap, 4, 9, 16, dst, ds, src, ss, w, h, mx, my, ss3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
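+ ; multiplying by 0x010101 replicates mx/my into three bytes, so a single add
+ ; produces both the 8-tap index (bits 16-23) and the 4-tap index (bits 0-7)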
+ lea r8, [put_avx2]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+ lea r6, [ssq*3]
+ lea r7, [dsq*3]
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
+ cmp wd, 4
+ jl .h_w2
+ vbroadcasti128 m6, [subpel_h_shufA]
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m7, [subpel_h_shufB]
+ vbroadcasti128 m8, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
+ vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0]
+ vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4]
+ add wq, r8
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ dec srcq
+ mova xm4, [subpel_h_shuf4]
+ vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w2_loop:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm3
+ phaddw xm0, xm0
+ paddw xm0, xm5
+ psraw xm0, 6
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w4_loop:
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm6
+ pshufb xm1, xm6
+ pmaddubsw xm0, xm3
+ pmaddubsw xm1, xm3
+ phaddw xm0, xm1
+ paddw xm0, xm5
+ psraw xm0, 6
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
+ pshufb m%2, m%1, m7
+ pshufb m%3, m%1, m8
+ pshufb m%1, m6
+ pmaddubsw m%4, m%2, m9
+ pmaddubsw m%2, m10
+ pmaddubsw m%3, m10
+ pmaddubsw m%1, m9
+ paddw m%3, m%4
+ paddw m%1, m%2
+ phaddw m%1, m%3
+ paddw m%1, m5
+ psraw m%1, 6
+%endmacro
+ movu xm0, [srcq+ssq*0]
+ vinserti128 m0, m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 1, 2, 3
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+ssq*0+8*0]
+ vinserti128 m0, m0, [srcq+ssq*1+8*0], 1
+ movu xm1, [srcq+ssq*0+8*1]
+ vinserti128 m1, m1, [srcq+ssq*1+8*1], 1
+ PUT_8TAP_H 0, 2, 3, 4
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 1, 2, 3, 4
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ xor r6d, r6d
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3
+.h_start:
+ sub srcq, r6
+ sub dstq, r6
+ mov r4, r6
+.h_loop:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 2, 3, 4
+ packuswb m0, m1
+ mova [dstq+r6], m0
+ add r6, 32
+ jle .h_loop
+ add srcq, ssq
+ add dstq, dsq
+ mov r6, r4
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ tzcnt r6d, wd
+ movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
+ vpbroadcastd m7, [pw_512]
+ lea myq, [r8+myq*8+subpel_filters-put_avx2]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ add r6, r8
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ jmp r6
+.v_w2:
+ movd xm2, [srcq+ssq*0]
+ pinsrw xm2, [srcq+ssq*1], 2
+ pinsrw xm2, [srcq+ssq*2], 4
+ pinsrw xm2, [srcq+ss3q ], 6 ; 0 1 2 3
+ lea srcq, [srcq+ssq*4]
+ movd xm3, [srcq+ssq*0]
+ vpbroadcastd xm1, [srcq+ssq*1]
+ vpbroadcastd xm0, [srcq+ssq*2]
+ add srcq, ss3q
+ vpblendd xm3, xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklbw xm3, xm1 ; 45 56
+ punpcklbw xm1, xm2, xm4 ; 01 12
+ punpckhbw xm2, xm4 ; 23 34
+.v_w2_loop:
+ pmaddubsw xm5, xm1, xm8 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm9 ; a1 b1
+ paddw xm5, xm2
+ mova xm2, xm3
+ pmaddubsw xm3, xm10 ; a2 b2
+ paddw xm5, xm3
+ vpbroadcastd xm4, [srcq+ssq*0]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm4, xm4, xm0, 0x02 ; 7 8
+ punpcklbw xm3, xm4 ; 67 78
+ pmaddubsw xm4, xm3, xm11 ; a3 b3
+ paddw xm5, xm4
+ pmulhrsw xm5, xm7
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xm2, [srcq+ssq*0]
+ pinsrd xm2, [srcq+ssq*1], 1
+ pinsrd xm2, [srcq+ssq*2], 2
+ pinsrd xm2, [srcq+ss3q ], 3 ; 0 1 2 3
+ lea srcq, [srcq+ssq*4]
+ movd xm3, [srcq+ssq*0]
+ vpbroadcastd xm1, [srcq+ssq*1]
+ vpbroadcastd xm0, [srcq+ssq*2]
+ add srcq, ss3q
+ vpblendd xm3, xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklbw xm3, xm1 ; 45 56
+ punpcklbw xm1, xm2, xm4 ; 01 12
+ punpckhbw xm2, xm4 ; 23 34
+.v_w4_loop:
+ pmaddubsw xm5, xm1, xm8 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm9 ; a1 b1
+ paddw xm5, xm2
+ mova xm2, xm3
+ pmaddubsw xm3, xm10 ; a2 b2
+ paddw xm5, xm3
+ vpbroadcastd xm4, [srcq+ssq*0]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm4, xm4, xm0, 0x02 ; 7 8
+ punpcklbw xm3, xm4 ; 67 78
+ pmaddubsw xm4, xm3, xm11 ; a3 b3
+ paddw xm5, xm4
+ pmulhrsw xm5, xm7
+ packuswb xm5, xm5
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m2, [srcq+ssq*2]
+ vpbroadcastq m5, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastq m3, [srcq+ssq*0]
+ vpbroadcastq m6, [srcq+ssq*1]
+ vpbroadcastq m0, [srcq+ssq*2]
+ add srcq, ss3q
+ vpblendd m1, m1, m4, 0x30
+ vpblendd m4, m4, m2, 0x30
+ punpcklbw m1, m4 ; 01 12
+ vpblendd m2, m2, m5, 0x30
+ vpblendd m5, m5, m3, 0x30
+ punpcklbw m2, m5 ; 23 34
+ vpblendd m3, m3, m6, 0x30
+ vpblendd m6, m6, m0, 0x30
+ punpcklbw m3, m6 ; 45 56
+.v_w8_loop:
+ pmaddubsw m5, m1, m8 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, m9 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, m10 ; a2 b2
+ paddw m5, m3
+ vpbroadcastq m4, [srcq+ssq*0]
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m4, m4, m0, 0x30
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, m11 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ vextracti128 xm4, m5, 1
+ packuswb xm5, xm4
+ movq [dstq+dsq*0], xm5
+ movhps [dstq+dsq*1], xm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ lea r6d, [wq-16]
+ mov r4, dstq
+ mov r7, srcq
+ shl r6d, 4
+ mov r6b, hb
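+ ; r6d = ((w/16 - 1) << 8) | h: iterate over 16-pixel-wide column strips,
+ ; restoring h from the low byte after each one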
+.v_w16_loop0:
+ vbroadcasti128 m4, [srcq+ssq*0]
+ vbroadcasti128 m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m0, [srcq+ssq*1]
+ vbroadcasti128 m6, [srcq+ssq*0]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m1, [srcq+ssq*0]
+ vbroadcasti128 m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m3, [srcq+ssq*0]
+ shufpd m4, m4, m0, 0x0c
+ shufpd m5, m5, m1, 0x0c
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m6, m2, 0x0c
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m0, m3, 0x0c
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w16_loop:
+ vbroadcasti128 m12, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m13, [srcq+ssq*0]
+ pmaddubsw m14, m1, m8 ; a0
+ pmaddubsw m15, m2, m8 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, m9 ; a1
+ pmaddubsw m4, m9 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, m10 ; a2
+ pmaddubsw m6, m10 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, m11 ; a3
+ pmaddubsw m13, m6, m11 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ packuswb m14, m15
+ vpermq m14, m14, q3120
+ mova [dstq+dsq*0], xm14
+ vextracti128 [dstq+dsq*1], m14, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ mov hb, r6b
+ add r4, 16
+ add r7, 16
+ mov dstq, r4
+ mov srcq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
+.hv:
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m8, [pw_8192]
+ vpbroadcastd m9, [pd_512]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 m6, [subpel_h_shuf4]
+ movq xm2, [srcq+ssq*0]
+ movhps xm2, [srcq+ssq*1]
+ movq xm0, [srcq+ssq*2]
+ movhps xm0, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastq m3, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpblendd m2, m2, m3, 0x30
+ vpblendd m0, m0, m1, 0x30
+ vpblendd m2, m2, m4, 0xc0
+ pshufb m2, m6
+ pshufb m0, m6
+ pmaddubsw m2, m7
+ pmaddubsw m0, m7
+ phaddw m2, m0
+ pmulhrsw m2, m8
+ vextracti128 xm3, m2, 1
+ palignr xm4, xm3, xm2, 4
+ punpcklwd xm1, xm2, xm4 ; 01 12
+ punpckhwd xm2, xm4 ; 23 34
+ pshufd xm0, xm3, q2121
+ punpcklwd xm3, xm0 ; 45 56
+.hv_w2_loop:
+ pmaddwd xm5, xm1, xm10 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm11 ; a1 b1
+ paddd xm5, xm2
+ mova xm2, xm3
+ pmaddwd xm3, xm12 ; a2 b2
+ paddd xm5, xm3
+ movq xm4, [srcq+ssq*0]
+ movhps xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm4, xm6
+ pmaddubsw xm4, xm7
+ phaddw xm4, xm4
+ pmulhrsw xm4, xm8
+ palignr xm3, xm4, xm0, 12
+ mova xm0, xm4
+ punpcklwd xm3, xm0 ; 67 78
+ pmaddwd xm4, xm3, xm13 ; a3 b3
+ paddd xm5, xm9
+ paddd xm5, xm4
+ psrad xm5, 10
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova m6, [subpel_h_shuf4]
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m0, [srcq+ssq*2]
+ vpbroadcastq m5, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastq m3, [srcq+ssq*0]
+ vpblendd m2, m2, m4, 0xcc ; 0 1
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpblendd m0, m0, m5, 0xcc ; 2 3
+ vpblendd m3, m3, m4, 0xcc ; 4 5
+ pshufb m2, m6
+ pshufb m0, m6
+ pshufb m3, m6
+ pshufb m1, m6
+ pmaddubsw m2, m7
+ pmaddubsw m0, m7
+ pmaddubsw m3, m7
+ pmaddubsw m1, m7
+ phaddw m2, m0
+ phaddw m3, m1
+ pmulhrsw m2, m8
+ pmulhrsw m3, m8
+ palignr m4, m3, m2, 4
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ pshufd m0, m3, q2121
+ punpcklwd m3, m0 ; 45 56
+.hv_w4_loop:
+ pmaddwd m5, m1, m10 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m11 ; a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m12 ; a2 b2
+ paddd m5, m3
+ vpbroadcastq m4, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m4, m4, m3, 0xcc ; 7 8
+ pshufb m4, m6
+ pmaddubsw m4, m7
+ phaddw m4, m4
+ pmulhrsw m4, m8
+ palignr m3, m4, m0, 12
+ mova m0, m4
+ punpcklwd m3, m0 ; 67 78
+ pmaddwd m4, m3, m13 ; a3 b3
+ paddd m5, m9
+ paddd m5, m4
+ psrad m5, 10
+ vextracti128 xm4, m5, 1
+ packssdw xm5, xm4
+ packuswb xm5, xm5
+ pshuflw xm5, xm5, q3120
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0]
+ vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ lea r6d, [wq-8]
+ mov r4, dstq
+ mov r7, srcq
+ shl r6d, 5
+ mov r6b, hb
+.hv_w8_loop0:
+ vbroadcasti128 m7, [subpel_h_shufA]
+ vbroadcasti128 m8, [subpel_h_shufB]
+ vbroadcasti128 m9, [subpel_h_shufC]
+ movu xm4, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movu xm6, [srcq+ssq*0]
+ vbroadcasti128 m0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m4, m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, m5, [srcq+ssq*0], 1 ; 1 4
+ vinserti128 m6, m6, [srcq+ssq*1], 1 ; 2 5
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m0, m0, [srcq+ssq*0], 1 ; 3 6
+%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+ pshufb %3, %1, %6
+ pshufb %4, %1, %7
+ pshufb %1, %5
+ pmaddubsw %2, %3, m10
+ pmaddubsw %4, m11
+ pmaddubsw %3, m11
+ pmaddubsw %1, m10
+ paddw %2, %4
+ paddw %1, %3
+ phaddw %1, %2
+%endmacro
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ vpbroadcastd m7, [pw_8192]
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ pmulhrsw m0, m7
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ vpermq m7, m0, q3120
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vextracti128 r6m, m0, 1 ; not enough registers
+ movu xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m0, m0, [srcq+ssq*0], 1 ; 7 8
+ pmaddwd m8, m1, m12 ; a0
+ pmaddwd m9, m2, m12 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m13 ; a1
+ pmaddwd m4, m13 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m14 ; a2
+ pmaddwd m6, m14 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ vpbroadcastd m5, [pw_8192]
+ vpbroadcastd m7, [pd_512]
+ vbroadcasti128 m6, r6m
+ pmulhrsw m0, m5
+ paddd m8, m7
+ paddd m9, m7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, m15 ; a3
+ paddd m8, m7
+ pmaddwd m7, m6, m15 ; b3
+ paddd m7, m9
+ psrad m8, 10
+ psrad m7, 10
+ packssdw m8, m7
+ vextracti128 xm7, m8, 1
+ packuswb xm8, xm7
+ pshufd xm7, xm8, q3120
+ movq [dstq+dsq*0], xm7
+ movhps [dstq+dsq*1], xm7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ mov hb, r6b
+ add r4, 8
+ add r7, 8
+ mov dstq, r4
+ mov srcq, r7
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
+DECLARE_REG_TMP 6, 7
+%macro PREP_8TAP_FN 3 ; type, type_h, type_v
+cglobal prep_8tap_%1
+ mov t0d, FILTER_%2
+ mov t1d, FILTER_%3
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
+%endif
+%endmacro
+
+PREP_8TAP_FN regular, REGULAR, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+
+cglobal prep_8tap, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep_avx2]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ add wq, r7
+ lea r6, [strideq*3]
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vbroadcasti128 m5, [subpel_h_shufA]
+ vpbroadcastd m4, [pw_8192]
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx2+0]
+ vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx2+4]
+ add wq, r7
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m3, [r7+mxq*8+subpel_filters-prep_avx2+2]
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq xm0, [srcq+strideq*0]
+ vpbroadcastq m2, [srcq+strideq*2]
+ movq xm1, [srcq+strideq*1]
+ vpblendd m0, m0, m2, 0xf0
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m1, m2, 0xf0
+ pshufb m0, m5
+ pshufb m1, m5
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ phaddw m0, m1
+ pmulhrsw m0, m4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+%macro PREP_8TAP_H 0
+ pshufb m1, m0, m6
+ pshufb m2, m0, m7
+ pshufb m0, m5
+ pmaddubsw m3, m1, m8
+ pmaddubsw m1, m9
+ pmaddubsw m2, m9
+ pmaddubsw m0, m8
+ paddw m2, m3
+ paddw m0, m1
+ phaddw m0, m2
+ pmulhrsw m0, m4
+%endmacro
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, m0, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, m0, [srcq+strideq*0+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+strideq*1+8*0]
+ vinserti128 m0, m0, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 64
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ xor r6d, r6d
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3
+.h_start:
+ sub srcq, r6
+ mov r5, r6
+.h_loop:
+ movu xm0, [srcq+r6+8*0]
+ vinserti128 m0, m0, [srcq+r6+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+r6+8*2]
+ vinserti128 m0, m0, [srcq+r6+8*3], 1
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 64
+ add r6, 32
+ jle .h_loop
+ add srcq, strideq
+ mov r6, r5
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
+ shr myd, 16 ; Note that the code is 8-tap only; having
+ cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
+ cmove myd, mxd ; had a negligible effect on performance.
+ ; TODO: Would a 6-tap code path be worth it?
+ vpbroadcastd m7, [pw_8192]
+ lea myq, [r7+myq*8+subpel_filters-prep_avx2]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ cmp wd, 8
+ jg .v_w16
+ je .v_w8
+.v_w4:
+ movd xm0, [srcq+strideq*0]
+ vpbroadcastd m1, [srcq+strideq*2]
+ vpbroadcastd xm2, [srcq+strideq*1]
+ vpbroadcastd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _
+ vpblendd m3, m3, m2, 0x03 ; 1 1 3 3 3 3 _ _
+ vpbroadcastd m0, [srcq+strideq*0]
+ vpbroadcastd m2, [srcq+strideq*1]
+ vpblendd m1, m1, m0, 0x68 ; 0 2 2 4 2 4 4 _
+ vpbroadcastd m0, [srcq+strideq*2]
+ vbroadcasti128 m6, [deint_shuf4]
+ vpblendd m3, m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5
+ vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5
+ vpblendd m3, m3, m1, 0xaa ; 1 2 3 4 3 4 5 _
+ punpcklbw m1, m2, m3 ; 01 12 23 34
+ vpblendd m3, m3, m0, 0x80 ; 1 2 3 4 3 4 5 6
+ punpckhbw m2, m3 ; 23 34 45 56
+.v_w4_loop:
+ pinsrd xm0, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastd m3, [srcq+strideq*0]
+ vpbroadcastd m4, [srcq+strideq*1]
+ vpblendd m3, m3, m4, 0x20 ; _ _ 8 _ 8 9 _ _
+ vpblendd m3, m3, m0, 0x03 ; 6 7 8 _ 8 9 _ _
+ vpbroadcastd m0, [srcq+strideq*2]
+ vpblendd m3, m3, m0, 0x40 ; 6 7 8 _ 8 9 a _
+ pshufb m3, m6 ; 67 78 89 9a
+ pmaddubsw m4, m1, m8
+ vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78
+ pmaddubsw m2, m9
+ paddw m4, m2
+ mova m2, m3
+ pmaddubsw m3, m11
+ paddw m3, m4
+ pmaddubsw m4, m1, m10
+ paddw m3, m4
+ pmulhrsw m3, m7
+ mova [tmpq], m3
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+strideq*0]
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m5, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m6, [srcq+strideq*1]
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpblendd m1, m1, m4, 0x30
+ vpblendd m4, m4, m2, 0x30
+ punpcklbw m1, m4 ; 01 12
+ vpblendd m2, m2, m5, 0x30
+ vpblendd m5, m5, m3, 0x30
+ punpcklbw m2, m5 ; 23 34
+ vpblendd m3, m3, m6, 0x30
+ vpblendd m6, m6, m0, 0x30
+ punpcklbw m3, m6 ; 45 56
+.v_w8_loop:
+ vpbroadcastq m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmaddubsw m5, m2, m9 ; a1
+ pmaddubsw m6, m2, m8 ; b0
+ vpblendd m2, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m4, m4, m0, 0x30
+ punpcklbw m2, m4 ; 67 78
+ pmaddubsw m1, m8 ; a0
+ pmaddubsw m4, m3, m9 ; b1
+ paddw m5, m1
+ mova m1, m3
+ pmaddubsw m3, m10 ; a2
+ paddw m6, m4
+ paddw m5, m3
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpblendd m4, m4, m0, 0x30
+ punpcklbw m3, m4 ; 89 9a
+ pmaddubsw m4, m2, m11 ; a3
+ paddw m5, m4
+ pmaddubsw m4, m2, m10 ; b2
+ paddw m6, m4
+ pmaddubsw m4, m3, m11 ; b3
+ paddw m6, m4
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ mova [tmpq+32*0], m5
+ mova [tmpq+32*1], m6
+ add tmpq, 32*2
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ lea r6d, [wq-16]
+ mov r5, tmpq
+ mov r7, srcq
+ shl r6d, 4
+ mov r6b, hb
+.v_w16_loop0:
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m0, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*0]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m1, [srcq+strideq*0]
+ vbroadcasti128 m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+strideq*0]
+ shufpd m4, m4, m0, 0x0c
+ shufpd m5, m5, m1, 0x0c
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m6, m2, 0x0c
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m0, m3, 0x0c
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w16_loop:
+ vbroadcasti128 m12, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m13, [srcq+strideq*0]
+ pmaddubsw m14, m1, m8 ; a0
+ pmaddubsw m15, m2, m8 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, m9 ; a1
+ pmaddubsw m4, m9 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, m10 ; a2
+ pmaddubsw m6, m10 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, m11 ; a3
+ pmaddubsw m13, m6, m11 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ mova [tmpq+wq*0], m14
+ mova [tmpq+wq*2], m15
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w16_loop
+ mov hb, r6b
+ add r5, 32
+ add r7, 16
+ mov tmpq, r5
+ mov srcq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
+.hv:
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ mova m7, [subpel_h_shuf4]
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx2+2]
+ pmovzxbd m9, [deint_shuf4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx2]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m10, [pw_8192]
+ vpbroadcastd m11, [pd_32]
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ vpbroadcastq m2, [srcq+strideq*0]
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpbroadcastq m5, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m6, [srcq+strideq*1]
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpblendd m2, m2, m4, 0xcc ; 0 1
+ vpblendd m0, m0, m5, 0xcc ; 2 3
+ vpblendd m3, m3, m6, 0xcc ; 4 5
+ pshufb m2, m7
+ pshufb m0, m7
+ pshufb m3, m7
+ pshufb m1, m7
+ pmaddubsw m2, m8
+ pmaddubsw m0, m8
+ pmaddubsw m3, m8
+ pmaddubsw m1, m8
+ phaddw m2, m0
+ phaddw m3, m1
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ palignr m4, m3, m2, 4
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ pshufd m0, m3, q2121
+ punpcklwd m3, m0 ; 45 56
+.hv_w4_loop:
+ pmaddwd m5, m1, m12 ; a0 b0
+ pmaddwd m6, m2, m12 ; c0 d0
+ pmaddwd m2, m13 ; a1 b1
+ pmaddwd m4, m3, m13 ; c1 d1
+ mova m1, m3
+ pmaddwd m3, m14 ; a2 b2
+ paddd m5, m2
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ paddd m6, m4
+ paddd m5, m3
+ vpbroadcastq m4, [srcq+strideq*0]
+ vpbroadcastq m3, [srcq+strideq*1]
+ vpblendd m2, m2, m4, 0xcc
+ vpbroadcastq m4, [srcq+strideq*2]
+ vpblendd m3, m3, m4, 0xcc
+ pshufb m2, m7
+ pshufb m3, m7
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ phaddw m2, m3
+ pmulhrsw m2, m10
+ palignr m3, m2, m0, 12
+ mova m0, m2
+ punpcklwd m2, m3, m0 ; 67 78
+ punpckhwd m3, m0 ; 89 9a
+ pmaddwd m4, m2, m14 ; c2 d2
+ paddd m6, m11
+ paddd m5, m11
+ paddd m6, m4
+ pmaddwd m4, m2, m15 ; a3 b3
+ paddd m5, m4
+ pmaddwd m4, m3, m15 ; c3 d3
+ paddd m6, m4
+ psrad m5, 6
+ psrad m6, 6
+ packssdw m5, m6
+ vpermd m5, m9, m5
+ mova [tmpq], m5
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx2+0]
+ vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx2+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx2]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ lea r6d, [wq-8]
+ mov r5, tmpq
+ mov r7, srcq
+ shl r6d, 5
+ mov r6b, hb
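+ ; same packed column/row counter as .v_w16, but per 8-pixel column
+ ; (bits 8+ hold w/8 - 1)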
+.hv_w8_loop0:
+ vbroadcasti128 m7, [subpel_h_shufA]
+ vbroadcasti128 m8, [subpel_h_shufB]
+ vbroadcasti128 m9, [subpel_h_shufC]
+ movu xm4, [srcq+strideq*0]
+ movu xm5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu xm6, [srcq+strideq*0]
+ vbroadcasti128 m0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpblendd m4, m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, m5, [srcq+strideq*0], 1 ; 1 4
+ vinserti128 m6, m6, [srcq+strideq*1], 1 ; 2 5
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m0, m0, [srcq+strideq*0], 1 ; 3 6
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ vpbroadcastd m7, [pw_8192]
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ pmulhrsw m0, m7
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ vpermq m7, m0, q3120
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vextracti128 [tmpq], m0, 1 ; not enough registers
+ movu xm0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m0, m0, [srcq+strideq*0], 1 ; 7 8
+ pmaddwd m8, m1, m12 ; a0
+ pmaddwd m9, m2, m12 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m13 ; a1
+ pmaddwd m4, m13 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m14 ; a2
+ pmaddwd m6, m14 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ vpbroadcastd m5, [pw_8192]
+ vpbroadcastd m7, [pd_32]
+ vbroadcasti128 m6, [tmpq]
+ pmulhrsw m0, m5
+ paddd m8, m7
+ paddd m9, m7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, m15 ; a3
+ paddd m8, m7
+ pmaddwd m7, m6, m15 ; b3
+ paddd m7, m9
+ psrad m8, 6
+ psrad m7, 6
+ packssdw m8, m7
+ vpermq m7, m8, q3120
+ mova [tmpq+wq*0], xm7
+ vextracti128 [tmpq+wq*2], m7, 1
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .hv_w8_loop
+ mov hb, r6b
+ add r5, 16
+ add r7, 8
+ mov tmpq, r5
+ mov srcq, r7
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
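+; BIDIR_FN op: common store/dispatch skeleton for avg, w_avg and mask below.
+; 'op N' produces 32 packed 8-bit pixels in m0 from the intermediate buffers
+; at offset N (in 32-byte units), and op_INC_PTR advances those buffer
+; pointers; the entry code loads the per-width jump table target into wq
+; before jmp wq.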
+%macro BIDIR_FN 1 ; op
+ %1 0
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ cmp hd, 8
+ je .ret
+ %1 2
+ lea dstq, [dstq+strideq*4]
+ vextracti128 xm1, m0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
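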
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.ret:
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w8:
+ vextracti128 xm1, m0, 1
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq ], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq*2]
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ %1_INC_PTR 4
+ %1 0
+ add dstq, strideq
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+32], m0
+ dec hd
+ jg .w64_loop
+ RET
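+ ; 128-wide rows take four op calls; the pointers are advanced once per row
+ ; (by 8 blocks), so the last two calls use negative offsets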
+.w128_loop:
+ %1 0
+ add dstq, strideq
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+0*32], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+1*32], m0
+ %1_INC_PTR 8
+ %1 -4
+ vpermq m0, m0, q3120
+ mova [dstq+2*32], m0
+ %1 -2
+ vpermq m0, m0, q3120
+ mova [dstq+3*32], m0
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
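+ ; (tmp1 + tmp2 + 16) >> 5: pmulhrsw with pw_1024 is a rounded shift,
+ ; (x + 16) >> 5, so the summed intermediates are rounded, downshifted
+ ; and packed back to 8-bit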
+ mova m0, [tmp1q+(%1+0)*mmsize]
+ paddw m0, [tmp2q+(%1+0)*mmsize]
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ paddw m1, [tmp2q+(%1+1)*mmsize]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ packuswb m0, m1
+%endmacro
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+ lea r6, [avg_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m2, [pw_1024+r6-avg_avx2_table]
+ add wq, r6
+ BIDIR_FN AVG
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
+ mova m0, [tmp2q+(%1+0)*mmsize]
+ psubw m2, m0, [tmp1q+(%1+0)*mmsize]
+ mova m1, [tmp2q+(%1+1)*mmsize]
+ psubw m3, m1, [tmp1q+(%1+1)*mmsize]
+ paddw m2, m2 ; compensate for the weight only being half
+ paddw m3, m3 ; of what it should be
+ pmulhw m2, m4
+ pmulhw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+ lea r6, [w_avg_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m0, r6m ; weight
+ movsxd wq, dword [r6+wq*4]
+ pxor m4, m4
+ psllw m0, 11 ; can't shift by 12, sign bit must be preserved
+ psubw m4, m0
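+ ; m4 = -(weight << 11); W_AVG doubles (b - a) to make up for the
+ ; missing bit of the weight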
+ vpbroadcastd m5, [pw_2048+r6-w_avg_avx2_table]
+ add wq, r6
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
+ vpermq m3, [maskq+(%1+0)*(mmsize/2)], q3120
+ mova m0, [tmp2q+(%1+0)*mmsize]
+ psubw m1, m0, [tmp1q+(%1+0)*mmsize]
+ psubb m3, m4, m3
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3
+ punpcklbw m2, m4, m3 ; -m << 9
+ pmulhw m1, m2
+ paddw m0, m1
+ mova m1, [tmp2q+(%1+1)*mmsize]
+ psubw m2, m1, [tmp1q+(%1+1)*mmsize]
+ paddw m2, m2
+ punpckhbw m3, m4, m3
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*mmsize/2
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ lea r7, [mask_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ pxor m4, m4
+ vpbroadcastd m5, [pw_2048+r7-mask_avx2_table]
+ add wq, r7
+ BIDIR_FN MASK
+
+%macro W_MASK_420 2 ; src_offset, mask_out
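+ ; blend the two intermediates using a mask derived from their absolute
+ ; difference; %2 collects the (64 - m) terms (pairwise-summed by phaddw)
+ ; that are later folded into the 2x2-subsampled 4:2:0 mask output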
+ mova m0, [tmp1q+(%1+0)*mmsize]
+ mova m1, [tmp2q+(%1+0)*mmsize]
+ psubw m1, m0
+ pabsw m%2, m1
+ paddw m%2, m6
+ psrlw m%2, 8 ; (abs(tmp1 - tmp2) + 8) >> 8
+ psubusw m%2, m7, m%2 ; 64 - min(m, 64)
+ psllw m2, m%2, 10
+ pmulhw m1, m2
+ paddw m0, m1
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ mova m2, [tmp2q+(%1+1)*mmsize]
+ psubw m2, m1
+ pabsw m3, m2
+ paddw m3, m6
+ psrlw m3, 8
+ psubusw m3, m7, m3
+ phaddw m%2, m3
+ psllw m3, 10
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m0, m8
+ pmulhrsw m1, m8
+ packuswb m0, m1
+%endmacro
+
+cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ lea r7, [w_mask_420_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ vpbroadcastw m0, r7m ; sign
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [pw_8 +r7-w_mask_420_avx2_table]
+ vpbroadcastd m7, [pw_26 +r7-w_mask_420_avx2_table] ; 64 - 38
+ vpbroadcastd m8, [pw_2048 +r7-w_mask_420_avx2_table]
+ vpbroadcastd m9, [pw_258 +r7-w_mask_420_avx2_table] ; 64 * 4 + 2
+ pmovzxbd m10, [deint_shuf4+r7-w_mask_420_avx2_table]
+ psubw m9, m0
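+ ; m9 = 64*4 + 2 - sign; subtracting the accumulated (64 - m) terms and
+ ; shifting right by 2 yields (m1 + m2 + m3 + m4 + 2 - sign) >> 2 per 2x2 block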
+ add wq, r7
+ W_MASK_420 0, 4
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 4
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ cmp hd, 8
+ jg .w4_h16
+.w4_end:
+ vextracti128 xm0, m4, 1
+ vpblendd xm1, xm4, xm0, 0x05
+ vpblendd xm4, xm4, xm0, 0x0a
+ pshufd xm1, xm1, q2301
+ psubw xm4, xm9, xm4
+ psubw xm4, xm1
+ psrlw xm4, 2
+ packuswb xm4, xm4
+ movq [maskq], xm4
+ RET
+.w4_h16:
+ W_MASK_420 2, 5
+ lea dstq, [dstq+strideq*4]
+ phaddd m4, m5
+ vextracti128 xm1, m0, 1
+ psubw m4, m9, m4
+ psrlw m4, 2
+ vpermd m4, m10, m4
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq], xm4
+ RET
+.w8_loop:
+ add tmp1q, 2*32
+ add tmp2q, 2*32
+ W_MASK_420 0, 4
+ lea dstq, [dstq+strideq*4]
+ add maskq, 8
+.w8:
+ vextracti128 xm2, m4, 1
+ vextracti128 xm1, m0, 1
+ psubw xm4, xm9, xm4
+ psubw xm4, xm2
+ psrlw xm4, 2
+ packuswb xm4, xm4
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ movq [maskq], xm4
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK_420 0, 4
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq ], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ W_MASK_420 2, 5
+ punpckhqdq m1, m4, m5
+ punpcklqdq m4, m5
+ psubw m1, m9, m1
+ psubw m1, m4
+ psrlw m1, 2
+ vpermq m0, m0, q3120
+ packuswb m1, m1
+ vpermd m1, m10, m1
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ mova [maskq], xm1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK_420 0, 4
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq], m0
+ W_MASK_420 2, 5
+ psubw m4, m9, m4
+ psubw m4, m5
+ psrlw m4, 2
+ vpermq m0, m0, q3120
+ packuswb m4, m4
+ vpermd m4, m10, m4
+ mova [dstq+strideq*1], m0
+ mova [maskq], xm4
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop_even:
+ psubw m11, m9, m4
+ psubw m12, m9, m5
+ dec hd
+.w64_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK_420 0, 4
+ add dstq, strideq
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq], m0
+ W_MASK_420 2, 5
+ vpermq m0, m0, q3120
+ mova [dstq+32], m0
+ test hd, 1
+ jz .w64_loop_even
+ psubw m4, m11, m4
+ psubw m5, m12, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m10, m4
+ mova [maskq], m4
+ add maskq, 32
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop_even:
+ psubw m13, m9, m4
+ psubw m14, m9, m5
+ dec hd
+.w128_loop:
+ W_MASK_420 0, 4
+ add dstq, strideq
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+0*32], m0
+ W_MASK_420 2, 5
+ vpermq m0, m0, q3120
+ mova [dstq+1*32], m0
+ add tmp1q, 8*32
+ add tmp2q, 8*32
+ test hd, 1
+ jz .w128_even
+ psubw m4, m11, m4
+ psubw m5, m12, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m10, m4
+ mova [maskq], m4
+ jmp .w128_odd
+.w128_even:
+ psubw m11, m9, m4
+ psubw m12, m9, m5
+.w128_odd:
+ W_MASK_420 -4, 4
+ vpermq m0, m0, q3120
+ mova [dstq+2*32], m0
+ W_MASK_420 -2, 5
+ vpermq m0, m0, q3120
+ mova [dstq+3*32], m0
+ test hd, 1
+ jz .w128_loop_even
+ psubw m4, m13, m4
+ psubw m5, m14, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m10, m4
+ mova [maskq+32], m4
+ add maskq, 64
+ dec hd
+ jg .w128_loop
+ RET
+
+%endif ; ARCH_X86_64
--- /dev/null
+++ b/src/x86/mc_init.c
@@ -1,0 +1,97 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/mc.h"
+#include "src/x86/cpu.h"
+
+decl_mc_fn(dav1d_put_8tap_regular_avx2);
+decl_mc_fn(dav1d_put_8tap_regular_smooth_avx2);
+decl_mc_fn(dav1d_put_8tap_regular_sharp_avx2);
+decl_mc_fn(dav1d_put_8tap_smooth_avx2);
+decl_mc_fn(dav1d_put_8tap_smooth_regular_avx2);
+decl_mc_fn(dav1d_put_8tap_smooth_sharp_avx2);
+decl_mc_fn(dav1d_put_8tap_sharp_avx2);
+decl_mc_fn(dav1d_put_8tap_sharp_regular_avx2);
+decl_mc_fn(dav1d_put_8tap_sharp_smooth_avx2);
+decl_mc_fn(dav1d_put_bilin_avx2);
+
+decl_mct_fn(dav1d_prep_8tap_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
+decl_mct_fn(dav1d_prep_bilin_avx2);
+
+decl_avg_fn(dav1d_avg_avx2);
+decl_w_avg_fn(dav1d_w_avg_avx2);
+decl_mask_fn(dav1d_mask_avx2);
+decl_w_mask_fn(dav1d_w_mask_420_avx2);
+
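+/* Wire the AVX2 MC functions into the DSP context; for now only the
+ * 8 bpc, x86-64, non-Windows build gets them (see the FIXME below). */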
+void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+#define init_mc_fn(type, name, suffix) \
+ c->mc[type] = dav1d_put_##name##_##suffix
+#define init_mct_fn(type, name, suffix) \
+ c->mct[type] = dav1d_prep_##name##_##suffix
+ const enum CpuFlags flags = dav1d_get_cpu_flags_x86();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+#if BITDEPTH == 8 && ARCH_X86_64 && !defined(_WIN32) // FIXME: Windows
+ init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+ init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mc_fn (FILTER_2D_BILINEAR, bilin, avx2);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2);
+
+ c->avg = dav1d_avg_avx2;
+ c->w_avg = dav1d_w_avg_avx2;
+ c->mask = dav1d_mask_avx2;
+ c->w_mask[2] = dav1d_w_mask_420_avx2;
+#endif
+}