ref: 83956bf10e7cb4af3660cb7be2754657d9ecf1cd
parent: 88798ebf44d5ab6c5c92d28b9190cbe619fcbc29
author: Victorien Le Couviour--Tuffet <victorien@videolan.org>
date: Sun Jun 7 15:29:37 EDT 2020
x86: Adapt SSSE3 prep_bilin to SSE2

---------------------
x86_64:
------------------------------------------
mct_bilinear_w4_h_8bpc_c: 98.9
mct_bilinear_w4_h_8bpc_sse2: 30.2
mct_bilinear_w4_h_8bpc_ssse3: 11.5
---------------------
mct_bilinear_w8_h_8bpc_c: 175.3
mct_bilinear_w8_h_8bpc_sse2: 57.0
mct_bilinear_w8_h_8bpc_ssse3: 19.7
---------------------
mct_bilinear_w16_h_8bpc_c: 396.2
mct_bilinear_w16_h_8bpc_sse2: 179.3
mct_bilinear_w16_h_8bpc_ssse3: 50.9
---------------------
mct_bilinear_w32_h_8bpc_c: 1311.2
mct_bilinear_w32_h_8bpc_sse2: 718.8
mct_bilinear_w32_h_8bpc_ssse3: 243.9
---------------------
mct_bilinear_w64_h_8bpc_c: 2892.7
mct_bilinear_w64_h_8bpc_sse2: 1746.0
mct_bilinear_w64_h_8bpc_ssse3: 568.0
---------------------
mct_bilinear_w128_h_8bpc_c: 7192.6
mct_bilinear_w128_h_8bpc_sse2: 4339.8
mct_bilinear_w128_h_8bpc_ssse3: 1619.2
------------------------------------------
mct_bilinear_w4_v_8bpc_c: 129.7
mct_bilinear_w4_v_8bpc_sse2: 26.6
mct_bilinear_w4_v_8bpc_ssse3: 16.7
---------------------
mct_bilinear_w8_v_8bpc_c: 233.3
mct_bilinear_w8_v_8bpc_sse2: 55.0
mct_bilinear_w8_v_8bpc_ssse3: 24.7
---------------------
mct_bilinear_w16_v_8bpc_c: 498.9
mct_bilinear_w16_v_8bpc_sse2: 146.0
mct_bilinear_w16_v_8bpc_ssse3: 54.2
---------------------
mct_bilinear_w32_v_8bpc_c: 1562.2
mct_bilinear_w32_v_8bpc_sse2: 560.6
mct_bilinear_w32_v_8bpc_ssse3: 201.0
---------------------
mct_bilinear_w64_v_8bpc_c: 3221.3
mct_bilinear_w64_v_8bpc_sse2: 1380.6
mct_bilinear_w64_v_8bpc_ssse3: 499.3
---------------------
mct_bilinear_w128_v_8bpc_c: 7357.7
mct_bilinear_w128_v_8bpc_sse2: 3439.0
mct_bilinear_w128_v_8bpc_ssse3: 1489.1
------------------------------------------
mct_bilinear_w4_hv_8bpc_c: 185.0
mct_bilinear_w4_hv_8bpc_sse2: 54.5
mct_bilinear_w4_hv_8bpc_ssse3: 22.1
---------------------
mct_bilinear_w8_hv_8bpc_c: 377.8
mct_bilinear_w8_hv_8bpc_sse2: 104.3
mct_bilinear_w8_hv_8bpc_ssse3: 35.8
---------------------
mct_bilinear_w16_hv_8bpc_c: 1159.4
mct_bilinear_w16_hv_8bpc_sse2: 311.0
mct_bilinear_w16_hv_8bpc_ssse3: 106.3
---------------------
mct_bilinear_w32_hv_8bpc_c: 4436.2
mct_bilinear_w32_hv_8bpc_sse2: 1230.7
mct_bilinear_w32_hv_8bpc_ssse3: 400.7
---------------------
mct_bilinear_w64_hv_8bpc_c: 10627.7
mct_bilinear_w64_hv_8bpc_sse2: 2934.2
mct_bilinear_w64_hv_8bpc_ssse3: 957.2
---------------------
mct_bilinear_w128_hv_8bpc_c: 26048.9
mct_bilinear_w128_hv_8bpc_sse2: 7590.3
mct_bilinear_w128_hv_8bpc_ssse3: 2947.0
------------------------------------------
--- a/src/meson.build
+++ b/src/meson.build
@@ -186,7 +186,7 @@
'x86/itx_ssse3.asm',
'x86/loopfilter_ssse3.asm',
'x86/looprestoration_ssse3.asm',
- 'x86/mc_ssse3.asm',
+ 'x86/mc_sse.asm',
)
endif
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -89,6 +89,7 @@
decl_mct_fn(dav1d_prep_bilin_avx512icl);
decl_mct_fn(dav1d_prep_bilin_avx2);
decl_mct_fn(dav1d_prep_bilin_ssse3);
+decl_mct_fn(dav1d_prep_bilin_sse2);
decl_avg_fn(dav1d_avg_avx512icl);
decl_avg_fn(dav1d_avg_avx2);
@@ -142,6 +143,8 @@
return;
#if BITDEPTH == 8
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2);
+
c->warp8x8 = dav1d_warp_affine_8x8_sse2;
c->warp8x8t = dav1d_warp_affine_8x8t_sse2;
#endif
--- /dev/null
+++ b/src/x86/mc_sse.asm
@@ -1,0 +1,5436 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+; dav1d_obmc_masks[] with 64-x interleaved
+obmc_masks: db 0, 0, 0, 0
+ ; 2 @4
+ db 45, 19, 64, 0
+ ; 4 @8
+ db 39, 25, 50, 14, 59, 5, 64, 0
+ ; 8 @16
+ db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
+ ; 16 @32
+ db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+ db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
+ ; 32 @64
+ db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+ db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
+ db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
+
+warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
+warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
+warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
+warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
+bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+
+pb_8x0_8x8: times 8 db 0
+ times 8 db 8
+resize_mul: dd 0, 1, 2, 3
+resize_shuf: times 5 db 0
+ db 1, 2, 3, 4, 5, 6
+ times 5+16 db 7
+
+pb_64: times 16 db 64
+pw_m256: times 8 dw -256
+%if ARCH_X86_32
+pw_1: times 8 dw 1
+%endif
+pw_8: times 8 dw 8
+pw_26: times 8 dw 26
+pw_34: times 8 dw 34
+pw_512: times 8 dw 512
+pw_1024: times 8 dw 1024
+pw_2048: times 8 dw 2048
+pw_6903: times 8 dw 6903
+pw_8192: times 8 dw 8192
+pd_32: times 4 dd 32
+pd_63: times 4 dd 63
+pd_512: times 4 dd 512
+pd_16384: times 4 dd 16384
+pd_32768: times 4 dd 32768
+pd_262144:times 4 dd 262144
+
+pw_258: times 2 dw 258
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
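+; the -8 bias makes mxq*8/myq*8 (built from the 1-based 4-bit subpel
+; position) address row (pos-1) of the filter table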
+
+%macro BIDIR_JMP_TABLE 1-*
+ ;evaluated at definition time (in loop below)
+ %xdefine %1_table (%%table - 2*%2)
+ %xdefine %%base %1_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ ; dynamically generated label
+ %%table:
+ %rep %0 - 1 ; repeat for num args
+ dd %%prefix %+ .w%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
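+; e.g. "BIDIR_JMP_TABLE avg_ssse3, 4, 8, ..." emits one dd per width holding
+; the offset of avg_ssse3.w<N> from avg_ssse3_table; the -2*%2 bias lets
+; callers index the table directly with the tzcnt of the block width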
+
+BIDIR_JMP_TABLE avg_ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg_ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask_ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420_ssse3, 4, 8, 16, 16, 16, 16
+BIDIR_JMP_TABLE blend_ssse3, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v_ssse3, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
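+; same idea with dw entries for the unfiltered copy loops: offsets of
+; .put_w<N>/.prep_w<N> relative to the .put/.prep entry points defined below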
+
+%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_sse2.prep)
+%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put)
+%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_ssse3.prep)
+
+BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
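+; %4 is a bitmask selecting which sub-tables to emit:
+; 1 = .h_w<N>, 2 = .v_w<N>, 4 = .hv_w<N> (so 7 = all three, 3 = h+v, 1 = h)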
+
+HV_JMP_TABLE prep, bilin, sse2, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+cextern mc_warp_filter
+
+SECTION .text
+
+INIT_XMM ssse3
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1
+ %define base t0-put_ssse3
+%else
+ DECLARE_REG_TMP 7
+ %define base 0
+%endif
+;
+%macro RESTORE_DSQ_32 1
+ %if ARCH_X86_32
+ mov %1, dsm ; restore dsq
+ %endif
+%endmacro
+;
+cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
+ movifnidn mxyd, r6m ; mx
+ LEA t0, put_ssse3
+ tzcnt wd, wm
+ mov hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [t0+wq*2+table_offset(put,)]
+ add wq, t0
+ RESTORE_DSQ_32 t0
+ jmp wq
+.put_w2:
+ movzx r4d, word [srcq+ssq*0]
+ movzx r6d, word [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r4w
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r4d, [srcq+ssq*0]
+ mov r6d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r4d
+ mov [dstq+dsq*1], r6d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movq m0, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq [dstq+dsq*0], m0
+ movq [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu m0, [srcq+ssq*0+16*0]
+ movu m1, [srcq+ssq*0+16*1]
+ movu m2, [srcq+ssq*1+16*0]
+ movu m3, [srcq+ssq*1+16*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+16*0], m0
+ mova [dstq+dsq*0+16*1], m1
+ mova [dstq+dsq*1+16*0], m2
+ mova [dstq+dsq*1+16*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ add srcq, ssq
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ mova [dstq+16*6], m2
+ mova [dstq+16*7], m3
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+ ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
+ imul mxyd, 0xff01
+ mova m4, [base+bilin_h_shuf8]
+ mova m0, [base+bilin_h_shuf4]
+ add mxyd, 16 << 8
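+ ; mxyd*0xff01 + (16 << 8) leaves ((16-mx) << 8) | mx in the low word, i.e.
+ ; both bilinear weights packed as bytes, so a single pmaddubsw per pixel
+ ; pair computes (16-mx)*src[x] + mx*src[x+1]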
+ movd m5, mxyd
+ mov mxyd, r7m ; my
+ pshuflw m5, m5, q0000
+ punpcklqdq m5, m5
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)]
+ mova m3, [base+pw_2048]
+ add wq, t0
+ RESTORE_DSQ_32 t0
+ jmp wq
+.h_w2:
+ pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
+.h_w2_loop:
+ movd m0, [srcq+ssq*0]
+ movd m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq m0, m1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ pmulhrsw m0, m3
+ packuswb m0, m0
+ movd r6d, m0
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movq m4, [srcq+ssq*0]
+ movhps m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m4, m0
+ pmaddubsw m4, m5
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movd [dstq+dsq*0], m4
+ psrlq m4, 32
+ movd [dstq+dsq*1], m4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w16
+ RET
+.h_w32:
+ movu m0, [srcq+mmsize*0+8*0]
+ movu m1, [srcq+mmsize*0+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ movu m1, [srcq+mmsize*1+8*0]
+ movu m2, [srcq+mmsize*1+8*1]
+ add srcq, ssq
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w32
+ RET
+.h_w64:
+ mov r6, -16*3
+.h_w64_loop:
+ movu m0, [srcq+r6+16*3+8*0]
+ movu m1, [srcq+r6+16*3+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+r6+16*3], m0
+ add r6, 16
+ jle .h_w64_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ mov r6, -16*7
+.h_w128_loop:
+ movu m0, [srcq+r6+16*7+8*0]
+ movu m1, [srcq+r6+16*7+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+r6+16*7], m0
+ add r6, 16
+ jle .h_w128_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 0xff01
+ mova m5, [base+pw_2048]
+ add mxyd, 16 << 8
+ add wq, t0
+ movd m4, mxyd
+ pshuflw m4, m4, q0000
+ punpcklqdq m4, m4
+ RESTORE_DSQ_32 t0
+ jmp wq
+.v_w2:
+ movd m0, [srcq+ssq*0]
+.v_w2_loop:
+ pinsrw m0, [srcq+ssq*1], 1 ; 0 1
+ lea srcq, [srcq+ssq*2]
+ pshuflw m2, m0, q2301
+ pinsrw m0, [srcq+ssq*0], 0 ; 2 1
+ punpcklbw m1, m0, m2
+ pmaddubsw m1, m4
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ movd r6d, m1
+ mov [dstq+dsq*1], r6w
+ shr r6d, 16
+ mov [dstq+dsq*0], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd m0, [srcq+ssq*0]
+.v_w4_loop:
+ movd m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq m2, m0, m1 ; 0 1
+ movd m0, [srcq+ssq*0]
+ punpckldq m1, m0 ; 1 2
+ punpcklbw m1, m2
+ pmaddubsw m1, m4
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ movd [dstq+dsq*0], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ ;
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq m0, [srcq+ssq*0]
+.v_w8_loop:
+ movq m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m1, m3, m0
+ movq m0, [srcq+ssq*0]
+ punpcklbw m2, m0, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+ ;
+%macro PUT_BILIN_V_W16 0
+ movu m0, [srcq+ssq*0]
+%%loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m1, m3, m0
+ punpckhbw m2, m3, m0
+ movu m0, [srcq+ssq*0]
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ mova [dstq+dsq*0], m1
+ punpcklbw m1, m0, m3
+ punpckhbw m2, m0, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg %%loop
+%endmacro
+ ;
+.v_w16:
+ PUT_BILIN_V_W16
+ RET
+.v_w16gt:
+ mov r4, dstq
+ mov r6, srcq
+.v_w16gt_loop:
+%if ARCH_X86_32
+ mov bakm, t0q
+ RESTORE_DSQ_32 t0
+ PUT_BILIN_V_W16
+ mov t0q, bakm
+%else
+ PUT_BILIN_V_W16
+%endif
+ mov hw, t0w
+ add r4, mmsize
+ add r6, mmsize
+ mov dstq, r4
+ mov srcq, r6
+ sub t0d, 1<<16
+ jg .v_w16gt
+ RET
+.v_w32:
+ lea t0d, [hq+(1<<16)]
+ jmp .v_w16gt
+.v_w64:
+ lea t0d, [hq+(3<<16)]
+ jmp .v_w16gt
+.v_w128:
+ lea t0d, [hq+(7<<16)]
+ jmp .v_w16gt
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+ ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11 ; can't shift by 12 due to signed overflow
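+ ; with the difference doubled below (paddw m1, m1), pmulhw by (my << 11)
+ ; gives (my * diff) >> 4, matching the formula above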
+ mova m7, [base+pw_2048]
+ movd m6, mxyd
+ add wq, t0
+ pshuflw m6, m6, q0000
+ punpcklqdq m6, m6
+ jmp wq
+.hv_w2:
+ RESTORE_DSQ_32 t0
+ movd m0, [srcq+ssq*0]
+ pshufd m0, m0, q0000 ; src[x - src_stride]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w2_loop:
+ movd m1, [srcq+ssq*1] ; src[x]
+ lea srcq, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*0] ; src[x + src_stride]
+ pshufd m1, m1, q3120
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 _ 2 _
+ shufps m2, m0, m1, q1032 ; 0 _ 1 _
+ mova m0, m1
+ psubw m1, m2 ; src[x + src_stride] - src[x]
+ paddw m1, m1
+ pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x])
+ paddw m1, m2 ; src[x] + (my * (src[x + src_stride] - src[x])
+ pmulhrsw m1, m7
+ packuswb m1, m1
+%if ARCH_X86_64
+ movq r6, m1
+%else
+ pshuflw m1, m1, q2020
+ movd r6d, m1
+%endif
+ mov [dstq+dsq*0], r6w
+ shr r6, gprsize*4
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova m4, [base+bilin_h_shuf4]
+ RESTORE_DSQ_32 t0
+ movddup xm0, [srcq+ssq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w4_loop:
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*0]
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2
+ shufps m2, m0, m1, q1032 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ packuswb m1, m1
+ movd [dstq+dsq*0], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ RESTORE_DSQ_32 t0
+ movu m0, [srcq+ssq*0+8*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu m2, [srcq+ssq*1+8*0]
+ lea srcq, [srcq+ssq*2]
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m1, m2, m0
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m0
+ movu m0, [srcq+ssq*0+8*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ psubw m3, m0, m2
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m2
+ pmulhrsw m1, m7
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ xor t0d, t0d
+.hv_w16gt:
+ mov r4, dstq
+ mov r6, srcq
+ %if WIN64
+ movaps r4m, xmm8
+ %endif
+.hv_w16_loop0:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w16_loop:
+%if ARCH_X86_32
+ %define m0tmp [dstq]
+%else
+ %define m0tmp m8
+%endif
+ add srcq, ssq
+ movu m2, [srcq+8*0]
+ movu m3, [srcq+8*1]
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova m0tmp, m2
+ psubw m2, m0
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m0
+ mova m0, m3
+ psubw m3, m1
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m1
+ mova m1, m0
+ mova m0, m0tmp
+ pmulhrsw m2, m7
+ pmulhrsw m3, m7
+ packuswb m2, m3
+ mova [dstq], m2
+ add dstq, dsmp
+ dec hd
+ jg .hv_w16_loop
+ movzx hd, t0w
+ add r4, mmsize
+ add r6, mmsize
+ mov dstq, r4
+ mov srcq, r6
+ sub t0d, 1<<16
+ jg .hv_w16_loop0
+ %if WIN64
+ movaps xmm8, r4m
+ %endif
+ RET
+.hv_w32:
+ lea t0d, [hq+(1<<16)]
+ jmp .hv_w16gt
+.hv_w64:
+ lea t0d, [hq+(3<<16)]
+ jmp .hv_w16gt
+.hv_w128:
+ lea t0d, [hq+(7<<16)]
+ jmp .hv_w16gt
+
+%macro PSHUFB_0X1X 1-2 ; dst[, src]
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ punpcklbw %1, %1
+ psraw %1, 8
+ pshufd %1, %1, q0000
+ %endif
+%endmacro
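+; the SSE2 path re-expands the packed byte weights into sign-extended words
+; and broadcasts them, so the pmaddwd-based PMADDUBSW fallback below can use
+; them as word coefficients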
+
+%macro PSHUFB_BILIN_H8 2 ; dst, src
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ mova %2, %1
+ psrldq %1, 1
+ punpcklbw %1, %2
+ %endif
+%endmacro
+
+%macro PSHUFB_BILIN_H4 3 ; dst, src, tmp
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ mova %2, %1
+ psrldq %1, 1
+ punpckhbw %3, %1, %2
+ punpcklbw %1, %2
+ punpcklqdq %1, %3
+ %endif
+%endmacro
+
+%macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero
+ %if cpuflag(ssse3)
+ pmaddubsw %1, %2
+ %else
+ %if %5 == 1
+ pxor %3, %3
+ %endif
+ punpckhbw %4, %1, %3
+ punpcklbw %1, %1, %3
+ pmaddwd %4, %2
+ pmaddwd %1, %2
+ packssdw %1, %4
+ %endif
+%endmacro
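+; SSE2 fallback: unpack the unsigned source bytes to words against a zero
+; register and use pmaddwd with word coefficients; reset_zero re-clears the
+; zero register when the caller may have clobbered it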
+
+%macro PMULHRSW 5 ; dst, src, tmp, rndval, shift
+ %if cpuflag(ssse3)
+ pmulhrsw %1, %2
+ %else
+ punpckhwd %3, %1, %4
+ punpcklwd %1, %4
+ pmaddwd %3, %2
+ pmaddwd %1, %2
+ psrad %3, %5
+ psrad %1, %5
+ packssdw %1, %3
+ %endif
+%endmacro
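+; SSE2 fallback: interleave each word with rndval and use pmaddwd, so with
+; coefficient words laid out as (weight, 1) pairs this computes
+; (x*weight + rndval) >> shift, e.g. (x*my + 8) >> 4 in the hv path below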
+
+%macro PREP_BILIN 0
+
+DECLARE_REG_TMP 3, 5, 6
+%if ARCH_X86_32
+ %define base t2-prep%+SUFFIX
+%else
+ %define base 0
+%endif
+
+cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ LEA t2, prep%+SUFFIX
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+%if notcpuflag(ssse3)
+ add t2, prep_ssse3 - prep_sse2
+ jmp prep_ssse3
+%else
+ movzx wd, word [t2+wq*2+table_offset(prep,)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movd m0, [srcq+strideq*0]
+ movd m1, [srcq+strideq*1]
+ movd m2, [srcq+strideq*2]
+ movd m3, [srcq+stride3q ]
+ punpckldq m0, m1
+ punpckldq m2, m3
+ lea srcq, [srcq+strideq*4]
+ pxor m1, m1
+ punpcklbw m0, m1
+ punpcklbw m2, m1
+ psllw m0, 4
+ psllw m2, 4
+ mova [tmpq+mmsize*0], m0
+ mova [tmpq+mmsize*1], m2
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movq m0, [srcq+strideq*0]
+ movq m1, [srcq+strideq*1]
+ movq m2, [srcq+strideq*2]
+ movq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pxor m4, m4
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movq m0, [srcq+strideq*0+8*0]
+ movq m1, [srcq+strideq*0+8*1]
+ movq m2, [srcq+strideq*1+8*0]
+ movq m3, [srcq+strideq*1+8*1]
+ lea srcq, [srcq+strideq*2]
+ pxor m4, m4
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 2
+ jg .prep_w16
+ RET
+.prep_w32:
+ mov t2d, 1
+ jmp .prep_w32_vloop
+.prep_w64:
+ mov t2d, 2
+ jmp .prep_w32_vloop
+.prep_w128:
+ mov t2d, 4
+.prep_w32_vloop:
+ mov t1q, srcq
+ mov r3d, t2d
+.prep_w32_hloop:
+ movq m0, [t1q+8*0]
+ movq m1, [t1q+8*1]
+ movq m2, [t1q+8*2]
+ movq m3, [t1q+8*3]
+ pxor m4, m4
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ add t1q, 32
+ dec r3d
+ jg .prep_w32_hloop
+ lea srcq, [srcq+strideq]
+ dec hd
+ jg .prep_w32_vloop
+ RET
+%endif
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
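+ ; (prep keeps this unrounded, 16x-scaled intermediate; the plain copy path
+ ; above matches it by shifting pixels left by 4)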
+ imul mxyd, 0xff01
+%if cpuflag(ssse3)
+ mova m4, [base+bilin_h_shuf8]
+%endif
+ add mxyd, 16 << 8
+ movd m5, mxyd
+ mov mxyd, r6m ; my
+%if cpuflag(ssse3)
+ pshuflw m5, m5, q0000
+ punpcklqdq m5, m5
+%else
+ PSHUFB_0X1X m5
+%endif
+ test mxyd, mxyd
+ jnz .hv
+%if ARCH_X86_32
+ mov t1, t2 ; save base reg for w4
+%endif
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+%if notcpuflag(ssse3)
+ WIN64_SPILL_XMM 8
+ pxor m6, m6
+%endif
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+%if cpuflag(ssse3)
+ %if ARCH_X86_32
+ mova m4, [t1-prep_ssse3+bilin_h_shuf4]
+ %else
+ mova m4, [bilin_h_shuf4]
+ %endif
+%endif
+.h_w4_loop:
+ movq m0, [srcq+strideq*0]
+ movhps m0, [srcq+strideq*1]
+ movq m1, [srcq+strideq*2]
+ movhps m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ PSHUFB_BILIN_H4 m0, m4, m2
+ PMADDUBSW m0, m5, m6, m2, 0
+ PSHUFB_BILIN_H4 m1, m4, m2
+ PMADDUBSW m1, m5, m6, m2, 0
+ mova [tmpq+0 ], m0
+ mova [tmpq+16], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*2]
+ movu m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .h_w8
+ RET
+.h_w16:
+ movu m0, [srcq+strideq*0+8*0]
+ movu m1, [srcq+strideq*0+8*1]
+ movu m2, [srcq+strideq*1+8*0]
+ movu m3, [srcq+strideq*1+8*1]
+ lea srcq, [srcq+strideq*2]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ mov t2d, 1 << 0
+ jmp .h_w32_vloop
+.h_w64:
+ mov t2d, 1 << 1
+ jmp .h_w32_vloop
+.h_w128:
+ mov t2d, 1 << 3
+.h_w32_vloop:
+ mov t1q, srcq
+ mov r3d, t2d
+.h_w32_hloop:
+ movu m0, [t1q+8*0]
+ movu m1, [t1q+8*1]
+ movu m2, [t1q+8*2]
+ movu m3, [t1q+8*3]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ add t1q, 32
+ shr r3d, 1
+ jnz .h_w32_hloop
+ lea srcq, [srcq+strideq]
+ sub hd, 1
+ jg .h_w32_vloop
+ RET
+.v:
+%if notcpuflag(ssse3)
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 8
+%endif
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
+ imul mxyd, 0xff01
+ add mxyd, 16 << 8
+ add wq, t2
+ lea stride3q, [strideq*3]
+ movd m5, mxyd
+%if cpuflag(ssse3)
+ pshuflw m5, m5, q0000
+ punpcklqdq m5, m5
+%else
+ PSHUFB_0X1X m5
+ pxor m6, m6
+%endif
+ jmp wq
+.v_w4:
+ movd m0, [srcq+strideq*0]
+.v_w4_loop:
+ movd m1, [srcq+strideq*1]
+ movd m2, [srcq+strideq*2]
+ movd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpcklwd m0, m1 ; 0 1 _ _
+ punpcklwd m1, m2 ; 1 2 _ _
+ punpcklbw m1, m0
+ PMADDUBSW m1, m5, m6, m7, 0
+ pshufd m1, m1, q3120
+ mova [tmpq+16*0], m1
+ movd m0, [srcq+strideq*0]
+ punpcklwd m2, m3 ; 2 3 _ _
+ punpcklwd m3, m0 ; 3 4 _ _
+ punpcklbw m3, m2
+ PMADDUBSW m3, m5, m6, m7, 0
+ pshufd m3, m3, q3120
+ mova [tmpq+16*1], m3
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq m0, [srcq+strideq*0]
+.v_w8_loop:
+ movq m1, [srcq+strideq*2]
+ movq m2, [srcq+strideq*1]
+ movq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ shufpd m4, m0, m1, 0x0c ; 0 2
+ movq m0, [srcq+strideq*0]
+ shufpd m2, m3, 0x0c ; 1 3
+ shufpd m1, m0, 0x0c ; 2 4
+ punpcklbw m3, m2, m4
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*0], m3
+ punpckhbw m3, m2, m4
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*2], m3
+ punpcklbw m3, m1, m2
+ punpckhbw m1, m2
+ PMADDUBSW m3, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*1], m3
+ mova [tmpq+16*3], m1
+ add tmpq, 16*4
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu m0, [srcq+strideq*0]
+.v_w16_loop:
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*2]
+ punpcklbw m3, m1, m0
+ punpckhbw m4, m1, m0
+ PMADDUBSW m3, m5, m6, m7, 0
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*0], m3
+ mova [tmpq+16*1], m4
+ punpcklbw m3, m2, m1
+ punpckhbw m4, m2, m1
+ PMADDUBSW m3, m5, m6, m7, 0
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*2], m3
+ mova [tmpq+16*3], m4
+ movu m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movu m0, [srcq+strideq*0]
+ add tmpq, 16*8
+ punpcklbw m1, m3, m2
+ punpckhbw m4, m3, m2
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq-16*4], m1
+ mova [tmpq-16*3], m4
+ punpcklbw m1, m0, m3
+ punpckhbw m2, m0, m3
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq-16*2], m1
+ mova [tmpq-16*1], m2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ lea t2d, [hq+(0<<16)]
+ mov t0d, 64
+ jmp .v_w32_start
+.v_w64:
+ lea t2d, [hq+(1<<16)]
+ mov t0d, 128
+ jmp .v_w32_start
+.v_w128:
+ lea t2d, [hq+(3<<16)]
+ mov t0d, 256
+.v_w32_start:
+%if ARCH_X86_64
+ %if WIN64
+ PUSH r7
+ %endif
+ mov r7, tmpq
+%endif
+ mov t1, srcq
+.v_w32_hloop:
+ movu m0, [srcq+strideq*0+16*0]
+ movu m1, [srcq+strideq*0+16*1]
+.v_w32_vloop:
+ movu m2, [srcq+strideq*1+16*0]
+ movu m3, [srcq+strideq*1+16*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m2, m0
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*0], m4
+ punpckhbw m4, m2, m0
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*1], m4
+ punpcklbw m4, m3, m1
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*2], m4
+ punpckhbw m4, m3, m1
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*3], m4
+ add tmpq, t0q
+ movu m0, [srcq+strideq*0+16*0]
+ movu m1, [srcq+strideq*0+16*1]
+ punpcklbw m4, m0, m2
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*0], m4
+ punpckhbw m4, m0, m2
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*1], m4
+ punpcklbw m4, m1, m3
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*2], m4
+ punpckhbw m4, m1, m3
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*3], m4
+ add tmpq, t0q
+ sub hd, 2
+ jg .v_w32_vloop
+ movzx hd, t2w
+ add t1, 32
+ mov srcq, t1
+%if ARCH_X86_64
+ add r7, 2*16*2
+ mov tmpq, r7
+%else
+ mov tmpq, tmpmp
+ add tmpq, 2*16*2
+ mov tmpmp, tmpq
+%endif
+ sub t2d, 1<<16
+ jg .v_w32_hloop
+%if WIN64
+ POP r7
+%endif
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
+%assign stack_offset stack_offset - stack_size_padded
+%if cpuflag(ssse3)
+ WIN64_SPILL_XMM 8
+%else
+ WIN64_SPILL_XMM 10
+%endif
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+%if cpuflag(ssse3)
+ shl mxyd, 11
+%else
+ %if ARCH_X86_64
+ mova m8, [pw_8]
+ %else
+ %define m8 [pw_8]
+ %endif
+ pxor m7, m7
+%endif
+ movd m6, mxyd
+ add wq, t2
+ pshuflw m6, m6, q0000
+%if cpuflag(ssse3)
+ punpcklqdq m6, m6
+%else
+ %if ARCH_X86_64
+ psrlw m0, m8, 3
+ punpcklwd m6, m0
+ %else
+ punpcklwd m6, [base+pw_1]
+ %endif
+%endif
+%if ARCH_X86_32
+ mov t1, t2 ; save base reg for w4
+%endif
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+%if cpuflag(ssse3)
+ %if ARCH_X86_32
+ mova m4, [t1-prep_ssse3+bilin_h_shuf4]
+ %else
+ mova m4, [bilin_h_shuf4]
+ %endif
+%endif
+ movhps m0, [srcq+strideq*0]
+ PSHUFB_BILIN_H4 m0, m4, m3
+ PMADDUBSW m0, m5, m7, m4, 0 ; _ 0
+.hv_w4_loop:
+ movq m1, [srcq+strideq*1]
+ movhps m1, [srcq+strideq*2]
+ movq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movhps m2, [srcq+strideq*0]
+ PSHUFB_BILIN_H4 m1, m4, m3
+ PSHUFB_BILIN_H4 m2, m4, m3
+ PMADDUBSW m1, m5, m7, m4, 0 ; 1 2
+ shufpd m3, m0, m1, 0x01 ; 0 1
+ mova m0, m2
+ PMADDUBSW m0, m5, m7, m4, 0 ; 3 4
+ shufpd m2, m1, m0, 0x01 ; 2 3
+ psubw m1, m3
+ PMULHRSW m1, m6, m4, m8, 4
+ paddw m1, m3
+ psubw m3, m0, m2
+ PMULHRSW m3, m6, m4, m8, 4
+ paddw m3, m2
+ mova [tmpq+16*0], m1
+ mova [tmpq+16*1], m3
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ movu m0, [srcq+strideq*0]
+ PSHUFB_BILIN_H8 m0, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 0
+.hv_w8_loop:
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*2]
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PMADDUBSW m1, m5, m7, m4, 0 ; 1
+ PMADDUBSW m2, m5, m7, m4, 0 ; 2
+ psubw m3, m1, m0
+ PMULHRSW m3, m6, m4, m8, 4
+ paddw m3, m0
+%if notcpuflag(ssse3) && ARCH_X86_64
+ SWAP m9, m7
+%endif
+ psubw m7, m2, m1
+ PMULHRSW m7, m6, m4, m8, 4
+ paddw m7, m1
+ mova [tmpq+16*0], m3
+ mova [tmpq+16*1], m7
+%if notcpuflag(ssse3) && ARCH_X86_64
+ SWAP m7, m9
+%endif
+ movu m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movu m0, [srcq+strideq*0]
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m0, m4
+ PMADDUBSW m1, m5, m7, m4, ARCH_X86_32 ; 3
+ PMADDUBSW m0, m5, m7, m4, 0 ; 4
+ psubw m3, m1, m2
+ PMULHRSW m3, m6, m4, m8, 4
+ paddw m3, m2
+%if notcpuflag(ssse3) && ARCH_X86_64
+ SWAP m9, m7
+%endif
+ psubw m7, m0, m1
+ PMULHRSW m7, m6, m4, m8, 4
+ paddw m7, m1
+ mova [tmpq+16*2], m3
+ mova [tmpq+16*3], m7
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m7, m9
+ %else
+ pxor m7, m7
+ %endif
+%endif
+ add tmpq, 16*4
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ mov t2d, hd
+ mov t0d, 32
+ jmp .hv_w16_start
+.hv_w32:
+ lea t2d, [hq+(1<<16)]
+ mov t0d, 64
+ jmp .hv_w16_start
+.hv_w64:
+ lea t2d, [hq+(3<<16)]
+ mov t0d, 128
+ jmp .hv_w16_start
+.hv_w128:
+ lea t2d, [hq+(7<<16)]
+ mov t0d, 256
+.hv_w16_start:
+%if ARCH_X86_64
+ %if WIN64
+ PUSH r7
+ %endif
+ mov r7, tmpq
+%endif
+ mov t1, srcq
+.hv_w16_hloop:
+ movu m0, [srcq+strideq*0+8*0]
+ movu m1, [srcq+strideq*0+8*1]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 0a
+ PMADDUBSW m1, m5, m7, m4, 0 ; 0b
+.hv_w16_vloop:
+ movu m2, [srcq+strideq*1+8*0]
+ PSHUFB_BILIN_H8 m2, m4
+ PMADDUBSW m2, m5, m7, m4, 0 ; 1a
+ psubw m3, m2, m0
+ PMULHRSW m3, m6, m4, m8, 4
+ paddw m3, m0
+ mova [tmpq+16*0], m3
+ movu m3, [srcq+strideq*1+8*1]
+ lea srcq, [srcq+strideq*2]
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m3, m5, m7, m4, 0 ; 1b
+ psubw m0, m3, m1
+ PMULHRSW m0, m6, m4, m8, 4
+ paddw m0, m1
+ mova [tmpq+16*1], m0
+ add tmpq, t0q
+ movu m0, [srcq+strideq*0+8*0]
+ PSHUFB_BILIN_H8 m0, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 2a
+ psubw m1, m0, m2
+ PMULHRSW m1, m6, m4, m8, 4
+ paddw m1, m2
+ mova [tmpq+16*0], m1
+ movu m1, [srcq+strideq*0+8*1]
+ PSHUFB_BILIN_H8 m1, m4
+ PMADDUBSW m1, m5, m7, m4, 0 ; 2b
+ psubw m2, m1, m3
+ PMULHRSW m2, m6, m4, m8, 4
+ paddw m2, m3
+ mova [tmpq+16*1], m2
+ add tmpq, t0q
+ sub hd, 2
+ jg .hv_w16_vloop
+ movzx hd, t2w
+ add t1, 16
+ mov srcq, t1
+%if ARCH_X86_64
+ add r7, 2*16
+ mov tmpq, r7
+%else
+ mov tmpq, tmpmp
+ add tmpq, 2*16
+ mov tmpmp, tmpq
+%endif
+ sub t2d, 1<<16
+ jg .hv_w16_hloop
+%if WIN64
+ POP r7
+%endif
+ RET
+%endmacro
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
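+; low half-word: offset of the 4-tap filter row (used for w <= 4),
+; high half-word: offset of the 8-tap row; added to mx*0x010101/my*0x010101
+; below so the right offset can be extracted with a byte or word move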
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 1, 2
+%elif WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%macro PUT_8TAP_FN 3 ; type, type_h, type_v
+cglobal put_8tap_%1
+ mov t0d, FILTER_%2
+ mov t1d, FILTER_%3
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX)
+%endif
+%endmacro
+
+PUT_8TAP_FN regular, REGULAR, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
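+; sharp_smooth is declared last so it falls straight through into put_8tap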
+
+%if ARCH_X86_32
+ %define base_reg r1
+ %define base base_reg-put_ssse3
+ %define W32_RESTORE_DSQ mov dsq, dsm
+ %define W32_RESTORE_SSQ mov ssq, ssm
+%else
+ %define base_reg r8
+ %define base 0
+ %define W32_RESTORE_DSQ
+ %define W32_RESTORE_SSQ
+%endif
+
+cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+%assign org_stack_offset stack_offset
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+%if ARCH_X86_64
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+%else
+ imul ssd, mym, 0x010101
+ add ssd, t1d ; 8tap_v, my, 4tap_v
+ mov srcq, srcm
+%endif
+ mov wd, wm
+ movifnidn hd, hm
+ LEA base_reg, put_ssse3
+ test mxd, 0xf00
+ jnz .h
+%if ARCH_X86_32
+ test ssd, 0xf00
+%else
+ test myd, 0xf00
+%endif
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [base_reg+wq*2+table_offset(put,)]
+ add wq, base_reg
+; put_bilin mangling jump
+%assign stack_offset org_stack_offset
+%if ARCH_X86_32
+ mov dsq, dsm
+ mov ssq, ssm
+%elif WIN64
+ pop r8
+%endif
+ lea r6, [ssq*3]
+ jmp wq
+.h:
+%if ARCH_X86_32
+ test ssd, 0xf00
+%else
+ test myd, 0xf00
+%endif
+ jnz .hv
+ W32_RESTORE_SSQ
+ WIN64_SPILL_XMM 12
+ cmp wd, 4
+ jl .h_w2
+ je .h_w4
+ tzcnt wd, wd
+%if ARCH_X86_64
+ mova m10, [base+subpel_h_shufA]
+ mova m11, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+%endif
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)]
+ movd m5, [base_reg+mxq*8+subpel_filters-put_ssse3+0]
+ pshufd m5, m5, q0000
+ movd m6, [base_reg+mxq*8+subpel_filters-put_ssse3+4]
+ pshufd m6, m6, q0000
+ mova m7, [base+pw_34] ; 2 + (8 << 2)
+ add wq, base_reg
+ jmp wq
+.h_w2:
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ mova m4, [base+subpel_h_shuf4]
+ movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+ pshufd m3, m3, q0000
+ mova m5, [base+pw_34] ; 2 + (8 << 2)
+ W32_RESTORE_DSQ
+.h_w2_loop:
+ movq m0, [srcq+ssq*0]
+ movhps m0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pmaddubsw m0, m3
+ phaddw m0, m0
+ paddw m0, m5 ; pw34
+ psraw m0, 6
+ packuswb m0, m0
+ movd r4d, m0
+ mov [dstq+dsq*0], r4w
+ shr r4d, 16
+ mov [dstq+dsq*1], r4w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+ pshufd m3, m3, q0000
+ mova m5, [base+pw_34] ; 2 + (8 << 2)
+ mova m6, [base+subpel_h_shufA]
+ W32_RESTORE_DSQ
+.h_w4_loop:
+ movq m0, [srcq+ssq*0] ; 1
+ movq m1, [srcq+ssq*1] ; 2
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m6 ; subpel_h_shufA
+ pshufb m1, m6 ; subpel_h_shufA
+ pmaddubsw m0, m3 ; subpel_filters
+ pmaddubsw m1, m3 ; subpel_filters
+ phaddw m0, m1
+ paddw m0, m5 ; pw34
+ psraw m0, 6
+ packuswb m0, m0
+ movd [dstq+dsq*0], m0
+ psrlq m0, 32
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+ ;
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
+ %if ARCH_X86_32
+ pshufb %2, %1, [base+subpel_h_shufB]
+ pshufb %3, %1, [base+subpel_h_shufC]
+ pshufb %1, [base+subpel_h_shufA]
+ %else
+ pshufb %2, %1, m11; subpel_h_shufB
+ pshufb %3, %1, m9 ; subpel_h_shufC
+ pshufb %1, m10 ; subpel_h_shufA
+ %endif
+ pmaddubsw %4, %2, m5 ; subpel +0 B0
+ pmaddubsw %2, m6 ; subpel +4 B4
+ pmaddubsw %3, m6 ; C4
+ pmaddubsw %1, m5 ; A0
+ paddw %3, %4 ; C4+B0
+ paddw %1, %2 ; A0+B4
+ phaddw %1, %3
+ paddw %1, m7 ; pw34
+ psraw %1, 6
+%endmacro
+ ;
+.h_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ PUT_8TAP_H m0, m2, m3, m4
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H m1, m2, m3, m4
+ packuswb m0, m1
+%if ARCH_X86_32
+ movq [dstq ], m0
+ add dstq, dsm
+ movhps [dstq ], m0
+ add dstq, dsm
+%else
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+%endif
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ xor r6d, r6d
+ jmp .h_start
+.h_w32:
+ mov r6, -16*1
+ jmp .h_start
+.h_w64:
+ mov r6, -16*3
+ jmp .h_start
+.h_w128:
+ mov r6, -16*7
+.h_start:
+ sub srcq, r6
+ sub dstq, r6
+ mov r4, r6
+.h_loop:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ PUT_8TAP_H m0, m2, m3, m4
+ PUT_8TAP_H m1, m2, m3, m4
+ packuswb m0, m1
+ mova [dstq+r6], m0
+ add r6, mmsize
+ jle .h_loop
+ add srcq, ssq
+%if ARCH_X86_32
+ add dstq, dsm
+%else
+ add dstq, dsq
+%endif
+ mov r6, r4
+ dec hd
+ jg .h_loop
+ RET
+.v:
+%if ARCH_X86_32
+ movzx mxd, ssb
+ shr ssd, 16
+ cmp hd, 6
+ cmovs ssd, mxd
+ lea ssq, [base_reg+ssq*8+subpel_filters-put_ssse3]
+%else
+ %assign stack_offset org_stack_offset
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ lea myq, [base_reg+myq*8+subpel_filters-put_ssse3]
+%endif
+ tzcnt r6d, wd
+ movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)]
+ mova m7, [base+pw_512]
+ psrlw m2, m7, 1 ; 0x0100
+ add r6, base_reg
+%if ARCH_X86_32
+ %define subpel0 [rsp+mmsize*0]
+ %define subpel1 [rsp+mmsize*1]
+ %define subpel2 [rsp+mmsize*2]
+ %define subpel3 [rsp+mmsize*3]
+%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
+ ALLOC_STACK -mmsize*4
+%assign regs_used 7
+ movd m0, [ssq+0]
+ pshufb m0, m2
+ mova subpel0, m0
+ movd m0, [ssq+2]
+ pshufb m0, m2
+ mova subpel1, m0
+ movd m0, [ssq+4]
+ pshufb m0, m2
+ mova subpel2, m0
+ movd m0, [ssq+6]
+ pshufb m0, m2
+ mova subpel3, m0
+ mov ssq, [rstk+stack_offset+gprsize*4]
+ lea ssq, [ssq*3]
+ sub srcq, ssq
+ mov ssq, [rstk+stack_offset+gprsize*4]
+ mov dsq, [rstk+stack_offset+gprsize*2]
+%else
+ %define subpel0 m8
+ %define subpel1 m9
+ %define subpel2 m10
+ %define subpel3 m11
+ movd subpel0, [myq+0]
+ pshufb subpel0, m2
+ movd subpel1, [myq+2]
+ pshufb subpel1, m2
+ movd subpel2, [myq+4]
+ pshufb subpel2, m2
+ movd subpel3, [myq+6]
+ pshufb subpel3, m2
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+%endif
+ jmp r6
+.v_w2:
+ movd m2, [srcq+ssq*0] ; 0
+ pinsrw m2, [srcq+ssq*1], 2 ; 0 1
+ pinsrw m2, [srcq+ssq*2], 4 ; 0 1 2
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+ pinsrw m2, [srcq+ssq*0], 6 ; 0 1 2 3
+ add srcq, ssq
+%else
+ pinsrw m2, [srcq+ss3q ], 6 ; 0 1 2 3
+ lea srcq, [srcq+ssq*4]
+%endif
+ movd m3, [srcq+ssq*0] ; 4
+ movd m1, [srcq+ssq*1] ; 5
+ movd m0, [srcq+ssq*2] ; 6
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+%else
+ add srcq, ss3q
+%endif
+ punpckldq m3, m1 ; 4 5 _ _
+ punpckldq m1, m0 ; 5 6 _ _
+ palignr m4, m3, m2, 4 ; 1 2 3 4
+ punpcklbw m3, m1 ; 45 56
+ punpcklbw m1, m2, m4 ; 01 12
+ punpckhbw m2, m4 ; 23 34
+.v_w2_loop:
+ pmaddubsw m5, m1, subpel0 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, subpel1 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, subpel2 ; a2 b2
+ paddw m5, m3
+ movd m4, [srcq+ssq*0] ; 7
+ punpckldq m3, m0, m4 ; 6 7 _ _
+ movd m0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq m4, m0 ; 7 8 _ _
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, subpel3 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ packuswb m5, m5
+ pshuflw m5, m5, q2020
+ movd r6d, m5
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+%if ARCH_X86_32
+.v_w8:
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+%endif ; ARCH_X86_32
+ lea r6d, [wq - 4] ; horizontal loop
+ mov r4, dstq
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+ %define srcm [rsp+mmsize*4+gprsize]
+%endif
+ mov srcm, srcq
+%else
+ mov r7, srcq
+%endif
+ shl r6d, (16 - 2) ; (wq / 4) << 16
+ mov r6w, hw
+.v_w4_loop0:
+ movd m2, [srcq+ssq*0] ; 0
+ movhps m2, [srcq+ssq*2] ; 0 _ 2
+ movd m3, [srcq+ssq*1] ; 1
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+ movhps m3, [srcq+ssq*0] ; 1 _ 3
+ lea srcq, [srcq+ssq*1]
+%else
+ movhps m3, [srcq+ss3q ] ; 1 _ 3
+ lea srcq, [srcq+ssq*4]
+%endif
+ pshufd m2, m2, q2020 ; 0 2 0 2
+ pshufd m3, m3, q2020 ; 1 3 1 3
+ punpckldq m2, m3 ; 0 1 2 3
+ movd m3, [srcq+ssq*0] ; 4
+ movd m1, [srcq+ssq*1] ; 5
+ movd m0, [srcq+ssq*2] ; 6
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+%else
+ add srcq, ss3q
+%endif
+ punpckldq m3, m1 ; 4 5 _ _
+ punpckldq m1, m0 ; 5 6 _ _
+ palignr m4, m3, m2, 4 ; 1 2 3 4
+ punpcklbw m3, m1 ; 45 56
+ punpcklbw m1, m2, m4 ; 01 12
+ punpckhbw m2, m4 ; 23 34
+.v_w4_loop:
+ pmaddubsw m5, m1, subpel0 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, subpel1 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, subpel2 ; a2 b2
+ paddw m5, m3
+ movd m4, [srcq+ssq*0]
+ punpckldq m3, m0, m4 ; 6 7 _ _
+ movd m0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq m4, m0 ; 7 8 _ _
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, subpel3 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ packuswb m5, m5
+ movd [dstq+dsq*0], m5
+ pshufd m5, m5, q0101
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ mov hw, r6w ; reset vertical loop
+ add r4, 4
+ mov dstq, r4
+%if ARCH_X86_32
+ mov srcq, srcm
+ add srcq, 4
+ mov srcm, srcq
+%else
+ add r7, 4
+ mov srcq, r7
+%endif
+ sub r6d, 1<<16 ; horizontal--
+ jg .v_w4_loop0
+ RET
+%if ARCH_X86_64
+.v_w8:
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ lea r6d, [wq - 8] ; horizontal loop
+ mov r4, dstq
+ mov r7, srcq
+ shl r6d, 8 - 3; (wq / 8) << 8
+ mov r6b, hb
+.v_w8_loop0:
+ movq m4, [srcq+ssq*0] ; 0
+ movq m5, [srcq+ssq*1] ; 1
+ lea srcq, [srcq+ssq*2]
+ movq m6, [srcq+ssq*0] ; 2
+ movq m0, [srcq+ssq*1] ; 3
+ lea srcq, [srcq+ssq*2]
+ movq m1, [srcq+ssq*0] ; 4
+ movq m2, [srcq+ssq*1] ; 5
+ lea srcq, [srcq+ssq*2] ;
+ movq m3, [srcq+ssq*0] ; 6
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m2, 0x0c
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m3, 0x0c
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w8_loop:
+ movq m12, [srcq+ssq*1] ; 8
+ lea srcq, [srcq+ssq*2]
+ movq m13, [srcq+ssq*0] ; 9
+ pmaddubsw m14, m1, subpel0 ; a0
+ pmaddubsw m15, m2, subpel0 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, subpel1 ; a1
+ pmaddubsw m4, subpel1 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, subpel2 ; a2
+ pmaddubsw m6, subpel2 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, subpel3 ; a3
+ pmaddubsw m13, m6, subpel3 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ packuswb m14, m15
+ movq [dstq+dsq*0], xm14
+ movhps [dstq+dsq*1], xm14
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ movzx hd, r6b ; reset vertical loop
+ add r4, 8
+ add r7, 8
+ mov dstq, r4
+ mov srcq, r7
+ sub r6d, 1<<8 ; horizontal--
+ jg .v_w8_loop0
+ RET
+%endif ;ARCH_X86_64
+%undef subpel0
+%undef subpel1
+%undef subpel2
+%undef subpel3
+.hv:
+ %assign stack_offset org_stack_offset
+ cmp wd, 4
+ jg .hv_w8
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+%if ARCH_X86_32
+ movzx mxd, ssb
+ shr ssd, 16
+ cmp hd, 6
+ cmovs ssd, mxd
+ movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
+ W32_RESTORE_SSQ
+ lea r6, [ssq*3]
+ sub srcq, r6
+ %define base_reg r6
+ mov r6, r1; use as new base
+ %assign regs_used 2
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
+ mov dsq, [rstk+stack_offset+gprsize*2]
+ %define subpelv0 [rsp+mmsize*0]
+ %define subpelv1 [rsp+mmsize*1]
+ %define subpelv2 [rsp+mmsize*2]
+ %define subpelv3 [rsp+mmsize*3]
+ punpcklqdq m0, m0
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m6, m0, q0000
+ mova subpelv0, m6
+ pshufd m6, m0, q1111
+ mova subpelv1, m6
+ pshufd m6, m0, q2222
+ mova subpelv2, m6
+ pshufd m6, m0, q3333
+ mova subpelv3, m6
+%else
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
+ ALLOC_STACK mmsize*14, 14
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ %define subpelv0 m10
+ %define subpelv1 m11
+ %define subpelv2 m12
+ %define subpelv3 m13
+ punpcklqdq m0, m0
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ mova m8, [base+pw_8192]
+ mova m9, [base+pd_512]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+%endif
+ pshufd m7, m1, q0000
+ cmp wd, 4
+ je .hv_w4
+.hv_w2:
+ mova m6, [base+subpel_h_shuf4]
+ ;
+ movq m2, [srcq+ssq*0] ; 0
+ movhps m2, [srcq+ssq*1] ; 0 _ 1
+ movq m0, [srcq+ssq*2] ; 2
+%if ARCH_X86_32
+ %define w8192reg [base+pw_8192]
+ %define d512reg [base+pd_512]
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+ movhps m0, [srcq+ssq*0] ; 2 _ 3
+ lea srcq, [srcq+ssq*1]
+%else
+ %define w8192reg m8
+ %define d512reg m9
+ movhps m0, [srcq+ss3q ] ; 2 _ 3
+ lea srcq, [srcq+ssq*4]
+%endif
+ pshufb m2, m6 ; 0 ~ 1 ~
+ pshufb m0, m6 ; 2 ~ 3 ~
+ pmaddubsw m2, m7 ; subpel_filters
+ pmaddubsw m0, m7 ; subpel_filters
+ phaddw m2, m0 ; 0 1 2 3
+ pmulhrsw m2, w8192reg
+ ;
+ movq m3, [srcq+ssq*0] ; 4
+ movhps m3, [srcq+ssq*1] ; 4 _ 5
+ movq m0, [srcq+ssq*2] ; 6
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+%else
+ add srcq, ss3q
+%endif
+ pshufb m3, m6 ; 4 ~ 5 ~
+ pshufb m0, m6 ; 6 ~
+ pmaddubsw m3, m7 ; subpel_filters
+ pmaddubsw m0, m7 ; subpel_filters
+ phaddw m3, m0 ; 4 5 6 _
+ pmulhrsw m3, w8192reg
+ ;
+ palignr m4, m3, m2, 4; V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2
+ punpckhwd m2, m4 ; V 23 34 2 3 3 4
+ pshufd m0, m3, q2121; V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56 4 5 5 6
+.hv_w2_loop:
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2 ; V
+ pmaddwd m2, subpelv1 ; V a1 b1
+ paddd m5, m2 ; V
+ mova m2, m3 ; V
+ pmaddwd m3, subpelv2 ; a2 b2
+ paddd m5, m3 ; V
+ movq m4, [srcq+ssq*0] ; V 7
+ movhps m4, [srcq+ssq*1] ; V 7 8
+ lea srcq, [srcq+ssq*2] ; V
+ pshufb m4, m6
+ pmaddubsw m4, m7
+ phaddw m4, m4
+ pmulhrsw m4, w8192reg
+ palignr m3, m4, m0, 12
+ mova m0, m4
+ punpcklwd m3, m0 ; V 67 78
+ pmaddwd m4, m3, subpelv3 ; V a3 b3
+ paddd m5, d512reg
+ paddd m5, m4
+ psrad m5, 10
+ packssdw m5, m5
+ packuswb m5, m5
+ movd r4d, m5
+ mov [dstq+dsq*0], r4w
+ shr r4d, 16
+ mov [dstq+dsq*1], r4w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+%undef w8192reg
+%undef d512reg
+ ;
+.hv_w4:
+%define hv4_line_0_0 4
+%define hv4_line_0_1 5
+%define hv4_line_0_2 6
+%define hv4_line_0_3 7
+%define hv4_line_0_4 8
+%define hv4_line_0_5 9
+%define hv4_line_1_0 10
+%define hv4_line_1_1 11
+%define hv4_line_1_2 12
+%define hv4_line_1_3 13
+ ;
+%macro SAVELINE_W4 3
+ mova [rsp+mmsize*hv4_line_%3_%2], %1
+%endmacro
+%macro RESTORELINE_W4 3
+ mova %1, [rsp+mmsize*hv4_line_%3_%2]
+%endmacro
+ ;
+%if ARCH_X86_32
+ %define w8192reg [base+pw_8192]
+ %define d512reg [base+pd_512]
+%else
+ %define w8192reg m8
+ %define d512reg m9
+%endif
+ ; lower shuffle 0 1 2 3 4
+ mova m6, [base+subpel_h_shuf4]
+ movq m5, [srcq+ssq*0] ; 0 _ _ _
+ movhps m5, [srcq+ssq*1] ; 0 _ 1 _
+ movq m4, [srcq+ssq*2] ; 2 _ _ _
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+ movhps m4, [srcq+ssq*0] ; 2 _ 3 _
+ add srcq, ssq
+%else
+ movhps m4, [srcq+ss3q ] ; 2 _ 3 _
+ lea srcq, [srcq+ssq*4]
+%endif
+ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+ pmaddubsw m2, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m2, m0 ;H 0 1 2 3
+ pmulhrsw m2, w8192reg ;H pw_8192
+ SAVELINE_W4 m2, 2, 0
+ ; upper shuffle 2 3 4 5 6
+ mova m6, [base+subpel_h_shuf4+16]
+ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+ pmaddubsw m2, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m2, m0 ;H 0 1 2 3
+ pmulhrsw m2, w8192reg ;H pw_8192
+ ;
+ ; lower shuffle
+ mova m6, [base+subpel_h_shuf4]
+ movq m5, [srcq+ssq*0] ; 4 _ _ _
+ movhps m5, [srcq+ssq*1] ; 4 _ 5 _
+ movq m4, [srcq+ssq*2] ; 6 _ _ _
+ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+ pmaddubsw m3, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m3, m0 ;H 4 5 6 7
+ pmulhrsw m3, w8192reg ;H pw_8192
+ SAVELINE_W4 m3, 3, 0
+ ; upper shuffle
+ mova m6, [base+subpel_h_shuf4+16]
+ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+ pmaddubsw m3, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m3, m0 ;H 4 5 6 7
+ pmulhrsw m3, w8192reg ;H pw_8192
+ ;
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+%else
+ add srcq, ss3q
+%endif
+ ;process high
+ palignr m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ ;process low
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ palignr m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+.hv_w4_loop:
+ ;process low
+ pmaddwd m5, m1, subpelv0 ; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+ ;
+ mova m6, [base+subpel_h_shuf4]
+ movq m4, [srcq+ssq*0] ; 7
+ movhps m4, [srcq+ssq*1] ; 7 _ 8 _
+ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+ pmaddubsw m4, m7 ;H subpel_filters
+ phaddw m4, m4 ;H 7 8 7 8
+ pmulhrsw m4, w8192reg ;H pw_8192
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+ paddd m5, d512reg ; pd_512
+ paddd m5, m4
+ psrad m5, 10
+ SAVELINE_W4 m0, 0, 0
+ SAVELINE_W4 m1, 1, 0
+ SAVELINE_W4 m2, 2, 0
+ SAVELINE_W4 m3, 3, 0
+ SAVELINE_W4 m5, 5, 0
+ ;process high
+ RESTORELINE_W4 m0, 0, 1
+ RESTORELINE_W4 m1, 1, 1
+ RESTORELINE_W4 m2, 2, 1
+ RESTORELINE_W4 m3, 3, 1
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+ ;
+ mova m6, [base+subpel_h_shuf4+16]
+ movq m4, [srcq+ssq*0] ; 7
+ movhps m4, [srcq+ssq*1] ; 7 _ 8 _
+ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+ pmaddubsw m4, m7 ;H subpel_filters
+ phaddw m4, m4 ;H 7 8 7 8
+ pmulhrsw m4, w8192reg ;H pw_8192
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+ paddd m5, d512reg ; pd_512
+ paddd m5, m4
+ psrad m4, m5, 10
+ ;
+ RESTORELINE_W4 m5, 5, 0
+ packssdw m5, m4 ; d -> w
+ packuswb m5, m5 ; w -> b
+ pshuflw m5, m5, q3120
+ lea srcq, [srcq+ssq*2]
+ movd [dstq+dsq*0], m5
+ psrlq m5, 32
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ RESTORELINE_W4 m0, 0, 0
+ RESTORELINE_W4 m1, 1, 0
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ jg .hv_w4_loop
+ RET
+%undef subpelv0
+%undef subpelv1
+%undef subpelv2
+%undef subpelv3
+ ;
+.hv_w8:
+ %assign stack_offset org_stack_offset
+%define hv8_line_1 0
+%define hv8_line_2 1
+%define hv8_line_3 2
+%define hv8_line_4 3
+%define hv8_line_6 4
+%macro SAVELINE_W8 2
+ mova [rsp+hv8_line_%1*mmsize], %2
+%endmacro
+%macro RESTORELINE_W8 2
+ mova %2, [rsp+hv8_line_%1*mmsize]
+%endmacro
+ shr mxd, 16
+ sub srcq, 3
+%if ARCH_X86_32
+ %define base_reg r1
+ %define subpelh0 [rsp+mmsize*5]
+ %define subpelh1 [rsp+mmsize*6]
+ %define subpelv0 [rsp+mmsize*7]
+ %define subpelv1 [rsp+mmsize*8]
+ %define subpelv2 [rsp+mmsize*9]
+ %define subpelv3 [rsp+mmsize*10]
+ %define accuv0 [rsp+mmsize*11]
+ %define accuv1 [rsp+mmsize*12]
+ movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
+ movzx mxd, ssb
+ shr ssd, 16
+ cmp hd, 6
+ cmovs ssd, mxd
+ movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
+ mov ssq, ssmp
+ ALLOC_STACK -mmsize*13
+%if STACK_ALIGNMENT < 16
+ %define srcm [rsp+mmsize*13+gprsize*1]
+ %define dsm [rsp+mmsize*13+gprsize*2]
+ mov r6, [rstk+stack_offset+gprsize*2]
+ mov dsm, r6
+%endif
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ punpcklbw m5, m5
+ psraw m5, 8 ; sign-extend
+ pshufd m2, m5, q0000
+ pshufd m3, m5, q1111
+ pshufd m4, m5, q2222
+ pshufd m5, m5, q3333
+ mova subpelh0, m0
+ mova subpelh1, m1
+ mova subpelv0, m2
+ mova subpelv1, m3
+ mova subpelv2, m4
+ mova subpelv3, m5
+ lea r6, [ssq*3]
+ sub srcq, r6
+ mov srcm, srcq
+%else
+ ALLOC_STACK mmsize*5, 16
+ %define subpelh0 m10
+ %define subpelh1 m11
+ %define subpelv0 m12
+ %define subpelv1 m13
+ %define subpelv2 m14
+ %define subpelv3 m15
+ %define accuv0 m8
+ %define accuv1 m9
+ movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m1, [base_reg+myq*8+subpel_filters-put_ssse3]
+ pshufd subpelh0, m0, q0000
+ pshufd subpelh1, m0, q1111
+ punpcklqdq m1, m1
+ punpcklbw m1, m1
+ psraw m1, 8 ; sign-extend
+ pshufd subpelv0, m1, q0000
+ pshufd subpelv1, m1, q1111
+ pshufd subpelv2, m1, q2222
+ pshufd subpelv3, m1, q3333
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ mov r7, srcq
+%endif
+ lea r6d, [wq-4]
+ mov r4, dstq
+ shl r6d, (16 - 2)
+ mov r6w, hw
+.hv_w8_loop0:
+ movu m4, [srcq+ssq*0] ; 0 = _ _
+ movu m5, [srcq+ssq*1] ; 1 = _ _
+ lea srcq, [srcq+ssq*2]
+ ;
+%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+ %if ARCH_X86_32
+ pshufb %3, %1, [base+subpel_h_shufB]
+ pshufb %4, %1, [base+subpel_h_shufC]
+ pshufb %1, [base+subpel_h_shufA]
+ %else
+ pshufb %3, %1, %6 ; subpel_h_shufB
+ pshufb %4, %1, %7 ; subpel_h_shufC
+ pshufb %1, %5 ; subpel_h_shufA
+ %endif
+ pmaddubsw %2, %3, subpelh0 ; subpel +0 C0
+ pmaddubsw %4, subpelh1; subpel +4 B4
+ pmaddubsw %3, subpelh1; C4
+ pmaddubsw %1, subpelh0; A0
+ paddw %2, %4 ; C0+B4
+ paddw %1, %3 ; A0+C4
+ phaddw %1, %2
+%endmacro
+ ;
+%if ARCH_X86_64
+ mova m7, [base+subpel_h_shufA]
+ mova m8, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+%endif
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
+ movu m6, [srcq+ssq*0] ; 2 = _ _
+ movu m0, [srcq+ssq*1] ; 3 = _ _
+ lea srcq, [srcq+ssq*2]
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
+ ;
+ mova m7, [base+pw_8192]
+ pmulhrsw m4, m7 ; H pw_8192
+ pmulhrsw m5, m7 ; H pw_8192
+ pmulhrsw m6, m7 ; H pw_8192
+ pmulhrsw m0, m7 ; H pw_8192
+ punpcklwd m1, m4, m5 ; 0 1 ~
+ punpcklwd m2, m5, m6 ; 1 2 ~
+ punpcklwd m3, m6, m0 ; 2 3 ~
+ SAVELINE_W8 1, m1
+ SAVELINE_W8 2, m2
+ SAVELINE_W8 3, m3
+ ;
+ mova m7, [base+subpel_h_shufA]
+ movu m4, [srcq+ssq*0] ; 4 = _ _
+ movu m5, [srcq+ssq*1] ; 5 = _ _
+ lea srcq, [srcq+ssq*2]
+ movu m6, [srcq+ssq*0] ; 6 = _ _
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
+ mova m7, [base+pw_8192]
+ pmulhrsw m1, m4, m7 ; H pw_8192 4 ~
+ pmulhrsw m2, m5, m7 ; H pw_8192 5 ~
+ pmulhrsw m3, m6, m7 ; H pw_8192 6 ~
+ punpcklwd m4, m0, m1 ; 3 4 ~
+ punpcklwd m5, m1, m2 ; 4 5 ~
+ punpcklwd m6, m2, m3 ; 5 6 ~
+ ;
+ SAVELINE_W8 6, m3
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+.hv_w8_loop:
+ ; m8 accu for V a
+ ; m9 accu for V b
+ SAVELINE_W8 1, m3
+ SAVELINE_W8 2, m4
+ SAVELINE_W8 3, m5
+ SAVELINE_W8 4, m6
+%if ARCH_X86_32
+ pmaddwd m0, m1, subpelv0 ; a0
+ pmaddwd m7, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m0, m3
+ paddd m7, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m0, m5
+ paddd m7, m6
+ mova m5, [base+pd_512]
+ paddd m0, m5 ; pd_512
+ paddd m7, m5 ; pd_512
+ mova accuv0, m0
+ mova accuv1, m7
+%else
+ pmaddwd m8, m1, subpelv0 ; a0
+ pmaddwd m9, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ mova m7, [base+pd_512]
+ paddd m8, m7 ; pd_512
+ paddd m9, m7 ; pd_512
+ mova m7, [base+subpel_h_shufB]
+ mova m6, [base+subpel_h_shufC]
+ mova m5, [base+subpel_h_shufA]
+%endif
+ movu m0, [srcq+ssq*1] ; 7
+ movu m4, [srcq+ssq*2] ; 8
+ lea srcq, [srcq+ssq*2]
+ HV_H_W8 m0, m1, m2, m3, m5, m7, m6
+ HV_H_W8 m4, m1, m2, m3, m5, m7, m6
+ mova m5, [base+pw_8192]
+ pmulhrsw m0, m5 ; H pw_8192
+ pmulhrsw m4, m5 ; H pw_8192
+ RESTORELINE_W8 6, m6
+ punpcklwd m5, m6, m0 ; 6 7 ~
+ punpcklwd m6, m0, m4 ; 7 8 ~
+ pmaddwd m1, m5, subpelv3 ; a3
+ paddd m2, m1, accuv0
+ pmaddwd m1, m6, subpelv3 ; b3
+ paddd m1, m1, accuv1 ; H + V
+ psrad m2, 10
+ psrad m1, 10
+ packssdw m2, m1 ; d -> w
+ packuswb m2, m1 ; w -> b
+ movd [dstq+dsq*0], m2
+ psrlq m2, 32
+%if ARCH_X86_32
+ add dstq, dsm
+ movd [dstq+dsq*0], m2
+ add dstq, dsm
+%else
+ movd [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+%endif
+ sub hd, 2
+ jle .hv_w8_outer
+ SAVELINE_W8 6, m4
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+ RESTORELINE_W8 4, m4
+ jmp .hv_w8_loop
+.hv_w8_outer:
+ movzx hd, r6w
+ add r4, 4
+ mov dstq, r4
+%if ARCH_X86_32
+ mov srcq, srcm
+ add srcq, 4
+ mov srcm, srcq
+%else
+ add r7, 4
+ mov srcq, r7
+%endif
+ sub r6d, 1<<16
+ jg .hv_w8_loop0
+ RET
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 1, 2
+%elif WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+%macro PREP_8TAP_FN 3 ; type, type_h, type_v
+cglobal prep_8tap_%1
+ mov t0d, FILTER_%2
+ mov t1d, FILTER_%3
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
+%endif
+%endmacro
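+; Each prep_8tap_<type> entry point only loads the horizontal/vertical
+; filter types into t0d/t1d and then jumps into the shared prep_8tap body;
+; sharp_smooth is emitted last so it can simply fall through without the jump.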
+
+PREP_8TAP_FN regular, REGULAR, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+
+%if ARCH_X86_32
+ %define base_reg r2
+ %define base base_reg-prep_ssse3
+ %define W32_RESTORE_SSQ mov strideq, stridem
+%else
+ %define base_reg r7
+ %define base 0
+ %define W32_RESTORE_SSQ
+%endif
+
+cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+%assign org_stack_offset stack_offset
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ movsxd wq, wm
+ movifnidn srcd, srcm
+ movifnidn hd, hm
+ LEA base_reg, prep_ssse3
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [base_reg+wq*2+table_offset(prep,)]
+ add wq, base_reg
+ movifnidn strided, stridem
+ lea r6, [strideq*3]
+ %assign stack_offset org_stack_offset
+%if WIN64
+ pop r8
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ WIN64_SPILL_XMM 12
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+%if ARCH_X86_64
+ mova m10, [base+subpel_h_shufA]
+ mova m11, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+%endif
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
+ movd m5, [base_reg+mxq*8+subpel_filters-prep_ssse3+0]
+ pshufd m5, m5, q0000
+ movd m6, [base_reg+mxq*8+subpel_filters-prep_ssse3+4]
+ pshufd m6, m6, q0000
+ mova m7, [base+pw_8192]
+ add wq, base_reg
+ jmp wq
+.h_w4:
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ movd m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
+ pshufd m4, m4, q0000
+ mova m6, [base+pw_8192]
+ mova m5, [base+subpel_h_shufA]
+ W32_RESTORE_SSQ
+%if ARCH_X86_64
+ lea stride3q, [strideq*3]
+%endif
+.h_w4_loop:
+ movq m0, [srcq+strideq*0] ; 0
+ movq m1, [srcq+strideq*1] ; 1
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movq m2, [srcq+strideq*0] ; 2
+ movq m3, [srcq+strideq*1] ; 3
+ lea srcq, [srcq+strideq*2]
+%else
+ movq m2, [srcq+strideq*2] ; 2
+ movq m3, [srcq+stride3q ] ; 3
+ lea srcq, [srcq+strideq*4]
+%endif
+ pshufb m0, m5 ; subpel_h_shufA
+ pshufb m1, m5
+ pshufb m2, m5
+ pshufb m3, m5
+ pmaddubsw m0, m4 ; subpel_filters + 2
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmaddubsw m3, m4
+ phaddw m0, m1
+ phaddw m2, m3
+ pmulhrsw m0, m6 ; pw_8192
+ pmulhrsw m2, m6 ; pw_8192
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m2
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+ ;
+%macro PREP_8TAP_H 4 ; dst/src, tmp[1-3]
+%if ARCH_X86_32
+ pshufb %2, %1, [base+subpel_h_shufB]
+ pshufb %3, %1, [base+subpel_h_shufC]
+ pshufb %1, [base+subpel_h_shufA]
+%else
+ pshufb %2, %1, m11; subpel_h_shufB
+ pshufb %3, %1, m9 ; subpel_h_shufC
+ pshufb %1, m10 ; subpel_h_shufA
+%endif
+ pmaddubsw %4, %2, m5 ; subpel +0 B0
+ pmaddubsw %2, m6 ; subpel +4 B4
+ pmaddubsw %3, m6 ; subpel +4 C4
+ pmaddubsw %1, m5 ; subpel +0 A0
+ paddw %3, %4
+ paddw %1, %2
+ phaddw %1, %3
+ pmulhrsw %1, m7 ; 8192
+%endmacro
+ ;
+.h_w8:
+%if ARCH_X86_32
+ mov r3, r2
+ %define base_reg r3
+ W32_RESTORE_SSQ
+%endif
+.h_w8_loop:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H m0, m2, m3, m4
+ PREP_8TAP_H m1, m2, m3, m4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8_loop
+ RET
+.h_w16:
+ xor r6d, r6d
+ jmp .h_start
+.h_w32:
+ mov r6, -16*1
+ jmp .h_start
+.h_w64:
+ mov r6, -16*3
+ jmp .h_start
+.h_w128:
+ mov r6, -16*7
+.h_start:
+%if ARCH_X86_32
+ mov r3, r2
+ %define base_reg r3
+%endif
+ sub srcq, r6
+ mov r5, r6
+ W32_RESTORE_SSQ
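+ ; r6 = 16 - w here, and srcq was advanced by w - 16 above, so [srcq+r6]
+ ; points at the start of the row; the loop below runs until r6 turns
+ ; positive, i.e. w/16 iterations of 16 output pixels per row.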
+.h_loop:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ PREP_8TAP_H m0, m2, m3, m4
+ PREP_8TAP_H m1, m2, m3, m4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 32
+ add r6, 16
+ jle .h_loop
+ add srcq, strideq
+ mov r6, r5
+ dec hd
+ jg .h_loop
+ RET
+%if ARCH_X86_32
+ %define base_reg r2
+%endif
+
+.v:
+%if ARCH_X86_32
+ mov mxd, myd
+ and mxd, 0x7f
+%else
+ %assign stack_offset org_stack_offset
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb
+%endif
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ lea myq, [base_reg+myq*8+subpel_filters-prep_ssse3]
+ mova m2, [base+pw_512]
+ psrlw m2, m2, 1 ; 0x0100
+ mova m7, [base+pw_8192]
+%if ARCH_X86_32
+ %define subpel0 [rsp+mmsize*0]
+ %define subpel1 [rsp+mmsize*1]
+ %define subpel2 [rsp+mmsize*2]
+ %define subpel3 [rsp+mmsize*3]
+%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
+ ALLOC_STACK -mmsize*4
+%assign regs_used 7
+ movd m0, [myq+0]
+ pshufb m0, m2
+ mova subpel0, m0
+ movd m0, [myq+2]
+ pshufb m0, m2
+ mova subpel1, m0
+ movd m0, [myq+4]
+ pshufb m0, m2
+ mova subpel2, m0
+ movd m0, [myq+6]
+ pshufb m0, m2
+ mova subpel3, m0
+ mov strideq, [rstk+stack_offset+gprsize*3]
+ lea strideq, [strideq*3]
+ sub [rstk+stack_offset+gprsize*2], strideq
+ mov strideq, [rstk+stack_offset+gprsize*3]
+ mov srcq, [rstk+stack_offset+gprsize*2]
+%else
+ %define subpel0 m8
+ %define subpel1 m9
+ %define subpel2 m10
+ %define subpel3 m11
+ movd subpel0, [myq+0]
+ pshufb subpel0, m2
+ movd subpel1, [myq+2]
+ pshufb subpel1, m2
+ movd subpel2, [myq+4]
+ pshufb subpel2, m2
+ movd subpel3, [myq+6]
+ pshufb subpel3, m2
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ cmp wd, 8
+ jg .v_w16
+ je .v_w8
+%endif
+.v_w4:
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+ %define srcm [rsp+mmsize*4+gprsize*1]
+ %define tmpm [rsp+mmsize*4+gprsize*2]
+%endif
+ mov tmpm, tmpq
+ mov srcm, srcq
+ lea r5d, [wq - 4] ; horizontal loop
+ shl r5d, (16 - 2) ; ((w / 4) - 1) << 16
+ mov r5w, hw
+.v_w4_loop0:
+%endif
+ movd m2, [srcq+strideq*0] ; 0
+ movhps m2, [srcq+strideq*2] ; 0 _ 2
+ movd m3, [srcq+strideq*1] ; 1
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movhps m3, [srcq+strideq*1] ; 1 _ 3
+ lea srcq, [srcq+strideq*2]
+%else
+ movhps m3, [srcq+stride3q ] ; 1 _ 3
+ lea srcq, [srcq+strideq*4]
+%endif
+ pshufd m2, m2, q2020 ; 0 2 0 2
+ pshufd m3, m3, q2020 ; 1 3 1 3
+ punpckldq m2, m3 ; 0 1 2 3
+ movd m3, [srcq+strideq*0] ; 4
+ movd m1, [srcq+strideq*1] ; 5
+ movd m0, [srcq+strideq*2] ; 6
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ add srcq, strideq
+%else
+ add srcq, stride3q
+%endif
+ punpckldq m3, m1 ; 4 5 _ _
+ punpckldq m1, m0 ; 5 6 _ _
+ palignr m4, m3, m2, 4 ; 1 2 3 4
+ punpcklbw m3, m1 ; 45 56
+ punpcklbw m1, m2, m4 ; 01 12
+ punpckhbw m2, m4 ; 23 34
+.v_w4_loop:
+ pmaddubsw m5, m1, subpel0 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, subpel1 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, subpel2 ; a2 b2
+ paddw m5, m3
+ movd m4, [srcq+strideq*0]
+ punpckldq m3, m0, m4 ; 6 7 _ _
+ movd m0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpckldq m4, m0 ; 7 8 _ _
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, subpel3 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ movq [tmpq+wq*0], m5
+ movhps [tmpq+wq*2], m5
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w4_loop
+%if ARCH_X86_32
+ mov hw, r5w ; reset vertical loop
+ mov tmpq, tmpm
+ mov srcq, srcm
+ add tmpq, 8
+ add srcq, 4
+ mov tmpm, tmpq
+ mov srcm, srcq
+ sub r5d, 1<<16 ; horizontal--
+ jg .v_w4_loop0
+%endif
+ RET
+
+%if ARCH_X86_64
+.v_w8:
+.v_w16:
+ lea r5d, [wq - 8] ; horizontal loop
+ mov r8, tmpq
+ mov r6, srcq
+ shl r5d, 8 - 3 ; ((w / 8) - 1) << 8
+ mov r5b, hb
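+ ; byte-packed counters: bits 31:8 hold the remaining 8-pixel columns minus
+ ; one, bits 7:0 hold h, reloaded per column with 'movzx hd, r5b'.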
+.v_w8_loop0:
+ movq m4, [srcq+strideq*0] ; 0
+ movq m5, [srcq+strideq*1] ; 1
+ lea srcq, [srcq+strideq*2]
+ movq m6, [srcq+strideq*0] ; 2
+ movq m0, [srcq+strideq*1] ; 3
+ lea srcq, [srcq+strideq*2]
+ movq m1, [srcq+strideq*0] ; 4
+ movq m2, [srcq+strideq*1] ; 5
+ lea srcq, [srcq+strideq*2] ;
+ movq m3, [srcq+strideq*0] ; 6
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m2, 0x0c
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m3, 0x0c
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w8_loop:
+ movq m12, [srcq+strideq*1] ; 8
+ lea srcq, [srcq+strideq*2]
+ movq m13, [srcq+strideq*0] ; 9
+ pmaddubsw m14, m1, subpel0 ; a0
+ pmaddubsw m15, m2, subpel0 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, subpel1 ; a1
+ pmaddubsw m4, subpel1 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, subpel2 ; a2
+ pmaddubsw m6, subpel2 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, subpel3 ; a3
+ pmaddubsw m13, m6, subpel3 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ movu [tmpq+wq*0], xm14
+ movu [tmpq+wq*2], xm15
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w8_loop
+ movzx hd, r5b ; reset vertical loop
+ add r8, 16
+ add r6, 8
+ mov tmpq, r8
+ mov srcq, r6
+ sub r5d, 1<<8 ; horizontal--
+ jg .v_w8_loop0
+ RET
+%endif ;ARCH_X86_64
+%undef subpel0
+%undef subpel1
+%undef subpel2
+%undef subpel3
+
+.hv:
+ %assign stack_offset org_stack_offset
+ cmp wd, 4
+ jg .hv_w8
+ and mxd, 0x7f
+ movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
+%if ARCH_X86_32
+ mov mxd, myd
+ shr myd, 16
+ and mxd, 0x7f
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
+ mov r5, r2; use as new base
+ %define base_reg r5
+ %assign regs_used 2
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
+ mov strideq, [rstk+stack_offset+gprsize*3]
+ lea strideq, [strideq*3 + 1]
+ sub [rstk+stack_offset+gprsize*2], strideq
+ mov strideq, [rstk+stack_offset+gprsize*3]
+ mov srcq, [rstk+stack_offset+gprsize*2]
+ %define subpelv0 [rsp+mmsize*0]
+ %define subpelv1 [rsp+mmsize*1]
+ %define subpelv2 [rsp+mmsize*2]
+ %define subpelv3 [rsp+mmsize*3]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m6, m0, q0000
+ mova subpelv0, m6
+ pshufd m6, m0, q1111
+ mova subpelv1, m6
+ pshufd m6, m0, q2222
+ mova subpelv2, m6
+ pshufd m6, m0, q3333
+ mova subpelv3, m6
+%else
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
+ ALLOC_STACK mmsize*14, 14
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ dec srcq
+ %define subpelv0 m10
+ %define subpelv1 m11
+ %define subpelv2 m12
+ %define subpelv3 m13
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ mova m8, [base+pw_8192]
+ mova m9, [base+pd_32]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+%endif
+ pshufd m7, m1, q0000
+.hv_w4:
+%define hv4_line_0_0 4
+%define hv4_line_0_1 5
+%define hv4_line_0_2 6
+%define hv4_line_0_3 7
+%define hv4_line_0_4 8
+%define hv4_line_0_5 9
+%define hv4_line_1_0 10
+%define hv4_line_1_1 11
+%define hv4_line_1_2 12
+%define hv4_line_1_3 13
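+; The SAVELINE_W4/RESTORELINE_W4 helpers defined in the put code earlier in
+; this file spill/restore xmm registers through these numbered stack slots
+; (presumably [rsp+slot*mmsize]), letting the 'low' and 'high' halves of the
+; 4-pixel hv pipeline below fit in the small register file.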
+ ;
+ ;
+%if ARCH_X86_32
+ %define w8192reg [base+pw_8192]
+ %define d32reg [base+pd_32]
+%else
+ %define w8192reg m8
+ %define d32reg m9
+%endif
+ ; lower shuffle 0 1 2 3 4
+ mova m6, [base+subpel_h_shuf4]
+ movq m5, [srcq+strideq*0] ; 0 _ _ _
+ movhps m5, [srcq+strideq*1] ; 0 _ 1 _
+ movq m4, [srcq+strideq*2] ; 2 _ _ _
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ add srcq, strideq
+ movhps m4, [srcq+strideq*0] ; 2 _ 3 _
+ add srcq, strideq
+%else
+ movhps m4, [srcq+stride3q ] ; 2 _ 3 _
+ lea srcq, [srcq+strideq*4]
+%endif
+ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+ pmaddubsw m2, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m2, m0 ;H 0 1 2 3
+ pmulhrsw m2, w8192reg ;H pw_8192
+ SAVELINE_W4 m2, 2, 0
+ ; upper shuffle 2 3 4 5 6
+ mova m6, [base+subpel_h_shuf4+16]
+ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+ pmaddubsw m2, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m2, m0 ;H 0 1 2 3
+ pmulhrsw m2, w8192reg ;H pw_8192
+ ;
+ ; lower shuffle
+ mova m6, [base+subpel_h_shuf4]
+ movq m5, [srcq+strideq*0] ; 4 _ _ _
+ movhps m5, [srcq+strideq*1] ; 4 _ 5 _
+ movq m4, [srcq+strideq*2] ; 6 _ _ _
+ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+ pmaddubsw m3, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m3, m0 ;H 4 5 6 7
+ pmulhrsw m3, w8192reg ;H pw_8192
+ SAVELINE_W4 m3, 3, 0
+ ; upper shuffle
+ mova m6, [base+subpel_h_shuf4+16]
+ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+ pmaddubsw m3, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m3, m0 ;H 4 5 6 7
+ pmulhrsw m3, w8192reg ;H pw_8192
+ ;
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ add srcq, strideq
+%else
+ add srcq, stride3q
+%endif
+ ;process high
+ palignr m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ ;process low
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ palignr m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+.hv_w4_loop:
+ ;process low
+ pmaddwd m5, m1, subpelv0 ; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+ ;
+ mova m6, [base+subpel_h_shuf4]
+ movq m4, [srcq+strideq*0] ; 7
+ movhps m4, [srcq+strideq*1] ; 7 _ 8 _
+ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+ pmaddubsw m4, m7 ;H subpel_filters
+ phaddw m4, m4 ;H 7 8 7 8
+ pmulhrsw m4, w8192reg ;H pw_8192
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+ paddd m5, d32reg ; pd_32
+ paddd m5, m4
+ psrad m5, 6
+ SAVELINE_W4 m0, 0, 0
+ SAVELINE_W4 m1, 1, 0
+ SAVELINE_W4 m2, 2, 0
+ SAVELINE_W4 m3, 3, 0
+ SAVELINE_W4 m5, 5, 0
+ ;process high
+ RESTORELINE_W4 m0, 0, 1
+ RESTORELINE_W4 m1, 1, 1
+ RESTORELINE_W4 m2, 2, 1
+ RESTORELINE_W4 m3, 3, 1
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+ ;
+ mova m6, [base+subpel_h_shuf4+16]
+ movq m4, [srcq+strideq*0] ; 7
+ movhps m4, [srcq+strideq*1] ; 7 _ 8 _
+ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+ pmaddubsw m4, m7 ;H subpel_filters
+ phaddw m4, m4 ;H 7 8 7 8
+ pmulhrsw m4, w8192reg ;H pw_8192
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+ paddd m5, d32reg ; pd_32
+ paddd m5, m4
+ psrad m4, m5, 6
+ ;
+ RESTORELINE_W4 m5, 5, 0
+ packssdw m5, m4
+ pshufd m5, m5, q3120
+ movu [tmpq], m5
+ lea srcq, [srcq+strideq*2]
+ add tmpq, 16
+ sub hd, 2
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ RESTORELINE_W4 m0, 0, 0
+ RESTORELINE_W4 m1, 1, 0
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ jg .hv_w4_loop
+ RET
+%undef subpelv0
+%undef subpelv1
+%undef subpelv2
+%undef subpelv3
+ ;
+
+
+.hv_w8:
+ %assign stack_offset org_stack_offset
+%define hv8_line_1 0
+%define hv8_line_2 1
+%define hv8_line_3 2
+%define hv8_line_4 3
+%define hv8_line_6 4
+ shr mxd, 16
+%if ARCH_X86_32
+ %define base_reg r2
+ %define subpelh0 [rsp+mmsize*5]
+ %define subpelh1 [rsp+mmsize*6]
+ %define subpelv0 [rsp+mmsize*7]
+ %define subpelv1 [rsp+mmsize*8]
+ %define subpelv2 [rsp+mmsize*9]
+ %define subpelv3 [rsp+mmsize*10]
+ %define accuv0 [rsp+mmsize*11]
+ %define accuv1 [rsp+mmsize*12]
+ movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
+ mov mxd, myd
+ shr myd, 16
+ and mxd, 0x7f
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
+ ALLOC_STACK -mmsize*13
+%if STACK_ALIGNMENT < mmsize
+ mov rstk, r2m
+ %define tmpm [rsp+mmsize*13+gprsize*1]
+ %define srcm [rsp+mmsize*13+gprsize*2]
+ %define stridem [rsp+mmsize*13+gprsize*3]
+ mov stridem, rstk
+%endif
+ mov r6, r2
+%define base_reg r6
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ punpcklbw m5, m5
+ psraw m5, 8 ; sign-extend
+ pshufd m2, m5, q0000
+ pshufd m3, m5, q1111
+ pshufd m4, m5, q2222
+ pshufd m5, m5, q3333
+ mova subpelh0, m0
+ mova subpelh1, m1
+ mova subpelv0, m2
+ mova subpelv1, m3
+ mova subpelv2, m4
+ mova subpelv3, m5
+ W32_RESTORE_SSQ
+ lea strided, [strided*3]
+ sub srcd, strided
+ sub srcd, 3
+ mov srcm, srcd
+ W32_RESTORE_SSQ
+%else
+ ALLOC_STACK mmsize*5, 16
+ %define subpelh0 m10
+ %define subpelh1 m11
+ %define subpelv0 m12
+ %define subpelv1 m13
+ %define subpelv2 m14
+ %define subpelv3 m15
+ %define accuv0 m8
+ %define accuv1 m9
+ movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
+ pshufd subpelh0, m0, q0000
+ pshufd subpelh1, m0, q1111
+ punpcklbw m1, m1
+ psraw m1, 8 ; sign-extend
+ pshufd subpelv0, m1, q0000
+ pshufd subpelv1, m1, q1111
+ pshufd subpelv2, m1, q2222
+ pshufd subpelv3, m1, q3333
+ lea stride3q, [strideq*3]
+ sub srcq, 3
+ sub srcq, stride3q
+ mov r6, srcq
+%endif
+ lea r5d, [wq-4]
+%if ARCH_X86_64
+ mov r8, tmpq
+%else
+ mov tmpm, tmpq
+%endif
+ shl r5d, (16 - 2)
+ mov r5w, hw
+.hv_w8_loop0:
+ movu m4, [srcq+strideq*0] ; 0 = _ _
+ movu m5, [srcq+strideq*1] ; 1 = _ _
+ lea srcq, [srcq+strideq*2]
+%if ARCH_X86_64
+ mova m7, [base+subpel_h_shufA]
+ mova m8, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+%endif
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
+ movu m6, [srcq+strideq*0] ; 2 = _ _
+ movu m0, [srcq+strideq*1] ; 3 = _ _
+ lea srcq, [srcq+strideq*2]
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
+ ;
+ mova m7, [base+pw_8192]
+ pmulhrsw m4, m7 ; H pw_8192
+ pmulhrsw m5, m7 ; H pw_8192
+ pmulhrsw m6, m7 ; H pw_8192
+ pmulhrsw m0, m7 ; H pw_8192
+ punpcklwd m1, m4, m5 ; 0 1 ~
+ punpcklwd m2, m5, m6 ; 1 2 ~
+ punpcklwd m3, m6, m0 ; 2 3 ~
+ SAVELINE_W8 1, m1
+ SAVELINE_W8 2, m2
+ SAVELINE_W8 3, m3
+ ;
+ mova m7, [base+subpel_h_shufA]
+ movu m4, [srcq+strideq*0] ; 4 = _ _
+ movu m5, [srcq+strideq*1] ; 5 = _ _
+ lea srcq, [srcq+strideq*2]
+ movu m6, [srcq+strideq*0] ; 6 = _ _
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
+ mova m7, [base+pw_8192]
+ pmulhrsw m1, m4, m7 ; H pw_8192 4 ~
+ pmulhrsw m2, m5, m7 ; H pw_8192 5 ~
+ pmulhrsw m3, m6, m7 ; H pw_8192 6 ~
+ punpcklwd m4, m0, m1 ; 3 4 ~
+ punpcklwd m5, m1, m2 ; 4 5 ~
+ punpcklwd m6, m2, m3 ; 5 6 ~
+ ;
+ SAVELINE_W8 6, m3
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+.hv_w8_loop:
+ ; m8 accu for V a
+ ; m9 accu for V b
+ SAVELINE_W8 1, m3
+ SAVELINE_W8 2, m4
+ SAVELINE_W8 3, m5
+ SAVELINE_W8 4, m6
+%if ARCH_X86_32
+ pmaddwd m0, m1, subpelv0 ; a0
+ pmaddwd m7, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m0, m3
+ paddd m7, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m0, m5
+ paddd m7, m6
+ mova m5, [base+pd_32]
+ paddd m0, m5 ; pd_32
+ paddd m7, m5 ; pd_32
+ mova accuv0, m0
+ mova accuv1, m7
+%else
+ pmaddwd m8, m1, subpelv0 ; a0
+ pmaddwd m9, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ mova m7, [base+pd_32]
+ paddd m8, m7 ; pd_32
+ paddd m9, m7 ; pd_32
+ mova m7, [base+subpel_h_shufB]
+ mova m6, [base+subpel_h_shufC]
+ mova m5, [base+subpel_h_shufA]
+%endif
+ movu m0, [srcq+strideq*1] ; 7
+ movu m4, [srcq+strideq*2] ; 8
+ lea srcq, [srcq+strideq*2]
+ HV_H_W8 m0, m1, m2, m3, m5, m7, m6
+ HV_H_W8 m4, m1, m2, m3, m5, m7, m6
+ mova m5, [base+pw_8192]
+ pmulhrsw m0, m5 ; H pw_8192
+ pmulhrsw m4, m5 ; H pw_8192
+ RESTORELINE_W8 6, m6
+ punpcklwd m5, m6, m0 ; 6 7 ~
+ punpcklwd m6, m0, m4 ; 7 8 ~
+ pmaddwd m1, m5, subpelv3 ; a3
+ paddd m2, m1, accuv0
+ pmaddwd m1, m6, subpelv3 ; b3
+ paddd m1, m1, accuv1 ; H + V
+ psrad m2, 6
+ psrad m1, 6
+ packssdw m2, m1 ; d -> w
+ movq [tmpq+wq*0], m2
+ movhps [tmpq+wq*2], m2
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jle .hv_w8_outer
+ SAVELINE_W8 6, m4
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+ RESTORELINE_W8 4, m4
+ jmp .hv_w8_loop
+.hv_w8_outer:
+ movzx hd, r5w
+%if ARCH_X86_32
+ add dword tmpm, 8
+ mov tmpq, tmpm
+ mov srcq, srcm
+ add srcq, 4
+ mov srcm, srcq
+%else
+ add r8, 8
+ mov tmpq, r8
+ add r6, 4
+ mov srcq, r6
+%endif
+ sub r5d, 1<<16
+ jg .hv_w8_loop0
+ RET
+
+%if ARCH_X86_32
+ %macro SAVE_ALPHA_BETA 0
+ mov alpham, alphad
+ mov betam, betad
+ %endmacro
+
+ %macro SAVE_DELTA_GAMMA 0
+ mov deltam, deltad
+ mov gammam, gammad
+ %endmacro
+
+ %macro LOAD_ALPHA_BETA_MX 0
+ mov mym, myd
+ mov alphad, alpham
+ mov betad, betam
+ mov mxd, mxm
+ %endmacro
+
+ %macro LOAD_DELTA_GAMMA_MY 0
+ mov mxm, mxd
+ mov deltad, deltam
+ mov gammad, gammam
+ mov myd, mym
+ %endmacro
+
+ %define PIC_reg r2
+ %define PIC_base_offset $$
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+%else
+ %define SAVE_ALPHA_BETA
+ %define SAVE_DELTA_GAMMA
+ %define PIC_sym(sym) sym
+%endif
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < required_stack_alignment
+ %assign copy_args 8*4
+ %else
+ %assign copy_args 0
+ %endif
+%endif
+
+%macro RELOC_ARGS 0
+ %if copy_args
+ mov r0, r0m
+ mov r1, r1m
+ mov r2, r2m
+ mov r3, r3m
+ mov r5, r5m
+ mov dstm, r0
+ mov dsm, r1
+ mov srcm, r2
+ mov ssm, r3
+ mov mxm, r5
+ mov r0, r6m
+ mov mym, r0
+ %endif
+%endmacro
+
+%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
+ %if cpuflag(sse4)
+ pblendw %1, %2, 0xAA
+ %else
+ pand %2, m10
+ por %1, %2
+ %endif
+%endmacro
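+; BLENDHWDW keeps the even (low) words of %1 and takes the odd (high) words
+; of %2; the SSE2 path relies on m10 holding the per-dword 0xffff0000 blend
+; mask set up in .main and on %1 already having zeroed high words.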
+
+%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
+ %if ARCH_X86_32
+ %define m8 m4
+ %define m9 m5
+ %define m14 m6
+ %define m15 m7
+ %define m11 m7
+ %endif
+ %if notcpuflag(ssse3) || ARCH_X86_32
+ pxor m11, m11
+ %endif
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq m2, [filterq+myq *8] ; a
+ movq m8, [filterq+tmp1q*8] ; e
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+deltaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq m3, [filterq+tmp2q*8] ; b
+ movq m0, [filterq+tmp1q*8] ; f
+ punpcklwd m2, m3
+ punpcklwd m8, m0
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq m0, [filterq+myq *8] ; c
+ movq m9, [filterq+tmp1q*8] ; g
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+gammaq] ; my += gamma
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq m3, [filterq+tmp2q*8] ; d
+ movq m1, [filterq+tmp1q*8] ; h
+ punpcklwd m0, m3
+ punpcklwd m9, m1
+ punpckldq m1, m2, m0
+ punpckhdq m2, m0
+ punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+ punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
+ punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+ punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
+ pmaddwd m0, %3
+ pmaddwd m3, %5
+ pmaddwd m1, %7
+ pmaddwd m14, %9
+ paddd m0, m3
+ paddd m1, m14
+ paddd m0, m1
+ mova %1, m0
+ %if ARCH_X86_64
+ SWAP m3, m14
+ %endif
+ punpckldq m0, m8, m9
+ punpckhdq m8, m9
+ punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
+ punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
+ punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
+ punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
+ pmaddwd m1, %4
+ pmaddwd m14, %6
+ pmaddwd m2, %8
+ pmaddwd m15, %10
+ paddd m1, m14
+ paddd m2, m15
+ paddd m1, m2
+ mova %2, m1
+ %if ARCH_X86_64
+ SWAP m14, m3
+ %endif
+%endmacro
+
+%if ARCH_X86_64
+ %define counterd r4d
+%else
+ %if copy_args == 0
+ %define counterd dword r4m
+ %else
+ %define counterd dword [esp+stack_size-4*7]
+ %endif
+%endif
+
+%macro WARP_AFFINE_8X8T 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8t, 6, 14, 16, 0x90, tmp, ts
+%else
+cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts
+ %if copy_args
+ %define tmpm [esp+stack_size-4*1]
+ %define tsm [esp+stack_size-4*2]
+ %endif
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main
+.loop:
+%if ARCH_X86_32
+ %define m12 m4
+ %define m13 m5
+ %define m14 m6
+ %define m15 m7
+ mova m12, [esp+0xC0]
+ mova m13, [esp+0xD0]
+ mova m14, [esp+0xE0]
+ mova m15, [esp+0xF0]
+%endif
+%if cpuflag(ssse3)
+ psrad m12, 13
+ psrad m13, 13
+ psrad m14, 13
+ psrad m15, 13
+ packssdw m12, m13
+ packssdw m14, m15
+ mova m13, [PIC_sym(pw_8192)]
+ pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7
+ pmulhrsw m14, m13
+%else
+ %if ARCH_X86_32
+ %define m10 m0
+ %endif
+ mova m10, [PIC_sym(pd_16384)]
+ paddd m12, m10
+ paddd m13, m10
+ paddd m14, m10
+ paddd m15, m10
+ psrad m12, 15
+ psrad m13, 15
+ psrad m14, 15
+ psrad m15, 15
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ mova [tmpq+tsq*0], m12
+ mova [tmpq+tsq*2], m14
+ dec counterd
+ jz mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).end
+%if ARCH_X86_32
+ mov tmpm, tmpd
+ mov r0, [esp+0x100]
+ mov r1, [esp+0x104]
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main2
+ lea tmpq, [tmpq+tsq*4]
+ jmp .loop
+%endmacro
+
+%macro WARP_AFFINE_8X8 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8, 6, 14, 16, 0x90, \
+ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+ filter, tmp1, delta, my, gamma
+%else
+cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \
+ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+ filter, tmp1, delta, my, gamma
+ %define alphaq r0
+ %define alphad r0
+ %define alpham [esp+gprsize+0x100]
+ %define betaq r1
+ %define betad r1
+ %define betam [esp+gprsize+0x104]
+ %define deltaq r0
+ %define deltad r0
+ %define deltam [esp+gprsize+0x108]
+ %define gammaq r1
+ %define gammad r1
+ %define gammam [esp+gprsize+0x10C]
+ %define filterq r3
+ %define tmp1q r4
+ %define tmp1d r4
+ %define tmp1m [esp+gprsize+0x110]
+ %define myq r5
+ %define myd r5
+ %define mym r6m
+ %if copy_args
+ %define dstm [esp+stack_size-4*1]
+ %define dsm [esp+stack_size-4*2]
+ %define srcm [esp+stack_size-4*3]
+ %define ssm [esp+stack_size-4*4]
+ %define mxm [esp+stack_size-4*5]
+ %define mym [esp+stack_size-4*6]
+ %endif
+%endif
+ call .main
+ jmp .start
+.loop:
+%if ARCH_X86_32
+ mov dstm, dstd
+ mov alphad, [esp+0x100]
+ mov betad, [esp+0x104]
+%endif
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+%if notcpuflag(sse4)
+ %if cpuflag(ssse3)
+ %define roundval pw_8192
+ %else
+ %define roundval pd_262144
+ %endif
+ %if ARCH_X86_64
+ mova m10, [PIC_sym(roundval)]
+ %else
+ %define m10 [PIC_sym(roundval)]
+ %endif
+%endif
+%if ARCH_X86_32
+ %define m12 m5
+ %define m13 m6
+ mova m12, [esp+0xC0]
+ mova m13, [esp+0xD0]
+%endif
+%if cpuflag(sse4)
+ %if ARCH_X86_32
+ %define m11 m4
+ pxor m11, m11
+ %endif
+ psrad m12, 18
+ psrad m13, 18
+ packusdw m12, m13
+ pavgw m12, m11 ; (x + (1 << 10)) >> 11
+%else
+ %if cpuflag(ssse3)
+ psrad m12, 17
+ psrad m13, 17
+ packssdw m12, m13
+ pmulhrsw m12, m10
+ %else
+ paddd m12, m10
+ paddd m13, m10
+ psrad m12, 19
+ psrad m13, 19
+ packssdw m12, m13
+ %endif
+%endif
+%if ARCH_X86_32
+ %define m14 m6
+ %define m15 m7
+ mova m14, [esp+0xE0]
+ mova m15, [esp+0xF0]
+%endif
+%if cpuflag(sse4)
+ psrad m14, 18
+ psrad m15, 18
+ packusdw m14, m15
+ pavgw m14, m11 ; (x + (1 << 10)) >> 11
+%else
+ %if cpuflag(ssse3)
+ psrad m14, 17
+ psrad m15, 17
+ packssdw m14, m15
+ pmulhrsw m14, m10
+ %else
+ paddd m14, m10
+ paddd m15, m10
+ psrad m14, 19
+ psrad m15, 19
+ packssdw m14, m15
+ %endif
+%endif
+ packuswb m12, m14
+ movq [dstq+dsq*0], m12
+ movhps [dstq+dsq*1], m12
+ dec counterd
+ jg .loop
+.end:
+ RET
+ALIGN function_align
+.main:
+%assign stack_offset stack_offset+gprsize
+%if ARCH_X86_32
+ %assign stack_size stack_size+4
+ %if copy_args
+ %assign stack_offset stack_offset-4
+ %endif
+ RELOC_ARGS
+ LEA PIC_reg, $$
+ %define PIC_mem [esp+gprsize+0x114]
+ mov abcdd, abcdm
+ %if copy_args == 0
+ mov ssd, ssm
+ mov mxd, mxm
+ %endif
+ mov PIC_mem, PIC_reg
+ mov srcd, srcm
+%endif
+ movsx deltad, word [abcdq+2*2]
+ movsx gammad, word [abcdq+2*3]
+ lea tmp1d, [deltaq*3]
+ sub gammad, tmp1d ; gamma -= delta*3
+ SAVE_DELTA_GAMMA
+%if ARCH_X86_32
+ mov abcdd, abcdm
+%endif
+ movsx alphad, word [abcdq+2*0]
+ movsx betad, word [abcdq+2*1]
+ lea tmp1q, [ssq*3+3]
+ add mxd, 512+(64<<10)
+ lea tmp2d, [alphaq*3]
+ sub srcq, tmp1q ; src -= src_stride*3 + 3
+%if ARCH_X86_32
+ mov srcm, srcd
+ mov PIC_reg, PIC_mem
+%endif
+ sub betad, tmp2d ; beta -= alpha*3
+ lea filterq, [PIC_sym(mc_warp_filter)]
+%if ARCH_X86_64
+ mov myd, r6m
+ %if cpuflag(ssse3)
+ pxor m11, m11
+ %endif
+%endif
+ call .h
+ psrld m2, m0, 16
+ psrld m3, m1, 16
+%if ARCH_X86_32
+ %if notcpuflag(ssse3)
+ mova [esp+gprsize+0x00], m2
+ %endif
+ mova [esp+gprsize+0x10], m3
+%endif
+ call .h
+ psrld m4, m0, 16
+ psrld m5, m1, 16
+%if ARCH_X86_32
+ mova [esp+gprsize+0x20], m4
+ mova [esp+gprsize+0x30], m5
+%endif
+ call .h
+%if ARCH_X86_64
+ %define blendmask [rsp+gprsize+0x80]
+%else
+ %if notcpuflag(ssse3)
+ mova m2, [esp+gprsize+0x00]
+ %endif
+ mova m3, [esp+gprsize+0x10]
+ %define blendmask [esp+gprsize+0x120]
+ %define m10 m7
+%endif
+ pcmpeqd m10, m10
+ pslld m10, 16
+ mova blendmask, m10
+ BLENDHWDW m2, m0 ; 0
+ BLENDHWDW m3, m1 ; 2
+ mova [rsp+gprsize+0x00], m2
+ mova [rsp+gprsize+0x10], m3
+ call .h
+%if ARCH_X86_32
+ mova m4, [esp+gprsize+0x20]
+ mova m5, [esp+gprsize+0x30]
+%endif
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 1
+ BLENDHWDW m5, m1 ; 3
+ mova [rsp+gprsize+0x20], m4
+ mova [rsp+gprsize+0x30], m5
+ call .h
+%if ARCH_X86_32
+ %if notcpuflag(ssse3)
+ mova m2, [esp+gprsize+0x00]
+ %endif
+ mova m3, [esp+gprsize+0x10]
+ %define m10 m5
+%endif
+ psrld m6, m2, 16
+ psrld m7, m3, 16
+ mova m10, blendmask
+ BLENDHWDW m6, m0 ; 2
+ BLENDHWDW m7, m1 ; 4
+ mova [rsp+gprsize+0x40], m6
+ mova [rsp+gprsize+0x50], m7
+ call .h
+%if ARCH_X86_32
+ mova m4, [esp+gprsize+0x20]
+ mova m5, [esp+gprsize+0x30]
+%endif
+ psrld m2, m4, 16
+ psrld m3, m5, 16
+ mova m10, blendmask
+ BLENDHWDW m2, m0 ; 3
+ BLENDHWDW m3, m1 ; 5
+ mova [rsp+gprsize+0x60], m2
+ mova [rsp+gprsize+0x70], m3
+ call .h
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0x40]
+ mova m7, [esp+gprsize+0x50]
+ %define m10 m7
+%endif
+ psrld m4, m6, 16
+ psrld m5, m7, 16
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 4
+ BLENDHWDW m5, m1 ; 6
+%if ARCH_X86_64
+ add myd, 512+(64<<10)
+ mova m6, m2
+ mova m7, m3
+%else
+ mova [esp+gprsize+0x80], m4
+ mova [esp+gprsize+0x90], m5
+ add dword mym, 512+(64<<10)
+%endif
+ mov counterd, 4
+ SAVE_ALPHA_BETA
+.main2:
+ call .h
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0x60]
+ mova m7, [esp+gprsize+0x70]
+ %define m10 m5
+%endif
+ psrld m6, 16
+ psrld m7, 16
+ mova m10, blendmask
+ BLENDHWDW m6, m0 ; 5
+ BLENDHWDW m7, m1 ; 7
+%if ARCH_X86_64
+ WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
+ m4, m5, \
+ [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
+ m6, m7
+%else
+ mova [esp+gprsize+0xA0], m6
+ mova [esp+gprsize+0xB0], m7
+ LOAD_DELTA_GAMMA_MY
+ WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
+ [esp+gprsize+0x00], [esp+gprsize+0x10], \
+ [esp+gprsize+0x80], [esp+gprsize+0x90], \
+ [esp+gprsize+0x20], [esp+gprsize+0x30], \
+ [esp+gprsize+0xA0], [esp+gprsize+0xB0]
+ LOAD_ALPHA_BETA_MX
+%endif
+ call .h
+ mova m2, [rsp+gprsize+0x40]
+ mova m3, [rsp+gprsize+0x50]
+%if ARCH_X86_32
+ mova m4, [rsp+gprsize+0x80]
+ mova m5, [rsp+gprsize+0x90]
+ %define m10 m7
+%endif
+ mova [rsp+gprsize+0x00], m2
+ mova [rsp+gprsize+0x10], m3
+ mova [rsp+gprsize+0x40], m4
+ mova [rsp+gprsize+0x50], m5
+ psrld m4, 16
+ psrld m5, 16
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 6
+ BLENDHWDW m5, m1 ; 8
+%if ARCH_X86_64
+ WARP_V m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
+ m6, m7, \
+ [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
+ m4, m5
+%else
+ mova [esp+gprsize+0x80], m4
+ mova [esp+gprsize+0x90], m5
+ LOAD_DELTA_GAMMA_MY
+ WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
+ [esp+gprsize+0x20], [esp+gprsize+0x30], \
+ [esp+gprsize+0xA0], [esp+gprsize+0xB0], \
+ [esp+gprsize+0x00], [esp+gprsize+0x10], \
+ [esp+gprsize+0x80], [esp+gprsize+0x90]
+ mov mym, myd
+ mov dstd, dstm
+ mov dsd, dsm
+ mov mxd, mxm
+%endif
+ mova m2, [rsp+gprsize+0x60]
+ mova m3, [rsp+gprsize+0x70]
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0xA0]
+ mova m7, [esp+gprsize+0xB0]
+%endif
+ mova [rsp+gprsize+0x20], m2
+ mova [rsp+gprsize+0x30], m3
+ mova [rsp+gprsize+0x60], m6
+ mova [rsp+gprsize+0x70], m7
+ ret
+ALIGN function_align
+.h:
+%if ARCH_X86_32
+ %define m8 m3
+ %define m9 m4
+ %define m10 m5
+ %define m14 m6
+ %define m15 m7
+%endif
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+%if ARCH_X86_32
+ %assign stack_offset stack_offset+4
+ %assign stack_size stack_size+4
+ %define PIC_mem [esp+gprsize*2+0x114]
+ mov PIC_mem, PIC_reg
+ mov srcd, srcm
+%endif
+ movu m10, [srcq]
+%if ARCH_X86_32
+ add srcd, ssm
+ mov srcm, srcd
+ mov PIC_reg, PIC_mem
+%else
+ add srcq, ssq
+%endif
+ shr mxd, 10
+ shr tmp1d, 10
+ movq m1, [filterq+mxq *8] ; 0 X
+ movq m8, [filterq+tmp1q*8] ; 4 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+alphaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m1, [filterq+tmp2q*8] ; 0 1
+ movhps m8, [filterq+tmp1q*8] ; 4 5
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ shr mxd, 10
+ shr tmp1d, 10
+%if cpuflag(ssse3)
+ movq m14, [filterq+mxq *8] ; 2 X
+ movq m9, [filterq+tmp1q*8] ; 6 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m14, [filterq+tmp2q*8] ; 2 3
+ movhps m9, [filterq+tmp1q*8] ; 6 7
+ pshufb m0, m10, [PIC_sym(warp_8x8_shufA)]
+ pmaddubsw m0, m1
+ pshufb m1, m10, [PIC_sym(warp_8x8_shufB)]
+ pmaddubsw m1, m8
+ pshufb m15, m10, [PIC_sym(warp_8x8_shufC)]
+ pmaddubsw m15, m14
+ pshufb m10, m10, [PIC_sym(warp_8x8_shufD)]
+ pmaddubsw m10, m9
+ phaddw m0, m15
+ phaddw m1, m10
+%else
+ %if ARCH_X86_32
+ %define m11 m2
+ %endif
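+ ; SSE2 fallback: the pshufb-based filtering above is emulated by building
+ ; the shufA-shufD source orderings with shift/pack ops and by replacing
+ ; pmaddubsw/phaddw with pmullw/paddw/packssdw sequences (see the
+ ; '; pmaddubsw' and '; phaddw' notes below).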
+ pcmpeqw m0, m0
+ psrlw m14, m0, 8
+ psrlw m15, m10, 8 ; 01 03 05 07 09 11 13 15
+ pand m14, m10 ; 00 02 04 06 08 10 12 14
+ packuswb m14, m15 ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15
+ psrldq m9, m0, 4
+ pshufd m0, m14, q0220
+ pand m0, m9
+ psrldq m14, 1 ; 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __
+ pslldq m15, m14, 12
+ por m0, m15 ; shufA
+ psrlw m15, m0, 8
+ psraw m11, m1, 8
+ psllw m0, 8
+ psllw m1, 8
+ psrlw m0, 8
+ psraw m1, 8
+ pmullw m15, m11
+ pmullw m0, m1
+ paddw m0, m15 ; pmaddubsw m0, m1
+ pshufd m15, m14, q0220
+ pand m15, m9
+ psrldq m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __
+ pslldq m1, m14, 12
+ por m15, m1 ; shufC
+ pshufd m1, m14, q0220
+ pand m1, m9
+ psrldq m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __
+ pslldq m11, m14, 12
+ por m1, m11 ; shufB
+ pshufd m10, m14, q0220
+ pand m10, m9
+ psrldq m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __
+ pslldq m14, m14, 12
+ por m10, m14 ; shufD
+ psrlw m9, m1, 8
+ psraw m11, m8, 8
+ psllw m1, 8
+ psllw m8, 8
+ psrlw m1, 8
+ psraw m8, 8
+ pmullw m9, m11
+ pmullw m1, m8
+ paddw m1, m9 ; pmaddubsw m1, m8
+ movq m14, [filterq+mxq *8] ; 2 X
+ movq m9, [filterq+tmp1q*8] ; 6 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m14, [filterq+tmp2q*8] ; 2 3
+ movhps m9, [filterq+tmp1q*8] ; 6 7
+ psrlw m8, m15, 8
+ psraw m11, m14, 8
+ psllw m15, 8
+ psllw m14, 8
+ psrlw m15, 8
+ psraw m14, 8
+ pmullw m8, m11
+ pmullw m15, m14
+ paddw m15, m8 ; pmaddubsw m15, m14
+ psrlw m8, m10, 8
+ psraw m11, m9, 8
+ psllw m10, 8
+ psllw m9, 8
+ psrlw m10, 8
+ psraw m9, 8
+ pmullw m8, m11
+ pmullw m10, m9
+ paddw m10, m8 ; pmaddubsw m10, m9
+ pslld m8, m0, 16
+ pslld m9, m1, 16
+ pslld m14, m15, 16
+ pslld m11, m10, 16
+ paddw m0, m8
+ paddw m1, m9
+ paddw m15, m14
+ paddw m10, m11
+ psrad m0, 16
+ psrad m1, 16
+ psrad m15, 16
+ psrad m10, 16
+ packssdw m0, m15 ; phaddw m0, m15
+ packssdw m1, m10 ; phaddw m1, m10
+%endif
+ mova m14, [PIC_sym(pw_8192)]
+ mova m9, [PIC_sym(pd_32768)]
+ pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
+ pmaddwd m1, m14
+ paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword
+ paddd m1, m9
+ ret
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%macro BIDIR_FN 1 ; op
+ %1 0
+ lea stride3q, [strideq*3]
+ jmp wq
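+ ; each .wN case below stores m0 (16 packed pixels produced by %1) and
+ ; loops; %1_INC_PTR advances the tmp (and, for MASK, mask) pointers.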
+.w4_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w4: ; tile 4x
+ movd [dstq ], m0 ; copy dw[0]
+ pshuflw m1, m0, q1032 ; swap dw[1] and dw[0]
+ movd [dstq+strideq*1], m1 ; copy dw[1]
+ punpckhqdq m0, m0 ; swap dw[3,2] with dw[1,0]
+ movd [dstq+strideq*2], m0 ; dw[2]
+ psrlq m0, 32 ; shift right in dw[3]
+ movd [dstq+stride3q ], m0 ; copy dw[3]
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*2]
+.w8:
+ movq [dstq ], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq]
+.w16:
+ mova [dstq ], m0
+ dec hd
+ jg .w16_loop
+ RET
+.w32_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq]
+.w32:
+ mova [dstq ], m0
+ %1 2
+ mova [dstq + 16 ], m0
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ %1_INC_PTR 8
+ %1 0
+ add dstq, strideq
+.w64:
+ %assign i 0
+ %rep 4
+ mova [dstq + i*16 ], m0
+ %assign i i+1
+ %if i < 4
+ %1 2*i
+ %endif
+ %endrep
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ %1_INC_PTR 16
+ %1 0
+ add dstq, strideq
+.w128:
+ %assign i 0
+ %rep 8
+ mova [dstq + i*16 ], m0
+ %assign i i+1
+ %if i < 8
+ %1 2*i
+ %endif
+ %endrep
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
+ ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel
+ mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1
+ paddw m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ paddw m1, [tmp2q+(%1+1)*mmsize]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ packuswb m0, m1 ; pack/trunc 16 bits from m0 & m1 to 8 bit
+%endmacro
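+; m2 is loaded with pw_1024 in the avg entry point below, so the pmulhrsw
+; above evaluates (tmp1 + tmp2 + 16) >> 5 before packing to 8-bit.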
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+ LEA r6, avg_ssse3_table
+ tzcnt wd, wm ; trailing zero count = log2(w)
+ movifnidn hd, hm ; load h from the stack if it is not already in a register
+ movsxd wq, dword [r6+wq*4] ; load the jump table entry for this log2(w)
+ mova m2, [pw_1024+r6-avg_ssse3_table] ; rounding constant for pmulhrsw
+ add wq, r6
+ BIDIR_FN AVG
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
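+ ; below, m4 = weight << 12 (== (weight-16) << 12 as a signed word when
+ ; weight > 7; for weight <= 7 the tmp operands are swapped and m4 negated)
+ ; and m5 = pw_2048, so the final pmulhrsw implements (x + 8) >> 4.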
+ mova m2, [tmp1q+(%1+0)*mmsize]
+ mova m0, m2
+ psubw m2, [tmp2q+(%1+0)*mmsize]
+ mova m3, [tmp1q+(%1+1)*mmsize]
+ mova m1, m3
+ psubw m3, [tmp2q+(%1+1)*mmsize]
+ pmulhw m2, m4
+ pmulhw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+ LEA r6, w_avg_ssse3_table
+ tzcnt wd, wm
+ movd m4, r6m
+ movifnidn hd, hm
+ pxor m0, m0
+ movsxd wq, dword [r6+wq*4]
+ mova m5, [pw_2048+r6-w_avg_ssse3_table]
+ pshufb m4, m0
+ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+ add wq, r6
+ cmp dword r6m, 7
+ jg .weight_gt7
+ mov r6, tmp1q
+ psubw m0, m4
+ mov tmp1q, tmp2q
+ mova m4, m0 ; -weight
+ mov tmp2q, r6
+.weight_gt7:
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
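+ ; below, the product is formed as ((b - a) << 1) * ((-m) << 9), where the
+ ; punpck*bw with zero turns the -m << 1 bytes into -m << 9 words; this
+ ; equals (b - a) * (-m << 10), and m5 = pw_2048 makes the final pmulhrsw
+ ; an (x + 8) >> 4 rounding.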
+ mova m3, [maskq+(%1+0)*(mmsize/2)]
+ mova m0, [tmp2q+(%1+0)*mmsize] ; b
+ psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
+ mova m6, m3 ; m
+ psubb m3, m4, m6 ; -m
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3 ; -m << 1
+ punpcklbw m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16)
+ pmulhw m1, m2 ; (-m * (b - a)) << 10
+ paddw m0, m1 ; + b
+ mova m1, [tmp2q+(%1+1)*mmsize] ; b
+ psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
+ paddw m2, m2 ; (b - a) << 1
+ mova m6, m3 ; (-m << 1)
+ punpckhbw m3, m4, m6 ; (-m << 9)
+ pmulhw m2, m3 ; (-m << 9)
+ paddw m1, m2 ; (-m * (b - a)) << 10
+ pmulhrsw m0, m5 ; round
+ pmulhrsw m1, m5 ; round
+ packuswb m0, m1 ; interleave 16 -> 8
+%endmacro
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*mmsize/2
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+%if ARCH_X86_64
+cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ movifnidn hd, hm
+%else
+cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
+%define hd dword r5m
+%endif
+%define base r6-mask_ssse3_table
+ LEA r6, mask_ssse3_table
+ tzcnt wd, wm
+ movsxd wq, dword [r6+wq*4]
+ pxor m4, m4
+ mova m5, [base+pw_2048]
+ add wq, r6
+ mov maskq, r6m
+ BIDIR_FN MASK
+%undef hd
+
+%macro W_MASK_420_B 2 ; src_offset in bytes, mask_out
+ ;**** do m0 = u16.dst[7..0], m%2 = u16.m[7..0] ****
+ mova m0, [tmp1q+(%1)]
+ mova m1, [tmp2q+(%1)]
+ mova m2, reg_pw_6903
+ psubw m1, m0
+ pabsw m%2, m1 ; abs(tmp1 - tmp2)
+ mova m3, m2
+ psubusw m2, m%2
+ psrlw m2, 8 ; 64 - m
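+ ; pw_6903 == ((64-38) << 8) + 255 - 8, so the saturating subtract plus the
+ ; shift give 64 - m with m = min(38 + ((abs(tmp1-tmp2) + 8) >> 8), 64)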
+ mova m%2, m2
+ psllw m2, 10
+ pmulhw m1, m2 ; (tmp2 - tmp1) * ((64 - m) << 10) >> 16
+ paddw m0, m1 ; tmp1 + ((tmp2 - tmp1) * (64 - m) >> 6)
+ ;**** do m1 = u16.dst[7..0], m%2 = u16.m[7..0] ****
+ mova m1, [tmp1q+(%1)+mmsize]
+ mova m2, [tmp2q+(%1)+mmsize]
+ psubw m2, m1
+ pabsw m7, m2 ; abs(tmp1 - tmp2)
+ psubusw m3, m7
+ psrlw m3, 8 ; 64 - m
+ phaddw m%2, m3 ; sum horizontal pairs of (64 - m); both 8-word runs in one register
+ psllw m3, 10
+ pmulhw m2, m3
+%if ARCH_X86_32
+ mova reg_pw_2048, [base+pw_2048]
+%endif
+ paddw m1, m2
+ pmulhrsw m0, reg_pw_2048 ; round/scale 2048
+ pmulhrsw m1, reg_pw_2048 ; round/scale 2048
+ packuswb m0, m1 ; concat m0 = u8.dst[15..0]
+%endmacro
+
+%macro W_MASK_420 2
+ W_MASK_420_B (%1*16), %2
+%endmacro
+
+%define base r6-w_mask_420_ssse3_table
+%if ARCH_X86_64
+%define reg_pw_6903 m8
+%define reg_pw_2048 m9
+; args: dst, stride, tmp1, tmp2, w, h, mask, sign
+cglobal w_mask_420, 4, 8, 10, dst, stride, tmp1, tmp2, w, h, mask
+ lea r6, [w_mask_420_ssse3_table]
+ mov wd, wm
+ tzcnt r7d, wd
+ movd m0, r7m ; sign
+ movifnidn hd, hm
+ movsxd r7, [r6+r7*4]
+ mova reg_pw_6903, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ mova reg_pw_2048, [base+pw_2048]
+ movd m6, [base+pw_258] ; 64 * 4 + 2
+ add r7, r6
+ mov maskq, maskmp
+ psubw m6, m0
+ pshuflw m6, m6, q0000
+ punpcklqdq m6, m6
+ W_MASK_420 0, 4
+ jmp r7
+ %define loop_w r7d
+%else
+%define reg_pw_6903 [base+pw_6903]
+%define reg_pw_2048 m3
+cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
+ tzcnt wd, wm
+ LEA r6, w_mask_420_ssse3_table
+ movd m0, r7m ; sign
+ mov maskq, r6mp
+ mov wd, [r6+wq*4]
+ movd m6, [base+pw_258]
+ add wq, r6
+ psubw m6, m0
+ pshuflw m6, m6, q0000
+ punpcklqdq m6, m6
+ W_MASK_420 0, 4
+ jmp wd
+ %define loop_w dword r0m
+ %define hd dword r5m
+%endif
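+; The 4:2:0 mask writes below use ((m00+m01+m10+m11) + 2 - sign) >> 2,
+; computed as (m6 - sum of the stored 64-m values) >> 2: m6 holds
+; 64*4 + 2 - sign and W_MASK_420 leaves 64-m, already summed in horizontal
+; pairs, in the mask register.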
+.w4_loop:
+ add tmp1q, 2*16
+ add tmp2q, 2*16
+ W_MASK_420 0, 4
+ lea dstq, [dstq+strideq*2]
+ add maskq, 4
+.w4:
+ movd [dstq ], m0 ; copy m0[0]
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1 ; copy m0[1]
+ lea dstq, [dstq+strideq*2]
+ punpckhqdq m0, m0
+ movd [dstq+strideq*0], m0 ; copy m0[2]
+ psrlq m0, 32
+ movd [dstq+strideq*1], m0 ; copy m0[3]
+ psubw m1, m6, m4 ; a _ c _
+ psrlq m4, 32 ; b _ d _
+ psubw m1, m4
+ psrlw m1, 2
+ packuswb m1, m1
+ pshuflw m1, m1, q2020
+ movd [maskq], m1
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ add tmp1q, 2*16
+ add tmp2q, 2*16
+ W_MASK_420 0, 4
+ lea dstq, [dstq+strideq*2]
+ add maskq, 4
+.w8:
+ movq [dstq ], m0
+ movhps [dstq+strideq*1], m0
+ psubw m0, m6, m4
+ punpckhqdq m4, m4
+ psubw m0, m4
+ psrlw m0, 2
+ packuswb m0, m0
+ movd [maskq], m0
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16: ; w32/64/128
+%if ARCH_X86_32
+ mov wd, wm ; because we altered it in 32bit setup
+%endif
+ mov loop_w, wd ; use width as counter
+ jmp .w16ge_inner_loop_first
+.w16ge_loop:
+ lea tmp1q, [tmp1q+wq*2] ; skip even line pixels
+ lea tmp2q, [tmp2q+wq*2] ; skip even line pixels
+ sub dstq, wq
+ mov loop_w, wd
+ lea dstq, [dstq+strideq*2]
+.w16ge_inner_loop:
+ W_MASK_420_B 0, 4
+.w16ge_inner_loop_first:
+ mova [dstq ], m0
+ W_MASK_420_B wq*2, 5 ; load matching even line (offset = widthpx * (16+16))
+ mova [dstq+strideq*1], m0
+ psubw m1, m6, m4 ; m6 == 64 * 4 + 2 - sign
+ psubw m1, m5 ; - odd line mask
+ psrlw m1, 2 ; >> 2
+ packuswb m1, m1
+ movq [maskq], m1
+ add tmp1q, 2*16
+ add tmp2q, 2*16
+ add maskq, 8
+ add dstq, 16
+ sub loop_w, 16
+ jg .w16ge_inner_loop
+ sub hd, 2
+ jg .w16ge_loop
+ RET
+
+%undef reg_pw_6903
+%undef reg_pw_2048
+%undef dst_bak
+%undef loop_w
+%undef orig_w
+%undef hd
+
+%macro BLEND_64M 4; a, b, mask1, mask2
+ punpcklbw m0, %1, %2; {b;a}[7..0]
+ punpckhbw %1, %2 ; {b;a}[15..8]
+ pmaddubsw m0, %3 ; {b*m[0] + (64-m[0])*a}[7..0] u16
+ pmaddubsw %1, %4 ; {b*m[1] + (64-m[1])*a}[15..8] u16
+ pmulhrsw m0, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
+ pmulhrsw %1, m5 ; {((b*m[1] + (64-m[0])*a) + 1) / 32}[15..8] u16
+ packuswb m0, %1 ; {blendpx}[15..0] u8
+%endmacro
+
+%macro BLEND 2; a, b
+ psubb m3, m4, m0 ; m3 = (64 - m)
+ punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0]
+ punpckhbw m3, m0 ; {m;(64-m)}[15..8]
+ BLEND_64M %1, %2, m2, m3
+%endmacro
+
+cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_ssse3_table
+ LEA r6, blend_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movifnidn maskq, maskmp
+ movsxd wq, dword [r6+wq*4]
+ mova m4, [base+pb_64]
+ mova m5, [base+pw_512]
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ movq m0, [maskq]; m
+ movd m1, [dstq+dsq*0] ; a
+ movd m6, [dstq+dsq*1]
+ punpckldq m1, m6
+ movq m6, [tmpq] ; b
+ psubb m3, m4, m0 ; m3 = (64 - m)
+ punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0]
+ punpcklbw m1, m6 ; {b;a}[7..0]
+ pmaddubsw m1, m2 ; {b*m[0] + (64-m[0])*a}[7..0] u16
+ pmulhrsw m1, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
+ packuswb m1, m0 ; {blendpx}[15..0] u8
+ movd [dstq+dsq*0], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ add maskq, 8
+ add tmpq, 8
+ lea dstq, [dstq+dsq*2] ; dst_stride * 2
+ sub hd, 2
+ jg .w4
+ RET
+.w8:
+ mova m0, [maskq]; m
+ movq m1, [dstq+dsq*0] ; a
+ movhps m1, [dstq+dsq*1]
+ mova m6, [tmpq] ; b
+ BLEND m1, m6
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ add maskq, 16
+ add tmpq, 16
+ lea dstq, [dstq+dsq*2] ; dst_stride * 2
+ sub hd, 2
+ jg .w8
+ RET
+.w16:
+ mova m0, [maskq]; m
+ mova m1, [dstq] ; a
+ mova m6, [tmpq] ; b
+ BLEND m1, m6
+ mova [dstq], m0
+ add maskq, 16
+ add tmpq, 16
+ add dstq, dsq ; dst_stride
+ dec hd
+ jg .w16
+ RET
+.w32:
+ %assign i 0
+ %rep 2
+ mova m0, [maskq+16*i]; m
+ mova m1, [dstq+16*i] ; a
+ mova m6, [tmpq+16*i] ; b
+ BLEND m1, m6
+ mova [dstq+i*16], m0
+ %assign i i+1
+ %endrep
+ add maskq, 32
+ add tmpq, 32
+ add dstq, dsq ; dst_stride
+ dec hd
+ jg .w32
+ RET
+
+cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_ssse3_table
+ LEA r5, blend_v_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r5+wq*4]
+ mova m5, [base+pw_512]
+ add wq, r5
+ add maskq, obmc_masks-blend_v_ssse3_table
+ jmp wq
+.w2:
+ movd m3, [maskq+4]
+ punpckldq m3, m3
+ ; 2 mask blend is provided for 4 pixels / 2 lines
+.w2_loop:
+ movd m1, [dstq+dsq*0] ; a {..;a;a}
+ pinsrw m1, [dstq+dsq*1], 1
+ movd m2, [tmpq] ; b
+ punpcklbw m0, m1, m2; {b;a}[7..0]
+ pmaddubsw m0, m3 ; {b*m + (64-m)*a}[7..0] u16
+ pmulhrsw m0, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
+ packuswb m0, m1 ; {blendpx}[8..0] u8
+ movd r3d, m0
+ mov [dstq+dsq*0], r3w
+ shr r3d, 16
+ mov [dstq+dsq*1], r3w
+ add tmpq, 2*2
+ lea dstq, [dstq + dsq * 2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ movddup m3, [maskq+8]
+ ; 4 mask blend is provided for 8 pixels / 2 lines
+.w4_loop:
+ movd m1, [dstq+dsq*0] ; a
+ movd m2, [dstq+dsq*1] ;
+ punpckldq m1, m2
+ movq m2, [tmpq] ; b
+ punpcklbw m1, m2 ; {b;a}[7..0]
+ pmaddubsw m1, m3 ; {b*m + (64-m)*a}[7..0] u16
+ pmulhrsw m1, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
+ packuswb m1, m1 ; {blendpx}[8..0] u8
+ movd [dstq], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ add tmpq, 2*4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ mova m3, [maskq+16]
+ ; 8 mask blend is provided for 16 pixels
+.w8_loop:
+ movq m1, [dstq+dsq*0] ; a
+ movhps m1, [dstq+dsq*1]
+ mova m2, [tmpq]; b
+ BLEND_64M m1, m2, m3, m3
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ add tmpq, 16
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ ; 16 mask blend is provided for 32 pixels
+ mova m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
+ mova m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
+.w16_loop:
+ mova m1, [dstq] ; a
+ mova m2, [tmpq] ; b
+ BLEND_64M m1, m2, m3, m4
+ mova [dstq], m0
+ add tmpq, 16
+ add dstq, dsq
+ dec hd
+ jg .w16_loop
+ RET
+.w32:
+%if WIN64
+ mova [rsp+8], xmm6
+%endif
+ mova m3, [maskq+64] ; obmc_masks_32[0] (64-m[0])
+ mova m4, [maskq+80] ; obmc_masks_32[1] (64-m[1])
+ mova m6, [maskq+96] ; obmc_masks_32[2] (64-m[2])
+ ; 16 mask blend is provided for 64 pixels
+.w32_loop:
+ mova m1, [dstq+16*0] ; a
+ mova m2, [tmpq+16*0] ; b
+ BLEND_64M m1, m2, m3, m4
+ movq m1, [dstq+16*1] ; a
+ punpcklbw m1, [tmpq+16*1] ; b
+ pmaddubsw m1, m6
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ mova [dstq+16*0], m0
+ movq [dstq+16*1], m1
+ add tmpq, 32
+ add dstq, dsq
+ dec hd
+ jg .w32_loop
+%if WIN64
+ mova xmm6, [rsp+8]
+%endif
+ RET
+
+cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base t0-blend_h_ssse3_table
+%if ARCH_X86_32
+ ; We need to keep the PIC pointer for w4, reload wd from stack instead
+ DECLARE_REG_TMP 6
+%else
+ DECLARE_REG_TMP 5
+ mov r6d, wd
+%endif
+ LEA t0, blend_h_ssse3_table
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, dword [t0+wq*4]
+ mova m5, [base+pw_512]
+ add wq, t0
+ lea maskq, [base+obmc_masks+hq*2]
+ lea hd, [hq*3]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
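+; Only the top h*3/4 rows are blended: hq now counts from -(h*3/4) up to 0
+; and each row's mask entry is fetched from maskq+hq*2.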
+.w2:
+ movd m0, [dstq+dsq*0]
+ pinsrw m0, [dstq+dsq*1], 1
+ movd m2, [maskq+hq*2]
+ movd m1, [tmpq]
+ punpcklwd m2, m2
+ punpcklbw m0, m1
+ pmaddubsw m0, m2
+ pmulhrsw m0, m5
+ packuswb m0, m0
+ movd r3d, m0
+ mov [dstq+dsq*0], r3w
+ shr r3d, 16
+ mov [dstq+dsq*1], r3w
+ lea dstq, [dstq+dsq*2]
+ add tmpq, 2*2
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+%if ARCH_X86_32
+ mova m3, [base+blend_shuf]
+%else
+ mova m3, [blend_shuf]
+%endif
+.w4_loop:
+ movd m0, [dstq+dsq*0]
+ movd m2, [dstq+dsq*1]
+ punpckldq m0, m2 ; a
+ movq m1, [tmpq] ; b
+ movq m2, [maskq+hq*2] ; m
+ pshufb m2, m3
+ punpcklbw m0, m1
+ pmaddubsw m0, m2
+ pmulhrsw m0, m5
+ packuswb m0, m0
+ movd [dstq+dsq*0], m0
+ psrlq m0, 32
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add tmpq, 4*2
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ movd m4, [maskq+hq*2]
+ punpcklwd m4, m4
+ pshufd m3, m4, q0000
+ pshufd m4, m4, q1111
+ movq m1, [dstq+dsq*0] ; a
+ movhps m1, [dstq+dsq*1]
+ mova m2, [tmpq]
+ BLEND_64M m1, m2, m3, m4
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add tmpq, 8*2
+ add hq, 2
+ jl .w8
+ RET
+; w16/w32/w64/w128
+.w16:
+%if ARCH_X86_32
+ mov r6d, wm
+%endif
+ sub dsq, r6
+.w16_loop0:
+ movd m3, [maskq+hq*2]
+ pshuflw m3, m3, q0000
+ punpcklqdq m3, m3
+ mov wd, r6d
+.w16_loop:
+ mova m1, [dstq] ; a
+ mova m2, [tmpq] ; b
+ BLEND_64M m1, m2, m3, m3
+ mova [dstq], m0
+ add dstq, 16
+ add tmpq, 16
+ sub wd, 16
+ jg .w16_loop
+ add dstq, dsq
+ inc hq
+ jl .w16_loop0
+ RET
+
+; emu_edge args:
+; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
+; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
+; const pixel *ref, const ptrdiff_t ref_stride
+;
+; bw, bh total filled size
+; iw, ih, copied block -> fill bottom, right
+; x, y, offset in bw/bh -> fill top, left
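+; The in-bounds block is copied first with clamped source coordinates
+; (v_loop below, optionally replicating the left/right columns), then the
+; bottom and top rows are replicated into the remaining area.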
+cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
+ y, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+ pxor m1, m1
+
+%if ARCH_X86_64
+ %define reg_zero r12q
+ %define reg_tmp r10
+ %define reg_src srcq
+ %define reg_bottomext bottomextq
+ %define reg_rightext rightextq
+ %define reg_blkm r9m
+%else
+ %define reg_zero r6
+ %define reg_tmp r0
+ %define reg_src r1
+ %define reg_bottomext r0
+ %define reg_rightext r1
+ %define reg_blkm r2m
+%endif
+ ;
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor reg_zero, reg_zero
+ lea reg_tmp, [ihq-1]
+ cmp yq, ihq
+ cmovs reg_tmp, yq
+ test yq, yq
+ cmovs reg_tmp, reg_zero
+%if ARCH_X86_64
+ imul reg_tmp, sstrideq
+ add srcq, reg_tmp
+%else
+ imul reg_tmp, sstridem
+ mov reg_src, srcm
+ add reg_src, reg_tmp
+%endif
+ ;
+ ; ref += iclip(x, 0, iw - 1)
+ lea reg_tmp, [iwq-1]
+ cmp xq, iwq
+ cmovs reg_tmp, xq
+ test xq, xq
+ cmovs reg_tmp, reg_zero
+ add reg_src, reg_tmp
+%if ARCH_X86_32
+ mov srcm, reg_src
+%endif
+ ;
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+%if ARCH_X86_32
+ mov r1, r1m ; restore bh
+%endif
+ lea reg_bottomext, [yq+bhq]
+ sub reg_bottomext, ihq
+ lea r3, [bhq-1]
+ cmovs reg_bottomext, reg_zero
+ ;
+
+ DEFINE_ARGS bw, bh, iw, ih, x, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, reg_zero
+ cmp reg_bottomext, bhq
+ cmovns reg_bottomext, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+ %if ARCH_X86_32
+ mov r4m, reg_bottomext
+ ;
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ mov r0, r0m ; restore bw
+ %endif
+ lea reg_rightext, [xq+bwq]
+ sub reg_rightext, iwq
+ lea r2, [bwq-1]
+ cmovs reg_rightext, reg_zero
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, reg_zero
+ cmp reg_rightext, bwq
+ cmovns reg_rightext, r2
+ %if ARCH_X86_32
+ mov r3m, r1
+ %endif
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+%undef reg_zero
+%undef reg_tmp
+%undef reg_src
+%undef reg_bottomext
+%undef reg_rightext
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; center_h = bh - top_ext - bottom_ext
+%if ARCH_X86_64
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+%else
+ mov r1, centerhm ; restore r1
+ sub centerhq, topextq
+ sub centerhq, r4m
+ mov r1m, centerhq
+%endif
+ ;
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+%if ARCH_X86_64
+ imul r2, dstrideq
+%else
+ mov r6, r6m ; restore dstq
+ imul r2, dstridem
+%endif
+ add dstq, r2
+ mov reg_blkm, dstq ; save pointer for ext
+ ;
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+%if ARCH_X86_64
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+%else
+ sub centerwq, r3m
+ sub centerwq, leftextq
+%endif
+
+; v_loop macro
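+; Copies centerh rows of centerw bytes each from src to dst; when requested,
+; the first/last source byte of a row is broadcast (pshufb with m1 = 0) across
+; the left/right extension area. The copy loops step by mmsize, so they may
+; overwrite a few bytes past each region (see the stride assumption above).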
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+ %if ARCH_X86_64
+ %define reg_tmp r12
+ %else
+ %define reg_tmp r0
+ %endif
+.v_loop_%3:
+ %if ARCH_X86_32
+ mov r0, r0m
+ mov r1, r1m
+ %endif
+%if %1
+ ; left extension
+ %if ARCH_X86_64
+ movd m0, [srcq]
+ %else
+ mov r3, srcm
+ movd m0, [r3]
+ %endif
+ pshufb m0, m1
+ xor r3, r3
+.left_loop_%3:
+ mova [dstq+r3], m0
+ add r3, mmsize
+ cmp r3, leftextq
+ jl .left_loop_%3
+ ; body
+ lea reg_tmp, [dstq+leftextq]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ %if ARCH_X86_64
+ movu m0, [srcq+r3]
+ %else
+ mov r1, srcm
+ movu m0, [r1+r3]
+ %endif
+%if %1
+ movu [reg_tmp+r3], m0
+%else
+ movu [dstq+r3], m0
+%endif
+ add r3, mmsize
+ cmp r3, centerwq
+ jl .body_loop_%3
+%if %2
+ ; right extension
+%if %1
+ add reg_tmp, centerwq
+%else
+ lea reg_tmp, [dstq+centerwq]
+%endif
+ %if ARCH_X86_64
+ movd m0, [srcq+centerwq-1]
+ %else
+ mov r3, srcm
+ movd m0, [r3+centerwq-1]
+ %endif
+ pshufb m0, m1
+ xor r3, r3
+.right_loop_%3:
+ movu [reg_tmp+r3], m0
+ add r3, mmsize
+ %if ARCH_X86_64
+ cmp r3, rightextq
+ %else
+ cmp r3, r3m
+ %endif
+ jl .right_loop_%3
+%endif
+ %if ARCH_X86_64
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+ %else
+ add dstq, dstridem
+ mov r0, sstridem
+ add srcm, r0
+ sub dword centerhm, 1
+ jg .v_loop_%3
+ mov r0, r0m ; restore r0
+ %endif
+%endmacro ; v_loop
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ jnz .need_right_ext
+ %else
+ cmp leftextq, r3m ; leftextq == 0 here, so this tests rightext (r3m)
+ jne .need_right_ext
+ %endif
+ v_loop 0, 0, 0
+ jmp .body_done
+
+ ; left/right extension cases
+.need_left_ext:
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ %else
+ mov r3, r3m
+ test r3, r3
+ %endif
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+; register usage for the edge loops below:
+; r0: bw
+; r1: x loop counter
+; r4: y loop counter
+; r5: topextq
+; r6: dstq
+; r7: dstrideq
+; r8: srcq
+%if ARCH_X86_64
+ %define reg_dstride dstrideq
+%else
+ %define reg_dstride r2
+%endif
+ ;
+ ; bottom edge extension
+ %if ARCH_X86_64
+ test bottomextq, bottomextq
+ jz .top
+ %else
+ xor r1, r1
+ cmp r1, r4m
+ je .top
+ %endif
+ ;
+ %if ARCH_X86_64
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+ %else
+ mov r3, dstq
+ mov reg_dstride, dstridem
+ sub r3, reg_dstride
+ mov srcm, r3
+ %endif
+ ;
+.bottom_x_loop:
+ %if ARCH_X86_64
+ mova m0, [srcq+r1]
+ lea r3, [dstq+r1]
+ mov r4, bottomextq
+ %else
+ mov r3, srcm
+ mova m0, [r3+r1]
+ lea r3, [dstq+r1]
+ mov r4, r4m
+ %endif
+ ;
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .bottom_y_loop
+ add r1, mmsize
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+%if ARCH_X86_64
+ mov srcq, reg_blkm
+%else
+ mov r3, reg_blkm
+ mov reg_dstride, dstridem
+%endif
+ mov dstq, dstm
+ xor r1, r1
+ ;
+.top_x_loop:
+%if ARCH_X86_64
+ mova m0, [srcq+r1]
+%else
+ mov r3, reg_blkm
+ mova m0, [r3+r1]
+%endif
+ lea r3, [dstq+r1]
+ mov r4, topextq
+ ;
+.top_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .top_y_loop
+ add r1, mmsize
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+%undef reg_dstride
+%undef reg_blkm
+%undef reg_tmp
+
+cextern resize_filter
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
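+; SCRATCH %1, %2, %3: on x86-32 (only 8 XMM registers available) spill m%1 to
+; stack slot %3 and alias m%2 to that memory location; on x86-64 just rename
+; the register via SWAP.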
+
+%if ARCH_X86_64
+cglobal resize, 0, 14, 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%elif STACK_ALIGNMENT >= 16
+cglobal resize, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%else
+cglobal resize, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%endif
+ movifnidn dstq, dstmp
+ movifnidn srcq, srcmp
+%if STACK_ALIGNMENT >= 16
+ movifnidn dst_wd, dst_wm
+%endif
+%if ARCH_X86_64
+ movifnidn hd, hm
+%endif
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ movd m7, dxm
+ movd m6, mx0m
+ movd m5, src_wm
+ pshufd m7, m7, q0000
+ pshufd m6, m6, q0000
+ pshufd m5, m5, q0000
+
+%if ARCH_X86_64
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
+ LEA r7, $$
+%define base r7-$$
+%else
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
+%if STACK_ALIGNMENT >= 16
+ LEA r6, $$
+%define base r6-$$
+%else
+ LEA r4, $$
+%define base r4-$$
+%endif
+%endif
+
+%if ARCH_X86_64
+ mova m12, [base+pw_m256]
+ mova m11, [base+pd_63]
+ mova m10, [base+pb_8x0_8x8]
+%else
+%define m12 [base+pw_m256]
+%define m11 [base+pd_63]
+%define m10 [base+pb_8x0_8x8]
+%endif
+ pmaddwd m4, m7, [base+resize_mul] ; dx*[0,1,2,3]
+ pslld m7, 2 ; dx*4
+ pslld m5, 14
+ paddd m6, m4 ; mx+[0..3]*dx
+ SCRATCH 7, 15, 0
+ SCRATCH 6, 14, 1
+ SCRATCH 5, 13, 2
+
+ ; m10 = pb_8x0_8x8, m11 = 0x3f, m12 = pmulhrsw constant for x=(x+64)>>7
+ ; m13 = (src_w-8)<<14, m14 = mx+[0..3]*dx, m15 = dx*4
+ ; (m10-m15 are memory operands on x86-32)
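+ ;
+ ; Rough per-pixel scalar sketch (illustration only; positions are in 1/2^14
+ ; units, 64 filter phases of 8 taps, matching the masks/shifts used below):
+ ;   pos    = mx0 - (4 << 14) + x*dx
+ ;   src_x  = iclip(pos >> 14, 0, src_w - 8) ; plus a pshufb-based edge fixup
+ ;   f      = resize_filter[(pos >> 8) & 63]
+ ;   dst[x] = clip_pixel((f[0..7] dot src[src_x..src_x+7] + 64) >> 7)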
+
+.loop_y:
+ xor xd, xd
+ mova m0, m14 ; per-line working version of mx
+
+.loop_x:
+ pxor m1, m1
+ pcmpgtd m1, m0
+ pandn m1, m0
+ psrad m2, m0, 8 ; filter offset (unmasked)
+ pcmpgtd m3, m13, m1
+ pand m1, m3
+ pandn m3, m13
+ por m1, m3
+ psubd m3, m0, m1 ; pshufb offset
+ psrad m1, 14 ; clipped src_x offset
+ psrad m3, 14 ; pshufb edge_emu offset
+ pand m2, m11 ; filter offset (masked)
+
+ ; load source pixels
+%if ARCH_X86_64
+ movd r8d, xm1
+ pshuflw xm1, xm1, q3232
+ movd r9d, xm1
+ punpckhqdq xm1, xm1
+ movd r10d, xm1
+ psrlq xm1, 32
+ movd r11d, xm1
+ movq xm4, [srcq+r8]
+ movq xm5, [srcq+r10]
+ movhps xm4, [srcq+r9]
+ movhps xm5, [srcq+r11]
+%else
+ movd r3d, xm1
+ pshufd xm1, xm1, q3312
+ movd r1d, xm1
+ pshuflw xm1, xm1, q3232
+ movq xm4, [srcq+r3]
+ movq xm5, [srcq+r1]
+ movd r3d, xm1
+ punpckhqdq xm1, xm1
+ movd r1d, xm1
+ movhps xm4, [srcq+r3]
+ movhps xm5, [srcq+r1]
+%endif
+
+ ; if no emulation is required, we don't need to shuffle or emulate edges
+ ; this also saves 2 quasi-vpgatherdqs
+ pxor m6, m6
+ pcmpeqb m6, m3
+%if ARCH_X86_64
+ pmovmskb r8d, m6
+ cmp r8d, 0xffff
+%else
+ pmovmskb r3d, m6
+ cmp r3d, 0xffff
+%endif
+ je .filter
+
+%if ARCH_X86_64
+ movd r8d, xm3
+ pshuflw xm3, xm3, q3232
+ movd r9d, xm3
+ punpckhqdq xm3, xm3
+ movd r10d, xm3
+ psrlq xm3, 32
+ movd r11d, xm3
+ movsxd r8, r8d
+ movsxd r9, r9d
+ movsxd r10, r10d
+ movsxd r11, r11d
+ movq xm6, [base+resize_shuf+4+r8]
+ movq xm7, [base+resize_shuf+4+r10]
+ movhps xm6, [base+resize_shuf+4+r9]
+ movhps xm7, [base+resize_shuf+4+r11]
+%else
+ movd r3d, xm3
+ pshufd xm3, xm3, q3312
+ movd r1d, xm3
+ pshuflw xm3, xm3, q3232
+ movq xm6, [base+resize_shuf+4+r3]
+ movq xm7, [base+resize_shuf+4+r1]
+ movd r3d, xm3
+ punpckhqdq xm3, xm3
+ movd r1d, xm3
+ movhps xm6, [base+resize_shuf+4+r3]
+ movhps xm7, [base+resize_shuf+4+r1]
+%endif
+
+ paddb m6, m10
+ paddb m7, m10
+ pshufb m4, m6
+ pshufb m5, m7
+
+.filter:
+%if ARCH_X86_64
+ movd r8d, xm2
+ pshuflw xm2, xm2, q3232
+ movd r9d, xm2
+ punpckhqdq xm2, xm2
+ movd r10d, xm2
+ psrlq xm2, 32
+ movd r11d, xm2
+ movq xm6, [base+resize_filter+r8*8]
+ movq xm7, [base+resize_filter+r10*8]
+ movhps xm6, [base+resize_filter+r9*8]
+ movhps xm7, [base+resize_filter+r11*8]
+%else
+ movd r3d, xm2
+ pshufd xm2, xm2, q3312
+ movd r1d, xm2
+ pshuflw xm2, xm2, q3232
+ movq xm6, [base+resize_filter+r3*8]
+ movq xm7, [base+resize_filter+r1*8]
+ movd r3d, xm2
+ punpckhqdq xm2, xm2
+ movd r1d, xm2
+ movhps xm6, [base+resize_filter+r3*8]
+ movhps xm7, [base+resize_filter+r1*8]
+%endif
+
+ pmaddubsw m4, m6
+ pmaddubsw m5, m7
+ phaddw m4, m5
+ phaddsw m4, m4
+ pmulhrsw m4, m12 ; x=(x+64)>>7
+ packuswb m4, m4
+ movd [dstq+xq], m4
+
+ paddd m0, m15
+ add xd, 4
+%if STACK_ALIGNMENT >= 16
+ cmp xd, dst_wd
+%else
+ cmp xd, dst_wm
+%endif
+ jl .loop_x
+
+%if ARCH_X86_64
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+%else
+ add dstq, dst_stridem
+ add srcq, src_stridem
+ dec dword r5m
+%endif
+ jg .loop_y
+ RET
+
+INIT_XMM ssse3
+PREP_BILIN
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM sse4
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM sse2
+PREP_BILIN
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
--- a/src/x86/mc_ssse3.asm
+++ /dev/null
@@ -1,5304 +1,0 @@
-; Copyright © 2018, VideoLAN and dav1d authors
-; Copyright © 2018, Two Orioles, LLC
-; Copyright © 2018, VideoLabs
-; All rights reserved.
-;
-; Redistribution and use in source and binary forms, with or without
-; modification, are permitted provided that the following conditions are met:
-;
-; 1. Redistributions of source code must retain the above copyright notice, this
-; list of conditions and the following disclaimer.
-;
-; 2. Redistributions in binary form must reproduce the above copyright notice,
-; this list of conditions and the following disclaimer in the documentation
-; and/or other materials provided with the distribution.
-;
-; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-%include "ext/x86/x86inc.asm"
-
-SECTION_RODATA 16
-
-; dav1d_obmc_masks[] with 64-x interleaved
-obmc_masks: db 0, 0, 0, 0
- ; 2 @4
- db 45, 19, 64, 0
- ; 4 @8
- db 39, 25, 50, 14, 59, 5, 64, 0
- ; 8 @16
- db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
- ; 16 @32
- db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
- db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
- ; 32 @64
- db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
- db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
- db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
-
-warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
-warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
-warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
-warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
-blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
-subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
- db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
-subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
-bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
-
-pb_8x0_8x8: times 8 db 0
- times 8 db 8
-resize_mul: dd 0, 1, 2, 3
-resize_shuf: times 5 db 0
- db 1, 2, 3, 4, 5, 6
- times 5+16 db 7
-
-pb_64: times 16 db 64
-pw_m256: times 8 dw -256
-pw_8: times 8 dw 8
-pw_26: times 8 dw 26
-pw_34: times 8 dw 34
-pw_512: times 8 dw 512
-pw_1024: times 8 dw 1024
-pw_2048: times 8 dw 2048
-pw_6903: times 8 dw 6903
-pw_8192: times 8 dw 8192
-pd_32: times 4 dd 32
-pd_63: times 4 dd 63
-pd_512: times 4 dd 512
-pd_16384: times 4 dd 16484
-pd_32768: times 4 dd 32768
-pd_262144:times 4 dd 262144
-
-pw_258: times 2 dw 258
-
-cextern mc_subpel_filters
-%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
-
-%macro BIDIR_JMP_TABLE 1-*
- ;evaluated at definition time (in loop below)
- %xdefine %1_table (%%table - 2*%2)
- %xdefine %%base %1_table
- %xdefine %%prefix mangle(private_prefix %+ _%1)
- ; dynamically generated label
- %%table:
- %rep %0 - 1 ; repeat for num args
- dd %%prefix %+ .w%2 - %%base
- %rotate 1
- %endrep
-%endmacro
-
-BIDIR_JMP_TABLE avg_ssse3, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_avg_ssse3, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE mask_ssse3, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_mask_420_ssse3, 4, 8, 16, 16, 16, 16
-BIDIR_JMP_TABLE blend_ssse3, 4, 8, 16, 32
-BIDIR_JMP_TABLE blend_v_ssse3, 2, 4, 8, 16, 32
-BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16
-
-%macro BASE_JMP_TABLE 3-*
- %xdefine %1_%2_table (%%table - %3)
- %xdefine %%base %1_%2
- %%table:
- %rep %0 - 2
- dw %%base %+ _w%3 - %%base
- %rotate 1
- %endrep
-%endmacro
-
-%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put)
-%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_ssse3.prep)
-
-BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
-BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
-
-%macro HV_JMP_TABLE 5-*
- %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
- %xdefine %%base %1_%3
- %assign %%types %4
- %if %%types & 1
- %xdefine %1_%2_h_%3_table (%%h - %5)
- %%h:
- %rep %0 - 4
- dw %%prefix %+ .h_w%5 - %%base
- %rotate 1
- %endrep
- %rotate 4
- %endif
- %if %%types & 2
- %xdefine %1_%2_v_%3_table (%%v - %5)
- %%v:
- %rep %0 - 4
- dw %%prefix %+ .v_w%5 - %%base
- %rotate 1
- %endrep
- %rotate 4
- %endif
- %if %%types & 4
- %xdefine %1_%2_hv_%3_table (%%hv - %5)
- %%hv:
- %rep %0 - 4
- dw %%prefix %+ .hv_w%5 - %%base
- %rotate 1
- %endrep
- %endif
-%endmacro
-
-HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128
-
-%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
-
-cextern mc_warp_filter
-
-SECTION .text
-
-INIT_XMM ssse3
-
-%if ARCH_X86_32
- DECLARE_REG_TMP 1
- %define base t0-put_ssse3
-%else
- DECLARE_REG_TMP 7
- %define base 0
-%endif
-;
-%macro RESTORE_DSQ_32 1
- %if ARCH_X86_32
- mov %1, dsm ; restore dsq
- %endif
-%endmacro
-;
-cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
- movifnidn mxyd, r6m ; mx
- LEA t0, put_ssse3
- tzcnt wd, wm
- mov hd, hm
- test mxyd, mxyd
- jnz .h
- mov mxyd, r7m ; my
- test mxyd, mxyd
- jnz .v
-.put:
- movzx wd, word [t0+wq*2+table_offset(put,)]
- add wq, t0
- RESTORE_DSQ_32 t0
- jmp wq
-.put_w2:
- movzx r4d, word [srcq+ssq*0]
- movzx r6d, word [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- mov [dstq+dsq*0], r4w
- mov [dstq+dsq*1], r6w
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .put_w2
- RET
-.put_w4:
- mov r4d, [srcq+ssq*0]
- mov r6d, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- mov [dstq+dsq*0], r4d
- mov [dstq+dsq*1], r6d
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .put_w4
- RET
-.put_w8:
- movq m0, [srcq+ssq*0]
- movq m1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- movq [dstq+dsq*0], m0
- movq [dstq+dsq*1], m1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .put_w8
- RET
-.put_w16:
- movu m0, [srcq+ssq*0]
- movu m1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- mova [dstq+dsq*0], m0
- mova [dstq+dsq*1], m1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .put_w16
- RET
-.put_w32:
- movu m0, [srcq+ssq*0+16*0]
- movu m1, [srcq+ssq*0+16*1]
- movu m2, [srcq+ssq*1+16*0]
- movu m3, [srcq+ssq*1+16*1]
- lea srcq, [srcq+ssq*2]
- mova [dstq+dsq*0+16*0], m0
- mova [dstq+dsq*0+16*1], m1
- mova [dstq+dsq*1+16*0], m2
- mova [dstq+dsq*1+16*1], m3
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .put_w32
- RET
-.put_w64:
- movu m0, [srcq+16*0]
- movu m1, [srcq+16*1]
- movu m2, [srcq+16*2]
- movu m3, [srcq+16*3]
- add srcq, ssq
- mova [dstq+16*0], m0
- mova [dstq+16*1], m1
- mova [dstq+16*2], m2
- mova [dstq+16*3], m3
- add dstq, dsq
- dec hd
- jg .put_w64
- RET
-.put_w128:
- movu m0, [srcq+16*0]
- movu m1, [srcq+16*1]
- movu m2, [srcq+16*2]
- movu m3, [srcq+16*3]
- mova [dstq+16*0], m0
- mova [dstq+16*1], m1
- mova [dstq+16*2], m2
- mova [dstq+16*3], m3
- movu m0, [srcq+16*4]
- movu m1, [srcq+16*5]
- movu m2, [srcq+16*6]
- movu m3, [srcq+16*7]
- mova [dstq+16*4], m0
- mova [dstq+16*5], m1
- mova [dstq+16*6], m2
- mova [dstq+16*7], m3
- add srcq, ssq
- add dstq, dsq
- dec hd
- jg .put_w128
- RET
-.h:
- ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
- ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
- imul mxyd, 0xff01
- mova m4, [base+bilin_h_shuf8]
- mova m0, [base+bilin_h_shuf4]
- add mxyd, 16 << 8
- movd m5, mxyd
- mov mxyd, r7m ; my
- pshuflw m5, m5, q0000
- punpcklqdq m5, m5
- test mxyd, mxyd
- jnz .hv
- movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)]
- mova m3, [base+pw_2048]
- add wq, t0
- RESTORE_DSQ_32 t0
- jmp wq
-.h_w2:
- pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
-.h_w2_loop:
- movd m0, [srcq+ssq*0]
- movd m1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- punpckldq m0, m1
- pshufb m0, m4
- pmaddubsw m0, m5
- pmulhrsw m0, m3
- packuswb m0, m0
- movd r6d, m0
- mov [dstq+dsq*0], r6w
- shr r6d, 16
- mov [dstq+dsq*1], r6w
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .h_w2_loop
- RET
-.h_w4:
- movq m4, [srcq+ssq*0]
- movhps m4, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- pshufb m4, m0
- pmaddubsw m4, m5
- pmulhrsw m4, m3
- packuswb m4, m4
- movd [dstq+dsq*0], m4
- psrlq m4, 32
- movd [dstq+dsq*1], m4
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .h_w4
- RET
-.h_w8:
- movu m0, [srcq+ssq*0]
- movu m1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmulhrsw m0, m3
- pmulhrsw m1, m3
- packuswb m0, m1
- movq [dstq+dsq*0], m0
- movhps [dstq+dsq*1], m0
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .h_w8
- RET
-.h_w16:
- movu m0, [srcq+8*0]
- movu m1, [srcq+8*1]
- add srcq, ssq
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmulhrsw m0, m3
- pmulhrsw m1, m3
- packuswb m0, m1
- mova [dstq], m0
- add dstq, dsq
- dec hd
- jg .h_w16
- RET
-.h_w32:
- movu m0, [srcq+mmsize*0+8*0]
- movu m1, [srcq+mmsize*0+8*1]
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmulhrsw m0, m3
- pmulhrsw m1, m3
- packuswb m0, m1
- movu m1, [srcq+mmsize*1+8*0]
- movu m2, [srcq+mmsize*1+8*1]
- add srcq, ssq
- pshufb m1, m4
- pshufb m2, m4
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmulhrsw m1, m3
- pmulhrsw m2, m3
- packuswb m1, m2
- mova [dstq+16*0], m0
- mova [dstq+16*1], m1
- add dstq, dsq
- dec hd
- jg .h_w32
- RET
-.h_w64:
- mov r6, -16*3
-.h_w64_loop:
- movu m0, [srcq+r6+16*3+8*0]
- movu m1, [srcq+r6+16*3+8*1]
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmulhrsw m0, m3
- pmulhrsw m1, m3
- packuswb m0, m1
- mova [dstq+r6+16*3], m0
- add r6, 16
- jle .h_w64_loop
- add srcq, ssq
- add dstq, dsq
- dec hd
- jg .h_w64
- RET
-.h_w128:
- mov r6, -16*7
-.h_w128_loop:
- movu m0, [srcq+r6+16*7+8*0]
- movu m1, [srcq+r6+16*7+8*1]
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmulhrsw m0, m3
- pmulhrsw m1, m3
- packuswb m0, m1
- mova [dstq+r6+16*7], m0
- add r6, 16
- jle .h_w128_loop
- add srcq, ssq
- add dstq, dsq
- dec hd
- jg .h_w128
- RET
-.v:
- movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)]
- imul mxyd, 0xff01
- mova m5, [base+pw_2048]
- add mxyd, 16 << 8
- add wq, t0
- movd m4, mxyd
- pshuflw m4, m4, q0000
- punpcklqdq m4, m4
- RESTORE_DSQ_32 t0
- jmp wq
-.v_w2:
- movd m0, [srcq+ssq*0]
-.v_w2_loop:
- pinsrw m0, [srcq+ssq*1], 1 ; 0 1
- lea srcq, [srcq+ssq*2]
- pshuflw m2, m0, q2301
- pinsrw m0, [srcq+ssq*0], 0 ; 2 1
- punpcklbw m1, m0, m2
- pmaddubsw m1, m4
- pmulhrsw m1, m5
- packuswb m1, m1
- movd r6d, m1
- mov [dstq+dsq*1], r6w
- shr r6d, 16
- mov [dstq+dsq*0], r6w
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .v_w2_loop
- RET
-.v_w4:
- movd m0, [srcq+ssq*0]
-.v_w4_loop:
- movd m1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- punpckldq m2, m0, m1 ; 0 1
- movd m0, [srcq+ssq*0]
- punpckldq m1, m0 ; 1 2
- punpcklbw m1, m2
- pmaddubsw m1, m4
- pmulhrsw m1, m5
- packuswb m1, m1
- movd [dstq+dsq*0], m1
- psrlq m1, 32
- movd [dstq+dsq*1], m1
- ;
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .v_w4_loop
- RET
-.v_w8:
- movq m0, [srcq+ssq*0]
-.v_w8_loop:
- movq m3, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- punpcklbw m1, m3, m0
- movq m0, [srcq+ssq*0]
- punpcklbw m2, m0, m3
- pmaddubsw m1, m4
- pmaddubsw m2, m4
- pmulhrsw m1, m5
- pmulhrsw m2, m5
- packuswb m1, m2
- movq [dstq+dsq*0], m1
- movhps [dstq+dsq*1], m1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .v_w8_loop
- RET
- ;
-%macro PUT_BILIN_V_W16 0
- movu m0, [srcq+ssq*0]
-%%loop:
- movu m3, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- punpcklbw m1, m3, m0
- punpckhbw m2, m3, m0
- movu m0, [srcq+ssq*0]
- pmaddubsw m1, m4
- pmaddubsw m2, m4
- pmulhrsw m1, m5
- pmulhrsw m2, m5
- packuswb m1, m2
- mova [dstq+dsq*0], m1
- punpcklbw m1, m0, m3
- punpckhbw m2, m0, m3
- pmaddubsw m1, m4
- pmaddubsw m2, m4
- pmulhrsw m1, m5
- pmulhrsw m2, m5
- packuswb m1, m2
- mova [dstq+dsq*1], m1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg %%loop
-%endmacro
- ;
-.v_w16:
- PUT_BILIN_V_W16
- RET
-.v_w16gt:
- mov r4, dstq
- mov r6, srcq
-.v_w16gt_loop:
-%if ARCH_X86_32
- mov bakm, t0q
- RESTORE_DSQ_32 t0
- PUT_BILIN_V_W16
- mov t0q, bakm
-%else
- PUT_BILIN_V_W16
-%endif
- mov hw, t0w
- add r4, mmsize
- add r6, mmsize
- mov dstq, r4
- mov srcq, r6
- sub t0d, 1<<16
- jg .v_w16gt
- RET
-.v_w32:
- lea t0d, [hq+(1<<16)]
- jmp .v_w16gt
-.v_w64:
- lea t0d, [hq+(3<<16)]
- jmp .v_w16gt
-.v_w128:
- lea t0d, [hq+(7<<16)]
- jmp .v_w16gt
-.hv:
- ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
- ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
- movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
- WIN64_SPILL_XMM 8
- shl mxyd, 11 ; can't shift by 12 due to signed overflow
- mova m7, [base+pw_2048]
- movd m6, mxyd
- add wq, t0
- pshuflw m6, m6, q0000
- punpcklqdq m6, m6
- jmp wq
-.hv_w2:
- RESTORE_DSQ_32 t0
- movd m0, [srcq+ssq*0]
- pshufd m0, m0, q0000 ; src[x - src_stride]
- pshufb m0, m4
- pmaddubsw m0, m5
-.hv_w2_loop:
- movd m1, [srcq+ssq*1] ; src[x]
- lea srcq, [srcq+ssq*2]
- movhps m1, [srcq+ssq*0] ; src[x + src_stride]
- pshufd m1, m1, q3120
- pshufb m1, m4
- pmaddubsw m1, m5 ; 1 _ 2 _
- shufps m2, m0, m1, q1032 ; 0 _ 1 _
- mova m0, m1
- psubw m1, m2 ; src[x + src_stride] - src[x]
- paddw m1, m1
- pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x])
- paddw m1, m2 ; src[x] + (my * (src[x + src_stride] - src[x])
- pmulhrsw m1, m7
- packuswb m1, m1
-%if ARCH_X86_64
- movq r6, m1
-%else
- pshuflw m1, m1, q2020
- movd r6d, m1
-%endif
- mov [dstq+dsq*0], r6w
- shr r6, gprsize*4
- mov [dstq+dsq*1], r6w
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .hv_w2_loop
- RET
-.hv_w4:
- mova m4, [base+bilin_h_shuf4]
- RESTORE_DSQ_32 t0
- movddup xm0, [srcq+ssq*0]
- pshufb m0, m4
- pmaddubsw m0, m5
-.hv_w4_loop:
- movq m1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- movhps m1, [srcq+ssq*0]
- pshufb m1, m4
- pmaddubsw m1, m5 ; 1 2
- shufps m2, m0, m1, q1032 ; 0 1
- mova m0, m1
- psubw m1, m2
- paddw m1, m1
- pmulhw m1, m6
- paddw m1, m2
- pmulhrsw m1, m7
- packuswb m1, m1
- movd [dstq+dsq*0], m1
- psrlq m1, 32
- movd [dstq+dsq*1], m1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .hv_w4_loop
- RET
-.hv_w8:
- RESTORE_DSQ_32 t0
- movu m0, [srcq+ssq*0+8*0]
- pshufb m0, m4
- pmaddubsw m0, m5
-.hv_w8_loop:
- movu m2, [srcq+ssq*1+8*0]
- lea srcq, [srcq+ssq*2]
- pshufb m2, m4
- pmaddubsw m2, m5
- psubw m1, m2, m0
- paddw m1, m1
- pmulhw m1, m6
- paddw m1, m0
- movu m0, [srcq+ssq*0+8*0]
- pshufb m0, m4
- pmaddubsw m0, m5
- psubw m3, m0, m2
- paddw m3, m3
- pmulhw m3, m6
- paddw m3, m2
- pmulhrsw m1, m7
- pmulhrsw m3, m7
- packuswb m1, m3
- movq [dstq+dsq*0], m1
- movhps [dstq+dsq*1], m1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .hv_w8_loop
- RET
-.hv_w16:
- xor t0d, t0d
-.hv_w16gt:
- mov r4, dstq
- mov r6, srcq
- %if WIN64
- movaps r4m, xmm8
- %endif
-.hv_w16_loop0:
- movu m0, [srcq+8*0]
- movu m1, [srcq+8*1]
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
-.hv_w16_loop:
-%if ARCH_X86_32
- %define m0tmp [dstq]
-%else
- %define m0tmp m8
-%endif
- add srcq, ssq
- movu m2, [srcq+8*0]
- movu m3, [srcq+8*1]
- pshufb m2, m4
- pshufb m3, m4
- pmaddubsw m2, m5
- pmaddubsw m3, m5
- mova m0tmp, m2
- psubw m2, m0
- paddw m2, m2
- pmulhw m2, m6
- paddw m2, m0
- mova m0, m3
- psubw m3, m1
- paddw m3, m3
- pmulhw m3, m6
- paddw m3, m1
- mova m1, m0
- mova m0, m0tmp
- pmulhrsw m2, m7
- pmulhrsw m3, m7
- packuswb m2, m3
- mova [dstq], m2
- add dstq, dsmp
- dec hd
- jg .hv_w16_loop
- movzx hd, t0w
- add r4, mmsize
- add r6, mmsize
- mov dstq, r4
- mov srcq, r6
- sub t0d, 1<<16
- jg .hv_w16_loop0
- %if WIN64
- movaps xmm8, r4m
- %endif
- RET
-.hv_w32:
- lea t0d, [hq+(1<<16)]
- jmp .hv_w16gt
-.hv_w64:
- lea t0d, [hq+(3<<16)]
- jmp .hv_w16gt
-.hv_w128:
- lea t0d, [hq+(7<<16)]
- jmp .hv_w16gt
-
-DECLARE_REG_TMP 3, 5, 6
-%if ARCH_X86_32
- %define base t2-prep_ssse3
-%else
- %define base 0
-%endif
-cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
- movifnidn mxyd, r5m ; mx
- LEA t2, prep_ssse3
- tzcnt wd, wm
- movifnidn hd, hm
- test mxyd, mxyd
- jnz .h
- mov mxyd, r6m ; my
- test mxyd, mxyd
- jnz .v
-.prep:
- movzx wd, word [t2+wq*2+table_offset(prep,)]
- add wq, t2
- lea stride3q, [strideq*3]
- jmp wq
-.prep_w4:
- movd m0, [srcq+strideq*0]
- movd m1, [srcq+strideq*1]
- movd m2, [srcq+strideq*2]
- movd m3, [srcq+stride3q ]
- punpckldq m0, m1
- punpckldq m2, m3
- lea srcq, [srcq+strideq*4]
- pxor m1, m1
- punpcklbw m0, m1
- punpcklbw m2, m1
- psllw m0, 4
- psllw m2, 4
- mova [tmpq+mmsize*0], m0
- mova [tmpq+mmsize*1], m2
- add tmpq, 32
- sub hd, 4
- jg .prep_w4
- RET
-.prep_w8:
- movq m0, [srcq+strideq*0]
- movq m1, [srcq+strideq*1]
- movq m2, [srcq+strideq*2]
- movq m3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- pxor m4, m4
- punpcklbw m0, m4
- punpcklbw m1, m4
- punpcklbw m2, m4
- punpcklbw m3, m4
- psllw m0, 4
- psllw m1, 4
- psllw m2, 4
- psllw m3, 4
- mova [tmpq+16*0], m0
- mova [tmpq+16*1], m1
- mova [tmpq+16*2], m2
- mova [tmpq+16*3], m3
- add tmpq, 16*4
- sub hd, 4
- jg .prep_w8
- RET
-.prep_w16:
- movq m0, [srcq+strideq*0+8*0]
- movq m1, [srcq+strideq*0+8*1]
- movq m2, [srcq+strideq*1+8*0]
- movq m3, [srcq+strideq*1+8*1]
- lea srcq, [srcq+strideq*2]
- pxor m4, m4
- punpcklbw m0, m4
- punpcklbw m1, m4
- punpcklbw m2, m4
- punpcklbw m3, m4
- psllw m0, 4
- psllw m1, 4
- psllw m2, 4
- psllw m3, 4
- mova [tmpq+16*0], m0
- mova [tmpq+16*1], m1
- mova [tmpq+16*2], m2
- mova [tmpq+16*3], m3
- add tmpq, 16*4
- sub hd, 2
- jg .prep_w16
- RET
-.prep_w16gt:
- mov t1q, srcq
- mov r3q, t2q
-.prep_w16gt_hloop:
- movq m0, [t1q+8*0]
- movq m1, [t1q+8*1]
- movq m2, [t1q+8*2]
- movq m3, [t1q+8*3]
- pxor m4, m4
- punpcklbw m0, m4
- punpcklbw m1, m4
- punpcklbw m2, m4
- punpcklbw m3, m4
- psllw m0, 4
- psllw m1, 4
- psllw m2, 4
- psllw m3, 4
- mova [tmpq+16*0], m0
- mova [tmpq+16*1], m1
- mova [tmpq+16*2], m2
- mova [tmpq+16*3], m3
- add tmpq, 16*4
- add t1q, 32
- sub r3q, 1
- jg .prep_w16gt_hloop
- lea srcq, [srcq+strideq]
- sub hd, 1
- jg .prep_w16gt
- RET
-.prep_w32:
- mov t2q, 1
- jmp .prep_w16gt
-.prep_w64:
- mov t2q, 2
- jmp .prep_w16gt
-.prep_w128:
- mov t2q, 4
- jmp .prep_w16gt
-.h:
- ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
- ; = (16 - mx) * src[x] + mx * src[x + 1]
- imul mxyd, 0xff01
- mova m4, [base+bilin_h_shuf8]
- add mxyd, 16 << 8
- movd xm5, mxyd
- mov mxyd, r6m ; my
- pshuflw m5, m5, q0000
- punpcklqdq m5, m5
- test mxyd, mxyd
- jnz .hv
-%if ARCH_X86_32
- mov t1, t2 ; save base reg for w4
-%endif
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
- add wq, t2
- lea stride3q, [strideq*3]
- jmp wq
-.h_w4:
-%if ARCH_X86_32
- mova m4, [t1-prep_ssse3+bilin_h_shuf4]
-%else
- mova m4, [bilin_h_shuf4]
-%endif
-.h_w4_loop:
- movq m0, [srcq+strideq*0]
- movhps m0, [srcq+strideq*1]
- movq m1, [srcq+strideq*2]
- movhps m1, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- pshufb m0, m4
- pmaddubsw m0, m5
- pshufb m1, m4
- pmaddubsw m1, m5
- mova [tmpq+0 ], m0
- mova [tmpq+16], m1
- add tmpq, 32
- sub hd, 4
- jg .h_w4_loop
- RET
-.h_w8:
- movu m0, [srcq+strideq*0]
- movu m1, [srcq+strideq*1]
- movu m2, [srcq+strideq*2]
- movu m3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- pshufb m0, m4
- pshufb m1, m4
- pshufb m2, m4
- pshufb m3, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmaddubsw m3, m5
- mova [tmpq+16*0], m0
- mova [tmpq+16*1], m1
- mova [tmpq+16*2], m2
- mova [tmpq+16*3], m3
- add tmpq, 16*4
- sub hd, 4
- jg .h_w8
- RET
-.h_w16:
- movu m0, [srcq+strideq*0+8*0]
- movu m1, [srcq+strideq*0+8*1]
- movu m2, [srcq+strideq*1+8*0]
- movu m3, [srcq+strideq*1+8*1]
- lea srcq, [srcq+strideq*2]
- pshufb m0, m4
- pshufb m1, m4
- pshufb m2, m4
- pshufb m3, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmaddubsw m3, m5
- mova [tmpq+16*0], m0
- mova [tmpq+16*1], m1
- mova [tmpq+16*2], m2
- mova [tmpq+16*3], m3
- add tmpq, 16*4
- sub hd, 2
- jg .h_w16
- RET
-.h_w16gt:
- mov t1q, srcq
- mov r3q, t2q
-.h_w16gt_hloop:
- movu m0, [t1q+8*0]
- movu m1, [t1q+8*1]
- movu m2, [t1q+8*2]
- movu m3, [t1q+8*3]
- pshufb m0, m4
- pshufb m1, m4
- pshufb m2, m4
- pshufb m3, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmaddubsw m3, m5
- mova [tmpq+16*0], m0
- mova [tmpq+16*1], m1
- mova [tmpq+16*2], m2
- mova [tmpq+16*3], m3
- add tmpq, 16*4
- add t1q, 32
- sub r3q, 1
- jg .h_w16gt_hloop
- lea srcq, [srcq+strideq]
- sub hd, 1
- jg .h_w16gt
- RET
-.h_w32:
- mov t2q, 1
- jmp .h_w16gt
-.h_w64:
- mov t2q, 2
- jmp .h_w16gt
-.h_w128:
- mov t2q, 4
- jmp .h_w16gt
-.v:
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
- imul mxyd, 0xff01
- add mxyd, 16 << 8
- add wq, t2
- lea stride3q, [strideq*3]
- movd m5, mxyd
- pshuflw m5, m5, q0000
- punpcklqdq m5, m5
- jmp wq
-.v_w4:
- movd m0, [srcq+strideq*0]
-.v_w4_loop:
- movd m1, [srcq+strideq*1]
- movd m2, [srcq+strideq*2]
- movd m3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- punpcklwd m0, m1 ; 0 1 _ _
- punpcklwd m1, m2 ; 1 2 _ _
- punpcklbw m1, m0
- pmaddubsw m1, m5
- pshufd m1, m1, q3120
- mova [tmpq+16*0], m1
- movd m0, [srcq+strideq*0]
- punpcklwd m2, m3 ; 2 3 _ _
- punpcklwd m3, m0 ; 3 4 _ _
- punpcklbw m3, m2
- pmaddubsw m3, m5
- pshufd m3, m3, q3120
- mova [tmpq+16*1], m3
- add tmpq, 32
- sub hd, 4
- jg .v_w4_loop
- RET
-.v_w8:
- movq m0, [srcq+strideq*0]
-.v_w8_loop:
- movq m1, [srcq+strideq*2]
- movq m2, [srcq+strideq*1]
- movq m3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- shufpd m4, m0, m1, 0x0c ; 0 2
- movq m0, [srcq+strideq*0]
- shufpd m2, m3, 0x0c ; 1 3
- shufpd m1, m0, 0x0c ; 2 4
- punpcklbw m3, m2, m4
- pmaddubsw m3, m5
- mova [tmpq+16*0], m3
- punpckhbw m3, m2, m4
- pmaddubsw m3, m5
- mova [tmpq+16*2], m3
- punpcklbw m3, m1, m2
- punpckhbw m1, m2
- pmaddubsw m3, m5
- pmaddubsw m1, m5
- mova [tmpq+16*1], m3
- mova [tmpq+16*3], m1
- add tmpq, 16*4
- sub hd, 4
- jg .v_w8_loop
- RET
-.v_w16:
- movu m0, [srcq+strideq*0]
-.v_w16_loop:
- movu m1, [srcq+strideq*1]
- movu m2, [srcq+strideq*2]
- punpcklbw m3, m1, m0
- punpckhbw m4, m1, m0
- pmaddubsw m3, m5
- pmaddubsw m4, m5
- mova [tmpq+16*0], m3
- mova [tmpq+16*1], m4
- punpcklbw m3, m2, m1
- punpckhbw m4, m2, m1
- pmaddubsw m3, m5
- pmaddubsw m4, m5
- mova [tmpq+16*2], m3
- mova [tmpq+16*3], m4
- movu m3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- movu m0, [srcq+strideq*0]
- add tmpq, 16*8
- punpcklbw m1, m3, m2
- punpckhbw m4, m3, m2
- pmaddubsw m1, m5
- pmaddubsw m4, m5
- mova [tmpq-16*4], m1
- mova [tmpq-16*3], m4
- punpcklbw m1, m0, m3
- punpckhbw m2, m0, m3
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- mova [tmpq-16*2], m1
- mova [tmpq-16*1], m2
- sub hd, 4
- jg .v_w16_loop
- RET
-.v_w32:
- lea t2d, [hq+(0<<16)]
- mov t0d, 64
-.v_w32_start:
-%if ARCH_X86_64
- %if WIN64
- PUSH r7
- %endif
- mov r7, tmpq
-%endif
- mov t1, srcq
-.v_w32_loop_h:
- movu m0, [srcq+strideq*0+16*0] ; 0L
- movu m1, [srcq+strideq*0+16*1] ; 0U
-.v_w32_loop_v:
- movu m2, [srcq+strideq*1+16*0] ; 1L
- movu m3, [srcq+strideq*1+16*1] ; 1U
- lea srcq, [srcq+strideq*2]
- punpcklbw m4, m2, m0
- pmaddubsw m4, m5
- mova [tmpq+16*0], m4
- punpckhbw m4, m2, m0
- pmaddubsw m4, m5
- mova [tmpq+16*1], m4
- punpcklbw m4, m3, m1
- pmaddubsw m4, m5
- mova [tmpq+16*2], m4
- punpckhbw m4, m3, m1
- pmaddubsw m4, m5
- mova [tmpq+16*3], m4
- add tmpq, t0q
- movu m0, [srcq+strideq*0+16*0] ; 2L
- movu m1, [srcq+strideq*0+16*1] ; 2U
- punpcklbw m4, m0, m2
- pmaddubsw m4, m5
- mova [tmpq+16*0], m4
- punpckhbw m4, m0, m2
- pmaddubsw m4, m5
- mova [tmpq+16*1], m4
- punpcklbw m4, m1, m3
- pmaddubsw m4, m5
- mova [tmpq+16*2], m4
- punpckhbw m4, m1, m3
- pmaddubsw m4, m5
- mova [tmpq+16*3], m4
- add tmpq, t0q
- sub hd, 2
- jg .v_w32_loop_v
- movzx hd, t2w
- add t1, 32
- mov srcq, t1
-%if ARCH_X86_64
- add r7, 2*16*2
- mov tmpq, r7
-%else
- mov tmpq, tmpmp
- add tmpq, 2*16*2
- mov tmpmp, tmpq
-%endif
- sub t2d, 1<<16
- jg .v_w32_loop_h
-%if WIN64
- POP r7
-%endif
- RET
-.v_w64:
- lea t2d, [hq+(1<<16)]
- mov t0d, 128
- jmp .v_w32_start
-.v_w128:
- lea t2d, [hq+(3<<16)]
- mov t0d, 256
- jmp .v_w32_start
-.hv:
- ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
- ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
- %assign stack_offset stack_offset - stack_size_padded
- WIN64_SPILL_XMM 8
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
- shl mxyd, 11
- movd xm6, mxyd
- add wq, t2
- pshuflw m6, m6, q0000
- punpcklqdq m6, m6
-%if ARCH_X86_32
- mov t1, t2 ; save base reg for w4
-%endif
- lea stride3q, [strideq*3]
- jmp wq
-.hv_w4:
-%if ARCH_X86_32
- mova m4, [t1-prep_ssse3+bilin_h_shuf4]
-%else
- mova m4, [bilin_h_shuf4]
-%endif
- movq m0, [srcq+strideq*0] ; 0 _
- punpcklqdq m0, m0
- pshufb m0, m4
- pmaddubsw m0, m5
-.hv_w4_loop:
- movq m1, [srcq+strideq*1]
- movhps m1, [srcq+strideq*2] ; 1 _ 2 _
- movq m2, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- movhps m2, [srcq+strideq*0] ; 3 _ 4 _
- pshufb m1, m4
- pshufb m2, m4
- pmaddubsw m1, m5 ; 1 + 2 +
- shufpd m3, m0, m1, 0x01 ; 0 + 1 +
- pmaddubsw m0, m2, m5 ; 3 + 4 +
- shufpd m2, m1, m0, 0x01 ; 2 + 3 +
- psubw m1, m3
- pmulhrsw m1, m6
- paddw m1, m3
- psubw m3, m0, m2
- pmulhrsw m3, m6
- paddw m3, m2
- mova [tmpq+16*0], m1
- mova [tmpq+16*1], m3
- add tmpq, 32
- sub hd, 4
- jg .hv_w4_loop
- RET
-.hv_w8:
- movu m0, [srcq+strideq*0]
- pshufb m0, m4
- pmaddubsw m0, m5 ; 0 +
-.hv_w8_loop:
- movu m1, [srcq+strideq*1] ; 1
- movu m2, [srcq+strideq*2] ; 2
- pshufb m1, m4
- pshufb m2, m4
- pmaddubsw m1, m5 ; 1 +
- pmaddubsw m2, m5 ; 2 +
- psubw m3, m1, m0 ; 1-0
- pmulhrsw m3, m6
- paddw m3, m0
- psubw m7, m2, m1 ; 2-1
- pmulhrsw m7, m6
- paddw m7, m1
- mova [tmpq+16*0], m3
- mova [tmpq+16*1], m7
- movu m1, [srcq+stride3q ] ; 3
- lea srcq, [srcq+strideq*4]
- movu m0, [srcq+strideq*0] ; 4
- pshufb m1, m4
- pshufb m0, m4
- pmaddubsw m1, m5 ; 3 +
- pmaddubsw m0, m5 ; 4 +
- psubw m3, m1, m2 ; 3-2
- pmulhrsw m3, m6
- paddw m3, m2
- psubw m7, m0, m1 ; 4-3
- pmulhrsw m7, m6
- paddw m7, m1
- mova [tmpq+16*2], m3
- mova [tmpq+16*3], m7
- add tmpq, 16*4
- sub hd, 4
- jg .hv_w8_loop
- RET
-.hv_w16:
- lea t2d, [hq+(0<<16)]
- mov t0d, 32
-.hv_w16_start:
-%if ARCH_X86_64
- %if WIN64
- PUSH r7
- %endif
- mov r7, tmpq
-%endif
- mov t1, srcq
-.hv_w16_loop_h:
- movu m0, [srcq+strideq*0+8*0] ; 0L
- movu m1, [srcq+strideq*0+8*1] ; 0U
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5 ; 0L +
- pmaddubsw m1, m5 ; 0U +
-.hv_w16_loop_v:
- movu m2, [srcq+strideq*1+8*0] ; 1L
- pshufb m2, m4
- pmaddubsw m2, m5 ; 1L +
- psubw m3, m2, m0 ; 1L-0L
- pmulhrsw m3, m6
- paddw m3, m0
- mova [tmpq+16*0], m3
- movu m3, [srcq+strideq*1+8*1] ; 1U
- lea srcq, [srcq+strideq*2]
- pshufb m3, m4
- pmaddubsw m3, m5 ; 1U +
- psubw m0, m3, m1 ; 1U-0U
- pmulhrsw m0, m6
- paddw m0, m1
- mova [tmpq+16*1], m0
- add tmpq, t0q
- movu m0, [srcq+strideq*0+8*0] ; 2L
- pshufb m0, m4
- pmaddubsw m0, m5 ; 2L +
- psubw m1, m0, m2 ; 2L-1L
- pmulhrsw m1, m6
- paddw m1, m2
- mova [tmpq+16*0], m1
- movu m1, [srcq+strideq*0+8*1] ; 2U
- pshufb m1, m4
- pmaddubsw m1, m5 ; 2U +
- psubw m2, m1, m3 ; 2U-1U
- pmulhrsw m2, m6
- paddw m2, m3
- mova [tmpq+16*1], m2
- add tmpq, t0q
- sub hd, 2
- jg .hv_w16_loop_v
- movzx hd, t2w
- add t1, 16
- mov srcq, t1
-%if ARCH_X86_64
- add r7, 2*16
- mov tmpq, r7
-%else
- mov tmpq, tmpmp
- add tmpq, 2*16
- mov tmpmp, tmpq
-%endif
- sub t2d, 1<<16
- jg .hv_w16_loop_h
-%if WIN64
- POP r7
-%endif
- RET
-.hv_w32:
- lea t2d, [hq+(1<<16)]
- mov t0d, 64
- jmp .hv_w16_start
-.hv_w64:
- lea t2d, [hq+(3<<16)]
- mov t0d, 128
- jmp .hv_w16_start
-.hv_w128:
- lea t2d, [hq+(7<<16)]
- mov t0d, 256
- jmp .hv_w16_start
-
-; int8_t subpel_filters[5][15][8]
-%assign FILTER_REGULAR (0*15 << 16) | 3*15
-%assign FILTER_SMOOTH (1*15 << 16) | 4*15
-%assign FILTER_SHARP (2*15 << 16) | 3*15
-
-%if ARCH_X86_32
-DECLARE_REG_TMP 1, 2
-%elif WIN64
-DECLARE_REG_TMP 4, 5
-%else
-DECLARE_REG_TMP 7, 8
-%endif
-
-%macro PUT_8TAP_FN 3 ; type, type_h, type_v
-cglobal put_8tap_%1
- mov t0d, FILTER_%2
- mov t1d, FILTER_%3
-%ifnidn %1, sharp_smooth ; skip the jump in the last filter
- jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX)
-%endif
-%endmacro
-
-PUT_8TAP_FN regular, REGULAR, REGULAR
-PUT_8TAP_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_FN sharp_regular, SHARP, REGULAR
-PUT_8TAP_FN sharp, SHARP, SHARP
-PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
-
-%if ARCH_X86_32
- %define base_reg r1
- %define base base_reg-put_ssse3
- %define W32_RESTORE_DSQ mov dsq, dsm
- %define W32_RESTORE_SSQ mov ssq, ssm
-%else
- %define base_reg r8
- %define base 0
- %define W32_RESTORE_DSQ
- %define W32_RESTORE_SSQ
-%endif
-
-cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
-%assign org_stack_offset stack_offset
- imul mxd, mxm, 0x010101
- add mxd, t0d ; 8tap_h, mx, 4tap_h
-%if ARCH_X86_64
- imul myd, mym, 0x010101
- add myd, t1d ; 8tap_v, my, 4tap_v
-%else
- imul ssd, mym, 0x010101
- add ssd, t1d ; 8tap_v, my, 4tap_v
- mov srcq, srcm
-%endif
- mov wd, wm
- movifnidn hd, hm
- LEA base_reg, put_ssse3
- test mxd, 0xf00
- jnz .h
-%if ARCH_X86_32
- test ssd, 0xf00
-%else
- test myd, 0xf00
-%endif
- jnz .v
- tzcnt wd, wd
- movzx wd, word [base_reg+wq*2+table_offset(put,)]
- add wq, base_reg
-; put_bilin mangling jump
-%assign stack_offset org_stack_offset
-%if ARCH_X86_32
- mov dsq, dsm
- mov ssq, ssm
-%elif WIN64
- pop r8
-%endif
- lea r6, [ssq*3]
- jmp wq
-.h:
-%if ARCH_X86_32
- test ssd, 0xf00
-%else
- test myd, 0xf00
-%endif
- jnz .hv
- W32_RESTORE_SSQ
- WIN64_SPILL_XMM 12
- cmp wd, 4
- jl .h_w2
- je .h_w4
- tzcnt wd, wd
-%if ARCH_X86_64
- mova m10, [base+subpel_h_shufA]
- mova m11, [base+subpel_h_shufB]
- mova m9, [base+subpel_h_shufC]
-%endif
- shr mxd, 16
- sub srcq, 3
- movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)]
- movd m5, [base_reg+mxq*8+subpel_filters-put_ssse3+0]
- pshufd m5, m5, q0000
- movd m6, [base_reg+mxq*8+subpel_filters-put_ssse3+4]
- pshufd m6, m6, q0000
- mova m7, [base+pw_34] ; 2 + (8 << 2)
- add wq, base_reg
- jmp wq
-.h_w2:
-%if ARCH_X86_32
- and mxd, 0x7f
-%else
- movzx mxd, mxb
-%endif
- dec srcq
- mova m4, [base+subpel_h_shuf4]
- movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
- pshufd m3, m3, q0000
- mova m5, [base+pw_34] ; 2 + (8 << 2)
- W32_RESTORE_DSQ
-.h_w2_loop:
- movq m0, [srcq+ssq*0]
- movhps m0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- pshufb m0, m4
- pmaddubsw m0, m3
- phaddw m0, m0
- paddw m0, m5 ; pw34
- psraw m0, 6
- packuswb m0, m0
- movd r4d, m0
- mov [dstq+dsq*0], r4w
- shr r4d, 16
- mov [dstq+dsq*1], r4w
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .h_w2_loop
- RET
-.h_w4:
-%if ARCH_X86_32
- and mxd, 0x7f
-%else
- movzx mxd, mxb
-%endif
- dec srcq
- movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
- pshufd m3, m3, q0000
- mova m5, [base+pw_34] ; 2 + (8 << 2)
- mova m6, [base+subpel_h_shufA]
- W32_RESTORE_DSQ
-.h_w4_loop:
- movq m0, [srcq+ssq*0] ; 1
- movq m1, [srcq+ssq*1] ; 2
- lea srcq, [srcq+ssq*2]
- pshufb m0, m6 ; subpel_h_shufA
- pshufb m1, m6 ; subpel_h_shufA
- pmaddubsw m0, m3 ; subpel_filters
- pmaddubsw m1, m3 ; subpel_filters
- phaddw m0, m1
- paddw m0, m5 ; pw34
- psraw m0, 6
- packuswb m0, m0
- movd [dstq+dsq*0], m0
- psrlq m0, 32
- movd [dstq+dsq*1], m0
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .h_w4_loop
- RET
- ;
-%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
- %if ARCH_X86_32
- pshufb %2, %1, [base+subpel_h_shufB]
- pshufb %3, %1, [base+subpel_h_shufC]
- pshufb %1, [base+subpel_h_shufA]
- %else
- pshufb %2, %1, m11; subpel_h_shufB
- pshufb %3, %1, m9 ; subpel_h_shufC
- pshufb %1, m10 ; subpel_h_shufA
- %endif
- pmaddubsw %4, %2, m5 ; subpel +0 B0
- pmaddubsw %2, m6 ; subpel +4 B4
- pmaddubsw %3, m6 ; C4
- pmaddubsw %1, m5 ; A0
- paddw %3, %4 ; C4+B0
- paddw %1, %2 ; A0+B4
- phaddw %1, %3
- paddw %1, m7 ; pw34
- psraw %1, 6
-%endmacro
- ;
-.h_w8:
- movu m0, [srcq+ssq*0]
- movu m1, [srcq+ssq*1]
- PUT_8TAP_H m0, m2, m3, m4
- lea srcq, [srcq+ssq*2]
- PUT_8TAP_H m1, m2, m3, m4
- packuswb m0, m1
-%if ARCH_X86_32
- movq [dstq ], m0
- add dstq, dsm
- movhps [dstq ], m0
- add dstq, dsm
-%else
- movq [dstq+dsq*0], m0
- movhps [dstq+dsq*1], m0
- lea dstq, [dstq+dsq*2]
-%endif
- sub hd, 2
- jg .h_w8
- RET
-.h_w16:
- xor r6d, r6d
- jmp .h_start
-.h_w32:
- mov r6, -16*1
- jmp .h_start
-.h_w64:
- mov r6, -16*3
- jmp .h_start
-.h_w128:
- mov r6, -16*7
-.h_start:
- sub srcq, r6
- sub dstq, r6
- mov r4, r6
-.h_loop:
- movu m0, [srcq+r6+8*0]
- movu m1, [srcq+r6+8*1]
- PUT_8TAP_H m0, m2, m3, m4
- PUT_8TAP_H m1, m2, m3, m4
- packuswb m0, m1
- mova [dstq+r6], m0
- add r6, mmsize
- jle .h_loop
- add srcq, ssq
-%if ARCH_X86_32
- add dstq, dsm
-%else
- add dstq, dsq
-%endif
- mov r6, r4
- dec hd
- jg .h_loop
- RET
-.v:
-%if ARCH_X86_32
- movzx mxd, ssb
- shr ssd, 16
- cmp hd, 6
- cmovs ssd, mxd
- lea ssq, [base_reg+ssq*8+subpel_filters-put_ssse3]
-%else
- %assign stack_offset org_stack_offset
- WIN64_SPILL_XMM 16
- movzx mxd, myb
- shr myd, 16
- cmp hd, 6
- cmovs myd, mxd
- lea myq, [base_reg+myq*8+subpel_filters-put_ssse3]
-%endif
- tzcnt r6d, wd
- movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)]
- mova m7, [base+pw_512]
- psrlw m2, m7, 1 ; 0x0100
- add r6, base_reg
-%if ARCH_X86_32
- %define subpel0 [rsp+mmsize*0]
- %define subpel1 [rsp+mmsize*1]
- %define subpel2 [rsp+mmsize*2]
- %define subpel3 [rsp+mmsize*3]
-%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
- ALLOC_STACK -mmsize*4
-%assign regs_used 7
- movd m0, [ssq+0]
- pshufb m0, m2
- mova subpel0, m0
- movd m0, [ssq+2]
- pshufb m0, m2
- mova subpel1, m0
- movd m0, [ssq+4]
- pshufb m0, m2
- mova subpel2, m0
- movd m0, [ssq+6]
- pshufb m0, m2
- mova subpel3, m0
- mov ssq, [rstk+stack_offset+gprsize*4]
- lea ssq, [ssq*3]
- sub srcq, ssq
- mov ssq, [rstk+stack_offset+gprsize*4]
- mov dsq, [rstk+stack_offset+gprsize*2]
-%else
- %define subpel0 m8
- %define subpel1 m9
- %define subpel2 m10
- %define subpel3 m11
- movd subpel0, [myq+0]
- pshufb subpel0, m2
- movd subpel1, [myq+2]
- pshufb subpel1, m2
- movd subpel2, [myq+4]
- pshufb subpel2, m2
- movd subpel3, [myq+6]
- pshufb subpel3, m2
- lea ss3q, [ssq*3]
- sub srcq, ss3q
-%endif
- jmp r6
-.v_w2:
- movd m2, [srcq+ssq*0] ; 0
- pinsrw m2, [srcq+ssq*1], 2 ; 0 1
- pinsrw m2, [srcq+ssq*2], 4 ; 0 1 2
-%if ARCH_X86_32
- lea srcq, [srcq+ssq*2]
- add srcq, ssq
- pinsrw m2, [srcq+ssq*0], 6 ; 0 1 2 3
- add srcq, ssq
-%else
- pinsrw m2, [srcq+ss3q ], 6 ; 0 1 2 3
- lea srcq, [srcq+ssq*4]
-%endif
- movd m3, [srcq+ssq*0] ; 4
- movd m1, [srcq+ssq*1] ; 5
- movd m0, [srcq+ssq*2] ; 6
-%if ARCH_X86_32
- lea srcq, [srcq+ssq*2]
- add srcq, ssq
-%else
- add srcq, ss3q
-%endif
- punpckldq m3, m1 ; 4 5 _ _
- punpckldq m1, m0 ; 5 6 _ _
- palignr m4, m3, m2, 4 ; 1 2 3 4
- punpcklbw m3, m1 ; 45 56
- punpcklbw m1, m2, m4 ; 01 12
- punpckhbw m2, m4 ; 23 34
-.v_w2_loop:
- pmaddubsw m5, m1, subpel0 ; a0 b0
- mova m1, m2
- pmaddubsw m2, subpel1 ; a1 b1
- paddw m5, m2
- mova m2, m3
- pmaddubsw m3, subpel2 ; a2 b2
- paddw m5, m3
- movd m4, [srcq+ssq*0] ; 7
- punpckldq m3, m0, m4 ; 6 7 _ _
- movd m0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- punpckldq m4, m0 ; 7 8 _ _
- punpcklbw m3, m4 ; 67 78
- pmaddubsw m4, m3, subpel3 ; a3 b3
- paddw m5, m4
- pmulhrsw m5, m7
- packuswb m5, m5
- pshuflw m5, m5, q2020
- movd r6d, m5
- mov [dstq+dsq*0], r6w
- shr r6d, 16
- mov [dstq+dsq*1], r6w
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .v_w2_loop
- RET
-.v_w4:
-%if ARCH_X86_32
-.v_w8:
-.v_w16:
-.v_w32:
-.v_w64:
-.v_w128:
-%endif ; ARCH_X86_32
- lea r6d, [wq - 4] ; horizontal loop
- mov r4, dstq
-%if ARCH_X86_32
-%if STACK_ALIGNMENT < mmsize
- %define srcm [rsp+mmsize*4+gprsize]
-%endif
- mov srcm, srcq
-%else
- mov r7, srcq
-%endif
- shl r6d, (16 - 2) ; (wq / 4) << 16
- mov r6w, hw
-.v_w4_loop0:
- movd m2, [srcq+ssq*0] ; 0
- movhps m2, [srcq+ssq*2] ; 0 _ 2
- movd m3, [srcq+ssq*1] ; 1
-%if ARCH_X86_32
- lea srcq, [srcq+ssq*2]
- add srcq, ssq
- movhps m3, [srcq+ssq*0] ; 1 _ 3
- lea srcq, [srcq+ssq*1]
-%else
- movhps m3, [srcq+ss3q ] ; 1 _ 3
- lea srcq, [srcq+ssq*4]
-%endif
- pshufd m2, m2, q2020 ; 0 2 0 2
- pshufd m3, m3, q2020 ; 1 3 1 3
- punpckldq m2, m3 ; 0 1 2 3
- movd m3, [srcq+ssq*0] ; 4
- movd m1, [srcq+ssq*1] ; 5
- movd m0, [srcq+ssq*2] ; 6
-%if ARCH_X86_32
- lea srcq, [srcq+ssq*2]
- add srcq, ssq
-%else
- add srcq, ss3q
-%endif
- punpckldq m3, m1 ; 4 5 _ _
- punpckldq m1, m0 ; 5 6 _ _
- palignr m4, m3, m2, 4 ; 1 2 3 4
- punpcklbw m3, m1 ; 45 56
- punpcklbw m1, m2, m4 ; 01 12
- punpckhbw m2, m4 ; 23 34
-.v_w4_loop:
- pmaddubsw m5, m1, subpel0 ; a0 b0
- mova m1, m2
- pmaddubsw m2, subpel1 ; a1 b1
- paddw m5, m2
- mova m2, m3
- pmaddubsw m3, subpel2 ; a2 b2
- paddw m5, m3
- movd m4, [srcq+ssq*0]
- punpckldq m3, m0, m4 ; 6 7 _ _
- movd m0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- punpckldq m4, m0 ; 7 8 _ _
- punpcklbw m3, m4 ; 67 78
- pmaddubsw m4, m3, subpel3 ; a3 b3
- paddw m5, m4
- pmulhrsw m5, m7
- packuswb m5, m5
- movd [dstq+dsq*0], m5
- pshufd m5, m5, q0101
- movd [dstq+dsq*1], m5
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .v_w4_loop
- mov hw, r6w ; reset vertical loop
- add r4, 4
- mov dstq, r4
-%if ARCH_X86_32
- mov srcq, srcm
- add srcq, 4
- mov srcm, srcq
-%else
- add r7, 4
- mov srcq, r7
-%endif
- sub r6d, 1<<16 ; horizontal--
- jg .v_w4_loop0
- RET
-%if ARCH_X86_64
-.v_w8:
-.v_w16:
-.v_w32:
-.v_w64:
-.v_w128:
- lea r6d, [wq - 8] ; horizontal loop
- mov r4, dstq
- mov r7, srcq
- shl r6d, 8 - 3; (wq / 8) << 8
- mov r6b, hb
-.v_w8_loop0:
- movq m4, [srcq+ssq*0] ; 0
- movq m5, [srcq+ssq*1] ; 1
- lea srcq, [srcq+ssq*2]
- movq m6, [srcq+ssq*0] ; 2
- movq m0, [srcq+ssq*1] ; 3
- lea srcq, [srcq+ssq*2]
- movq m1, [srcq+ssq*0] ; 4
- movq m2, [srcq+ssq*1] ; 5
- lea srcq, [srcq+ssq*2] ;
- movq m3, [srcq+ssq*0] ; 6
- shufpd m4, m0, 0x0c
- shufpd m5, m1, 0x0c
- punpcklbw m1, m4, m5 ; 01
- punpckhbw m4, m5 ; 34
- shufpd m6, m2, 0x0c
- punpcklbw m2, m5, m6 ; 12
- punpckhbw m5, m6 ; 45
- shufpd m0, m3, 0x0c
- punpcklbw m3, m6, m0 ; 23
- punpckhbw m6, m0 ; 56
-.v_w8_loop:
- movq m12, [srcq+ssq*1] ; 8
- lea srcq, [srcq+ssq*2]
- movq m13, [srcq+ssq*0] ; 9
- pmaddubsw m14, m1, subpel0 ; a0
- pmaddubsw m15, m2, subpel0 ; b0
- mova m1, m3
- mova m2, m4
- pmaddubsw m3, subpel1 ; a1
- pmaddubsw m4, subpel1 ; b1
- paddw m14, m3
- paddw m15, m4
- mova m3, m5
- mova m4, m6
- pmaddubsw m5, subpel2 ; a2
- pmaddubsw m6, subpel2 ; b2
- paddw m14, m5
- paddw m15, m6
- shufpd m6, m0, m12, 0x0d
- shufpd m0, m12, m13, 0x0c
- punpcklbw m5, m6, m0 ; 67
- punpckhbw m6, m0 ; 78
- pmaddubsw m12, m5, subpel3 ; a3
- pmaddubsw m13, m6, subpel3 ; b3
- paddw m14, m12
- paddw m15, m13
- pmulhrsw m14, m7
- pmulhrsw m15, m7
- packuswb m14, m15
- movq [dstq+dsq*0], xm14
- movhps [dstq+dsq*1], xm14
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .v_w8_loop
- movzx hd, r6b ; reset vertical loop
- add r4, 8
- add r7, 8
- mov dstq, r4
- mov srcq, r7
- sub r6d, 1<<8 ; horizontal--
- jg .v_w8_loop0
- RET
-%endif ;ARCH_X86_64
-%undef subpel0
-%undef subpel1
-%undef subpel2
-%undef subpel3
-.hv:
- %assign stack_offset org_stack_offset
- cmp wd, 4
- jg .hv_w8
-%if ARCH_X86_32
- and mxd, 0x7f
-%else
- movzx mxd, mxb
-%endif
- dec srcq
- movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
-%if ARCH_X86_32
- movzx mxd, ssb
- shr ssd, 16
- cmp hd, 6
- cmovs ssd, mxd
- movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
- W32_RESTORE_SSQ
- lea r6, [ssq*3]
- sub srcq, r6
- %define base_reg r6
- mov r6, r1; use as new base
- %assign regs_used 2
- ALLOC_STACK -mmsize*14
- %assign regs_used 7
- mov dsq, [rstk+stack_offset+gprsize*2]
- %define subpelv0 [rsp+mmsize*0]
- %define subpelv1 [rsp+mmsize*1]
- %define subpelv2 [rsp+mmsize*2]
- %define subpelv3 [rsp+mmsize*3]
- punpcklqdq m0, m0
- punpcklbw m0, m0
- psraw m0, 8 ; sign-extend
- pshufd m6, m0, q0000
- mova subpelv0, m6
- pshufd m6, m0, q1111
- mova subpelv1, m6
- pshufd m6, m0, q2222
- mova subpelv2, m6
- pshufd m6, m0, q3333
- mova subpelv3, m6
-%else
- movzx mxd, myb
- shr myd, 16
- cmp hd, 6
- cmovs myd, mxd
- movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
- ALLOC_STACK mmsize*14, 14
- lea ss3q, [ssq*3]
- sub srcq, ss3q
- %define subpelv0 m10
- %define subpelv1 m11
- %define subpelv2 m12
- %define subpelv3 m13
- punpcklqdq m0, m0
- punpcklbw m0, m0
- psraw m0, 8 ; sign-extend
- mova m8, [base+pw_8192]
- mova m9, [base+pd_512]
- pshufd m10, m0, q0000
- pshufd m11, m0, q1111
- pshufd m12, m0, q2222
- pshufd m13, m0, q3333
-%endif
- pshufd m7, m1, q0000
- cmp wd, 4
- je .hv_w4
-.hv_w2:
- mova m6, [base+subpel_h_shuf4]
- ;
- movq m2, [srcq+ssq*0] ; 0
- movhps m2, [srcq+ssq*1] ; 0 _ 1
- movq m0, [srcq+ssq*2] ; 2
-%if ARCH_X86_32
- %define w8192reg [base+pw_8192]
- %define d512reg [base+pd_512]
- lea srcq, [srcq+ssq*2]
- add srcq, ssq
- movhps m0, [srcq+ssq*0] ; 2 _ 3
- lea srcq, [srcq+ssq*1]
-%else
- %define w8192reg m8
- %define d512reg m9
- movhps m0, [srcq+ss3q ] ; 2 _ 3
- lea srcq, [srcq+ssq*4]
-%endif
- pshufb m2, m6 ; 0 ~ 1 ~
- pshufb m0, m6 ; 2 ~ 3 ~
- pmaddubsw m2, m7 ; subpel_filters
- pmaddubsw m0, m7 ; subpel_filters
- phaddw m2, m0 ; 0 1 2 3
- pmulhrsw m2, w8192reg
- ;
- movq m3, [srcq+ssq*0] ; 4
- movhps m3, [srcq+ssq*1] ; 4 _ 5
- movq m0, [srcq+ssq*2] ; 6
-%if ARCH_X86_32
- lea srcq, [srcq+ssq*2]
- add srcq, ssq
-%else
- add srcq, ss3q
-%endif
- pshufb m3, m6 ; 4 ~ 5 ~
- pshufb m0, m6 ; 6 ~
- pmaddubsw m3, m7 ; subpel_filters
- pmaddubsw m0, m7 ; subpel_filters
- phaddw m3, m0 ; 4 5 6 _
- pmulhrsw m3, w8192reg
- ;
- palignr m4, m3, m2, 4; V 1 2 3 4
- punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2
- punpckhwd m2, m4 ; V 23 34 2 3 3 4
- pshufd m0, m3, q2121; V 5 6 5 6
- punpcklwd m3, m0 ; V 45 56 4 5 5 6
-.hv_w2_loop:
- pmaddwd m5, m1, subpelv0; V a0 b0
- mova m1, m2 ; V
- pmaddwd m2, subpelv1 ; V a1 b1
- paddd m5, m2 ; V
- mova m2, m3 ; V
- pmaddwd m3, subpelv2 ; a2 b2
- paddd m5, m3 ; V
- movq m4, [srcq+ssq*0] ; V 7
- movhps m4, [srcq+ssq*1] ; V 7 8
- lea srcq, [srcq+ssq*2] ; V
- pshufb m4, m6
- pmaddubsw m4, m7
- phaddw m4, m4
- pmulhrsw m4, w8192reg
- palignr m3, m4, m0, 12
- mova m0, m4
- punpcklwd m3, m0 ; V 67 78
- pmaddwd m4, m3, subpelv3 ; V a3 b3
- paddd m5, d512reg
- paddd m5, m4
- psrad m5, 10
- packssdw m5, m5
- packuswb m5, m5
- movd r4d, m5
- mov [dstq+dsq*0], r4w
- shr r4d, 16
- mov [dstq+dsq*1], r4w
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .hv_w2_loop
- RET
-%undef w8192reg
-%undef d512reg
- ;
-.hv_w4:
-%define hv4_line_0_0 4
-%define hv4_line_0_1 5
-%define hv4_line_0_2 6
-%define hv4_line_0_3 7
-%define hv4_line_0_4 8
-%define hv4_line_0_5 9
-%define hv4_line_1_0 10
-%define hv4_line_1_1 11
-%define hv4_line_1_2 12
-%define hv4_line_1_3 13
- ;
-%macro SAVELINE_W4 3
- mova [rsp+mmsize*hv4_line_%3_%2], %1
-%endmacro
-%macro RESTORELINE_W4 3
- mova %1, [rsp+mmsize*hv4_line_%3_%2]
-%endmacro
- ;
-%if ARCH_X86_32
- %define w8192reg [base+pw_8192]
- %define d512reg [base+pd_512]
-%else
- %define w8192reg m8
- %define d512reg m9
-%endif
- ; lower shuffle 0 1 2 3 4
- mova m6, [base+subpel_h_shuf4]
- movq m5, [srcq+ssq*0] ; 0 _ _ _
- movhps m5, [srcq+ssq*1] ; 0 _ 1 _
- movq m4, [srcq+ssq*2] ; 2 _ _ _
-%if ARCH_X86_32
- lea srcq, [srcq+ssq*2]
- add srcq, ssq
- movhps m4, [srcq+ssq*0] ; 2 _ 3 _
- add srcq, ssq
-%else
- movhps m4, [srcq+ss3q ] ; 2 _ 3 _
- lea srcq, [srcq+ssq*4]
-%endif
- pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
- pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
- pmaddubsw m2, m7 ;H subpel_filters
- pmaddubsw m0, m7 ;H subpel_filters
- phaddw m2, m0 ;H 0 1 2 3
- pmulhrsw m2, w8192reg ;H pw_8192
- SAVELINE_W4 m2, 2, 0
- ; upper shuffle 2 3 4 5 6
- mova m6, [base+subpel_h_shuf4+16]
- pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
- pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
- pmaddubsw m2, m7 ;H subpel_filters
- pmaddubsw m0, m7 ;H subpel_filters
- phaddw m2, m0 ;H 0 1 2 3
- pmulhrsw m2, w8192reg ;H pw_8192
- ;
- ; lower shuffle
- mova m6, [base+subpel_h_shuf4]
- movq m5, [srcq+ssq*0] ; 4 _ _ _
- movhps m5, [srcq+ssq*1] ; 4 _ 5 _
- movq m4, [srcq+ssq*2] ; 6 _ _ _
- pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
- pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
- pmaddubsw m3, m7 ;H subpel_filters
- pmaddubsw m0, m7 ;H subpel_filters
- phaddw m3, m0 ;H 4 5 6 7
- pmulhrsw m3, w8192reg ;H pw_8192
- SAVELINE_W4 m3, 3, 0
- ; upper shuffle
- mova m6, [base+subpel_h_shuf4+16]
- pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
- pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
- pmaddubsw m3, m7 ;H subpel_filters
- pmaddubsw m0, m7 ;H subpel_filters
- phaddw m3, m0 ;H 4 5 6 7
- pmulhrsw m3, w8192reg ;H pw_8192
- ;
-%if ARCH_X86_32
- lea srcq, [srcq+ssq*2]
- add srcq, ssq
-%else
- add srcq, ss3q
-%endif
- ;process high
- palignr m4, m3, m2, 4;V 1 2 3 4
- punpcklwd m1, m2, m4 ; V 01 12
- punpckhwd m2, m4 ; V 23 34
- pshufd m0, m3, q2121;V 5 6 5 6
- punpcklwd m3, m0 ; V 45 56
- SAVELINE_W4 m0, 0, 1
- SAVELINE_W4 m1, 1, 1
- SAVELINE_W4 m2, 2, 1
- SAVELINE_W4 m3, 3, 1
- ;process low
- RESTORELINE_W4 m2, 2, 0
- RESTORELINE_W4 m3, 3, 0
- palignr m4, m3, m2, 4;V 1 2 3 4
- punpcklwd m1, m2, m4 ; V 01 12
- punpckhwd m2, m4 ; V 23 34
- pshufd m0, m3, q2121;V 5 6 5 6
- punpcklwd m3, m0 ; V 45 56
-.hv_w4_loop:
- ;process low
- pmaddwd m5, m1, subpelv0 ; V a0 b0
- mova m1, m2
- pmaddwd m2, subpelv1; V a1 b1
- paddd m5, m2
- mova m2, m3
- pmaddwd m3, subpelv2; V a2 b2
- paddd m5, m3
- ;
- mova m6, [base+subpel_h_shuf4]
- movq m4, [srcq+ssq*0] ; 7
- movhps m4, [srcq+ssq*1] ; 7 _ 8 _
- pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
- pmaddubsw m4, m7 ;H subpel_filters
- phaddw m4, m4 ;H 7 8 7 8
- pmulhrsw m4, w8192reg ;H pw_8192
- palignr m3, m4, m0, 12 ; 6 7 8 7
- mova m0, m4
- punpcklwd m3, m4 ; 67 78
- pmaddwd m4, m3, subpelv3; a3 b3
- paddd m5, d512reg ; pd_512
- paddd m5, m4
- psrad m5, 10
- SAVELINE_W4 m0, 0, 0
- SAVELINE_W4 m1, 1, 0
- SAVELINE_W4 m2, 2, 0
- SAVELINE_W4 m3, 3, 0
- SAVELINE_W4 m5, 5, 0
- ;process high
- RESTORELINE_W4 m0, 0, 1
- RESTORELINE_W4 m1, 1, 1
- RESTORELINE_W4 m2, 2, 1
- RESTORELINE_W4 m3, 3, 1
- pmaddwd m5, m1, subpelv0; V a0 b0
- mova m1, m2
- pmaddwd m2, subpelv1; V a1 b1
- paddd m5, m2
- mova m2, m3
- pmaddwd m3, subpelv2; V a2 b2
- paddd m5, m3
- ;
- mova m6, [base+subpel_h_shuf4+16]
- movq m4, [srcq+ssq*0] ; 7
- movhps m4, [srcq+ssq*1] ; 7 _ 8 _
- pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
- pmaddubsw m4, m7 ;H subpel_filters
- phaddw m4, m4 ;H 7 8 7 8
- pmulhrsw m4, w8192reg ;H pw_8192
- palignr m3, m4, m0, 12 ; 6 7 8 7
- mova m0, m4
- punpcklwd m3, m4 ; 67 78
- pmaddwd m4, m3, subpelv3; a3 b3
- paddd m5, d512reg ; pd_512
- paddd m5, m4
- psrad m4, m5, 10
- ;
- RESTORELINE_W4 m5, 5, 0
- packssdw m5, m4 ; d -> w
- packuswb m5, m5 ; w -> b
- pshuflw m5, m5, q3120
- lea srcq, [srcq+ssq*2]
- movd [dstq+dsq*0], m5
- psrlq m5, 32
- movd [dstq+dsq*1], m5
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- SAVELINE_W4 m0, 0, 1
- SAVELINE_W4 m1, 1, 1
- SAVELINE_W4 m2, 2, 1
- SAVELINE_W4 m3, 3, 1
- RESTORELINE_W4 m0, 0, 0
- RESTORELINE_W4 m1, 1, 0
- RESTORELINE_W4 m2, 2, 0
- RESTORELINE_W4 m3, 3, 0
- jg .hv_w4_loop
- RET
-%undef subpelv0
-%undef subpelv1
-%undef subpelv2
-%undef subpelv3
- ;
-.hv_w8:
- %assign stack_offset org_stack_offset
-%define hv8_line_1 0
-%define hv8_line_2 1
-%define hv8_line_3 2
-%define hv8_line_4 3
-%define hv8_line_6 4
-%macro SAVELINE_W8 2
- mova [rsp+hv8_line_%1*mmsize], %2
-%endmacro
-%macro RESTORELINE_W8 2
- mova %2, [rsp+hv8_line_%1*mmsize]
-%endmacro
- shr mxd, 16
- sub srcq, 3
-%if ARCH_X86_32
- %define base_reg r1
- %define subpelh0 [rsp+mmsize*5]
- %define subpelh1 [rsp+mmsize*6]
- %define subpelv0 [rsp+mmsize*7]
- %define subpelv1 [rsp+mmsize*8]
- %define subpelv2 [rsp+mmsize*9]
- %define subpelv3 [rsp+mmsize*10]
- %define accuv0 [rsp+mmsize*11]
- %define accuv1 [rsp+mmsize*12]
- movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
- movzx mxd, ssb
- shr ssd, 16
- cmp hd, 6
- cmovs ssd, mxd
- movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
- mov ssq, ssmp
- ALLOC_STACK -mmsize*13
-%if STACK_ALIGNMENT < 16
- %define srcm [rsp+mmsize*13+gprsize*1]
- %define dsm [rsp+mmsize*13+gprsize*2]
- mov r6, [rstk+stack_offset+gprsize*2]
- mov dsm, r6
-%endif
- pshufd m0, m1, q0000
- pshufd m1, m1, q1111
- punpcklbw m5, m5
- psraw m5, 8 ; sign-extend
- pshufd m2, m5, q0000
- pshufd m3, m5, q1111
- pshufd m4, m5, q2222
- pshufd m5, m5, q3333
- mova subpelh0, m0
- mova subpelh1, m1
- mova subpelv0, m2
- mova subpelv1, m3
- mova subpelv2, m4
- mova subpelv3, m5
- lea r6, [ssq*3]
- sub srcq, r6
- mov srcm, srcq
-%else
- ALLOC_STACK mmsize*5, 16
- %define subpelh0 m10
- %define subpelh1 m11
- %define subpelv0 m12
- %define subpelv1 m13
- %define subpelv2 m14
- %define subpelv3 m15
- %define accuv0 m8
- %define accuv1 m9
- movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
- movzx mxd, myb
- shr myd, 16
- cmp hd, 6
- cmovs myd, mxd
- movq m1, [base_reg+myq*8+subpel_filters-put_ssse3]
- pshufd subpelh0, m0, q0000
- pshufd subpelh1, m0, q1111
- punpcklqdq m1, m1
- punpcklbw m1, m1
- psraw m1, 8 ; sign-extend
- pshufd subpelv0, m1, q0000
- pshufd subpelv1, m1, q1111
- pshufd subpelv2, m1, q2222
- pshufd subpelv3, m1, q3333
- lea ss3q, [ssq*3]
- sub srcq, ss3q
- mov r7, srcq
-%endif
- lea r6d, [wq-4]
- mov r4, dstq
- shl r6d, (16 - 2)
- mov r6w, hw
-.hv_w8_loop0:
- movu m4, [srcq+ssq*0] ; 0 = _ _
- movu m5, [srcq+ssq*1] ; 1 = _ _
- lea srcq, [srcq+ssq*2]
- ;
-%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
- %if ARCH_X86_32
- pshufb %3, %1, [base+subpel_h_shufB]
- pshufb %4, %1, [base+subpel_h_shufC]
- pshufb %1, [base+subpel_h_shufA]
- %else
- pshufb %3, %1, %6 ; subpel_h_shufB
- pshufb %4, %1, %7 ; subpel_h_shufC
- pshufb %1, %5 ; subpel_h_shufA
- %endif
- pmaddubsw %2, %3, subpelh0 ; subpel +0 C0
- pmaddubsw %4, subpelh1; subpel +4 B4
- pmaddubsw %3, subpelh1; C4
- pmaddubsw %1, subpelh0; A0
- paddw %2, %4 ; C0+B4
- paddw %1, %3 ; A0+C4
- phaddw %1, %2
-%endmacro
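- ; HV_H_W8 is the horizontal 8-tap pass for one source row: shufA/B/C pick
- ; the overlapping source bytes, subpelh0/subpelh1 hold the low/high four
- ; taps, and paddw + phaddw fold the four pmaddubsw partial sums into one
- ; row of 16-bit intermediates (rounded afterwards with pw_8192)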
- ;
-%if ARCH_X86_64
- mova m7, [base+subpel_h_shufA]
- mova m8, [base+subpel_h_shufB]
- mova m9, [base+subpel_h_shufC]
-%endif
- HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
- HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
- movu m6, [srcq+ssq*0] ; 2 = _ _
- movu m0, [srcq+ssq*1] ; 3 = _ _
- lea srcq, [srcq+ssq*2]
- HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
- HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
- ;
- mova m7, [base+pw_8192]
- pmulhrsw m4, m7 ; H pw_8192
- pmulhrsw m5, m7 ; H pw_8192
- pmulhrsw m6, m7 ; H pw_8192
- pmulhrsw m0, m7 ; H pw_8192
- punpcklwd m1, m4, m5 ; 0 1 ~
- punpcklwd m2, m5, m6 ; 1 2 ~
- punpcklwd m3, m6, m0 ; 2 3 ~
- SAVELINE_W8 1, m1
- SAVELINE_W8 2, m2
- SAVELINE_W8 3, m3
- ;
- mova m7, [base+subpel_h_shufA]
- movu m4, [srcq+ssq*0] ; 4 = _ _
- movu m5, [srcq+ssq*1] ; 5 = _ _
- lea srcq, [srcq+ssq*2]
- movu m6, [srcq+ssq*0] ; 6 = _ _
- HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
- HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
- HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
- mova m7, [base+pw_8192]
- pmulhrsw m1, m4, m7 ; H pw_8192 4 ~
- pmulhrsw m2, m5, m7 ; H pw_8192 5 ~
- pmulhrsw m3, m6, m7 ; H pw_8192 6 ~
- punpcklwd m4, m0, m1 ; 3 4 ~
- punpcklwd m5, m1, m2 ; 4 5 ~
- punpcklwd m6, m2, m3 ; 5 6 ~
- ;
- SAVELINE_W8 6, m3
- RESTORELINE_W8 1, m1
- RESTORELINE_W8 2, m2
- RESTORELINE_W8 3, m3
-.hv_w8_loop:
- ; m8 accu for V a
- ; m9 accu for V b
- SAVELINE_W8 1, m3
- SAVELINE_W8 2, m4
- SAVELINE_W8 3, m5
- SAVELINE_W8 4, m6
-%if ARCH_X86_32
- pmaddwd m0, m1, subpelv0 ; a0
- pmaddwd m7, m2, subpelv0 ; b0
- pmaddwd m3, subpelv1 ; a1
- pmaddwd m4, subpelv1 ; b1
- paddd m0, m3
- paddd m7, m4
- pmaddwd m5, subpelv2 ; a2
- pmaddwd m6, subpelv2 ; b2
- paddd m0, m5
- paddd m7, m6
- mova m5, [base+pd_512]
- paddd m0, m5 ; pd_512
- paddd m7, m5 ; pd_512
- mova accuv0, m0
- mova accuv1, m7
-%else
- pmaddwd m8, m1, subpelv0 ; a0
- pmaddwd m9, m2, subpelv0 ; b0
- pmaddwd m3, subpelv1 ; a1
- pmaddwd m4, subpelv1 ; b1
- paddd m8, m3
- paddd m9, m4
- pmaddwd m5, subpelv2 ; a2
- pmaddwd m6, subpelv2 ; b2
- paddd m8, m5
- paddd m9, m6
- mova m7, [base+pd_512]
- paddd m8, m7 ; pd_512
- paddd m9, m7 ; pd_512
- mova m7, [base+subpel_h_shufB]
- mova m6, [base+subpel_h_shufC]
- mova m5, [base+subpel_h_shufA]
-%endif
- movu m0, [srcq+ssq*1] ; 7
- movu m4, [srcq+ssq*2] ; 8
- lea srcq, [srcq+ssq*2]
- HV_H_W8 m0, m1, m2, m3, m5, m7, m6
- HV_H_W8 m4, m1, m2, m3, m5, m7, m6
- mova m5, [base+pw_8192]
- pmulhrsw m0, m5 ; H pw_8192
- pmulhrsw m4, m5 ; H pw_8192
- RESTORELINE_W8 6, m6
- punpcklwd m5, m6, m0 ; 6 7 ~
- punpcklwd m6, m0, m4 ; 7 8 ~
- pmaddwd m1, m5, subpelv3 ; a3
- paddd m2, m1, accuv0
- pmaddwd m1, m6, subpelv3 ; b3
- paddd m1, m1, accuv1 ; H + V
- psrad m2, 10
- psrad m1, 10
- packssdw m2, m1 ; d -> w
- packuswb m2, m1 ; w -> b
- movd [dstq+dsq*0], m2
- psrlq m2, 32
-%if ARCH_X86_32
- add dstq, dsm
- movd [dstq+dsq*0], m2
- add dstq, dsm
-%else
- movd [dstq+dsq*1], m2
- lea dstq, [dstq+dsq*2]
-%endif
- sub hd, 2
- jle .hv_w8_outer
- SAVELINE_W8 6, m4
- RESTORELINE_W8 1, m1
- RESTORELINE_W8 2, m2
- RESTORELINE_W8 3, m3
- RESTORELINE_W8 4, m4
- jmp .hv_w8_loop
-.hv_w8_outer:
- movzx hd, r6w
- add r4, 4
- mov dstq, r4
-%if ARCH_X86_32
- mov srcq, srcm
- add srcq, 4
- mov srcm, srcq
-%else
- add r7, 4
- mov srcq, r7
-%endif
- sub r6d, 1<<16
- jg .hv_w8_loop0
- RET
-
-%if ARCH_X86_32
-DECLARE_REG_TMP 1, 2
-%elif WIN64
-DECLARE_REG_TMP 6, 4
-%else
-DECLARE_REG_TMP 6, 7
-%endif
-%macro PREP_8TAP_FN 3 ; type, type_h, type_v
-cglobal prep_8tap_%1
- mov t0d, FILTER_%2
- mov t1d, FILTER_%3
-%ifnidn %1, sharp_smooth ; skip the jump in the last filter
- jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
-%endif
-%endmacro
-
-PREP_8TAP_FN regular, REGULAR, REGULAR
-PREP_8TAP_FN regular_sharp, REGULAR, SHARP
-PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PREP_8TAP_FN smooth, SMOOTH, SMOOTH
-PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PREP_8TAP_FN sharp_regular, SHARP, REGULAR
-PREP_8TAP_FN sharp, SHARP, SHARP
-PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
-
-%if ARCH_X86_32
- %define base_reg r2
- %define base base_reg-prep_ssse3
- %define W32_RESTORE_SSQ mov strideq, stridem
-%else
- %define base_reg r7
- %define base 0
- %define W32_RESTORE_SSQ
-%endif
-
-cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
-%assign org_stack_offset stack_offset
- imul mxd, mxm, 0x010101
- add mxd, t0d ; 8tap_h, mx, 4tap_h
- imul myd, mym, 0x010101
- add myd, t1d ; 8tap_v, my, 4tap_v
- movsxd wq, wm
- movifnidn srcd, srcm
- movifnidn hd, hm
- LEA base_reg, prep_ssse3
- test mxd, 0xf00
- jnz .h
- test myd, 0xf00
- jnz .v
- tzcnt wd, wd
- movzx wd, word [base_reg+wq*2+table_offset(prep,)]
- add wq, base_reg
- movifnidn strided, stridem
- lea r6, [strideq*3]
- %assign stack_offset org_stack_offset
-%if WIN64
- pop r8
- pop r7
-%endif
- jmp wq
-.h:
- test myd, 0xf00
- jnz .hv
- WIN64_SPILL_XMM 12
- cmp wd, 4
- je .h_w4
- tzcnt wd, wd
-%if ARCH_X86_64
- mova m10, [base+subpel_h_shufA]
- mova m11, [base+subpel_h_shufB]
- mova m9, [base+subpel_h_shufC]
-%endif
- shr mxd, 16
- sub srcq, 3
- movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
- movd m5, [base_reg+mxq*8+subpel_filters-prep_ssse3+0]
- pshufd m5, m5, q0000
- movd m6, [base_reg+mxq*8+subpel_filters-prep_ssse3+4]
- pshufd m6, m6, q0000
- mova m7, [base+pw_8192]
- add wq, base_reg
- jmp wq
-.h_w4:
-%if ARCH_X86_32
- and mxd, 0x7f
-%else
- movzx mxd, mxb
-%endif
- dec srcq
- movd m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
- pshufd m4, m4, q0000
- mova m6, [base+pw_8192]
- mova m5, [base+subpel_h_shufA]
- W32_RESTORE_SSQ
-%if ARCH_X86_64
- lea stride3q, [strideq*3]
-%endif
-.h_w4_loop:
- movq m0, [srcq+strideq*0] ; 0
- movq m1, [srcq+strideq*1] ; 1
-%if ARCH_X86_32
- lea srcq, [srcq+strideq*2]
- movq m2, [srcq+strideq*0] ; 2
- movq m3, [srcq+strideq*1] ; 3
- lea srcq, [srcq+strideq*2]
-%else
- movq m2, [srcq+strideq*2] ; 2
- movq m3, [srcq+stride3q ] ; 3
- lea srcq, [srcq+strideq*4]
-%endif
- pshufb m0, m5 ; subpel_h_shufA
- pshufb m1, m5
- pshufb m2, m5
- pshufb m3, m5
- pmaddubsw m0, m4 ; subpel_filters + 2
- pmaddubsw m1, m4
- pmaddubsw m2, m4
- pmaddubsw m3, m4
- phaddw m0, m1
- phaddw m2, m3
- pmulhrsw m0, m6 ; pw_8192
- pmulhrsw m2, m6 ; pw_8192
- mova [tmpq+16*0], m0
- mova [tmpq+16*1], m2
- add tmpq, 32
- sub hd, 4
- jg .h_w4_loop
- RET
- ;
-%macro PREP_8TAP_H 4 ; dst/src, tmp[1-3]
-%if ARCH_X86_32
- pshufb %2, %1, [base+subpel_h_shufB]
- pshufb %3, %1, [base+subpel_h_shufC]
- pshufb %1, [base+subpel_h_shufA]
-%else
- pshufb %2, %1, m11; subpel_h_shufB
- pshufb %3, %1, m9 ; subpel_h_shufC
- pshufb %1, m10 ; subpel_h_shufA
-%endif
- pmaddubsw %4, %2, m5 ; subpel +0 B0
- pmaddubsw %2, m6 ; subpel +4 B4
- pmaddubsw %3, m6 ; subpel +4 C4
- pmaddubsw %1, m5 ; subpel +0 A0
- paddw %3, %4
- paddw %1, %2
- phaddw %1, %3
- pmulhrsw %1, m7 ; 8192
-%endmacro
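- ; same split-tap scheme as HV_H_W8 above, with the rounding folded in:
- ; pmulhrsw with pw_8192 is in effect a rounded (x + 2) >> 2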
- ;
-.h_w8:
-%if ARCH_X86_32
- mov r3, r2
- %define base_reg r3
- W32_RESTORE_SSQ
-%endif
-.h_w8_loop:
- movu m0, [srcq+strideq*0]
- movu m1, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- PREP_8TAP_H m0, m2, m3, m4
- PREP_8TAP_H m1, m2, m3, m4
- mova [tmpq+16*0], m0
- mova [tmpq+16*1], m1
- add tmpq, 32
- sub hd, 2
- jg .h_w8_loop
- RET
-.h_w16:
- xor r6d, r6d
- jmp .h_start
-.h_w32:
- mov r6, -16*1
- jmp .h_start
-.h_w64:
- mov r6, -16*3
- jmp .h_start
-.h_w128:
- mov r6, -16*7
-.h_start:
-%if ARCH_X86_32
- mov r3, r2
- %define base_reg r3
-%endif
- sub srcq, r6
- mov r5, r6
- W32_RESTORE_SSQ
-.h_loop:
- movu m0, [srcq+r6+8*0]
- movu m1, [srcq+r6+8*1]
- PREP_8TAP_H m0, m2, m3, m4
- PREP_8TAP_H m1, m2, m3, m4
- mova [tmpq+16*0], m0
- mova [tmpq+16*1], m1
- add tmpq, 32
- add r6, 16
- jle .h_loop
- add srcq, strideq
- mov r6, r5
- dec hd
- jg .h_loop
- RET
-%if ARCH_X86_32
- %define base_reg r2
-%endif
-
-.v:
-%if ARCH_X86_32
- mov mxd, myd
- and mxd, 0x7f
-%else
- %assign stack_offset org_stack_offset
- WIN64_SPILL_XMM 16
- movzx mxd, myb
-%endif
- shr myd, 16
- cmp hd, 6
- cmovs myd, mxd
- lea myq, [base_reg+myq*8+subpel_filters-prep_ssse3]
- mova m2, [base+pw_512]
- psrlw m2, m2, 1 ; 0x0100
- mova m7, [base+pw_8192]
-%if ARCH_X86_32
- %define subpel0 [rsp+mmsize*0]
- %define subpel1 [rsp+mmsize*1]
- %define subpel2 [rsp+mmsize*2]
- %define subpel3 [rsp+mmsize*3]
-%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
- ALLOC_STACK -mmsize*4
-%assign regs_used 7
- movd m0, [myq+0]
- pshufb m0, m2
- mova subpel0, m0
- movd m0, [myq+2]
- pshufb m0, m2
- mova subpel1, m0
- movd m0, [myq+4]
- pshufb m0, m2
- mova subpel2, m0
- movd m0, [myq+6]
- pshufb m0, m2
- mova subpel3, m0
- mov strideq, [rstk+stack_offset+gprsize*3]
- lea strideq, [strideq*3]
- sub [rstk+stack_offset+gprsize*2], strideq
- mov strideq, [rstk+stack_offset+gprsize*3]
- mov srcq, [rstk+stack_offset+gprsize*2]
-%else
- %define subpel0 m8
- %define subpel1 m9
- %define subpel2 m10
- %define subpel3 m11
- movd subpel0, [myq+0]
- pshufb subpel0, m2
- movd subpel1, [myq+2]
- pshufb subpel1, m2
- movd subpel2, [myq+4]
- pshufb subpel2, m2
- movd subpel3, [myq+6]
- pshufb subpel3, m2
- lea stride3q, [strideq*3]
- sub srcq, stride3q
- cmp wd, 8
- jg .v_w16
- je .v_w8
-%endif
-.v_w4:
-%if ARCH_X86_32
-%if STACK_ALIGNMENT < mmsize
- %define srcm [rsp+mmsize*4+gprsize*1]
- %define tmpm [rsp+mmsize*4+gprsize*2]
-%endif
- mov tmpm, tmpq
- mov srcm, srcq
- lea r5d, [wq - 4] ; horizontal loop
- shl r5d, (16 - 2) ; (wq / 4) << 16
- mov r5w, hw
-.v_w4_loop0:
-%endif
- movd m2, [srcq+strideq*0] ; 0
- movhps m2, [srcq+strideq*2] ; 0 _ 2
- movd m3, [srcq+strideq*1] ; 1
-%if ARCH_X86_32
- lea srcq, [srcq+strideq*2]
- movhps m3, [srcq+strideq*1] ; 1 _ 3
- lea srcq, [srcq+strideq*2]
-%else
- movhps m3, [srcq+stride3q ] ; 1 _ 3
- lea srcq, [srcq+strideq*4]
-%endif
- pshufd m2, m2, q2020 ; 0 2 0 2
- pshufd m3, m3, q2020 ; 1 3 1 3
- punpckldq m2, m3 ; 0 1 2 3
- movd m3, [srcq+strideq*0] ; 4
- movd m1, [srcq+strideq*1] ; 5
- movd m0, [srcq+strideq*2] ; 6
-%if ARCH_X86_32
- lea srcq, [srcq+strideq*2]
- add srcq, strideq
-%else
- add srcq, stride3q
-%endif
- punpckldq m3, m1 ; 4 5 _ _
- punpckldq m1, m0 ; 5 6 _ _
- palignr m4, m3, m2, 4 ; 1 2 3 4
- punpcklbw m3, m1 ; 45 56
- punpcklbw m1, m2, m4 ; 01 12
- punpckhbw m2, m4 ; 23 34
-.v_w4_loop:
- pmaddubsw m5, m1, subpel0 ; a0 b0
- mova m1, m2
- pmaddubsw m2, subpel1 ; a1 b1
- paddw m5, m2
- mova m2, m3
- pmaddubsw m3, subpel2 ; a2 b2
- paddw m5, m3
- movd m4, [srcq+strideq*0]
- punpckldq m3, m0, m4 ; 6 7 _ _
- movd m0, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- punpckldq m4, m0 ; 7 8 _ _
- punpcklbw m3, m4 ; 67 78
- pmaddubsw m4, m3, subpel3 ; a3 b3
- paddw m5, m4
- pmulhrsw m5, m7
- movq [tmpq+wq*0], m5
- movhps [tmpq+wq*2], m5
- lea tmpq, [tmpq+wq*4]
- sub hd, 2
- jg .v_w4_loop
-%if ARCH_X86_32
- mov hw, r5w ; reset vertical loop
- mov tmpq, tmpm
- mov srcq, srcm
- add tmpq, 8
- add srcq, 4
- mov tmpm, tmpq
- mov srcm, srcq
- sub r5d, 1<<16 ; horizontal--
- jg .v_w4_loop0
-%endif
- RET
-
-%if ARCH_X86_64
-.v_w8:
-.v_w16:
- lea r5d, [wq - 8] ; horizontal loop
- mov r8, tmpq
- mov r6, srcq
- shl r5d, 8 - 3; (wq / 8) << 8
- mov r5b, hb
-.v_w8_loop0:
- movq m4, [srcq+strideq*0] ; 0
- movq m5, [srcq+strideq*1] ; 1
- lea srcq, [srcq+strideq*2]
- movq m6, [srcq+strideq*0] ; 2
- movq m0, [srcq+strideq*1] ; 3
- lea srcq, [srcq+strideq*2]
- movq m1, [srcq+strideq*0] ; 4
- movq m2, [srcq+strideq*1] ; 5
- lea srcq, [srcq+strideq*2] ;
- movq m3, [srcq+strideq*0] ; 6
- shufpd m4, m0, 0x0c
- shufpd m5, m1, 0x0c
- punpcklbw m1, m4, m5 ; 01
- punpckhbw m4, m5 ; 34
- shufpd m6, m2, 0x0c
- punpcklbw m2, m5, m6 ; 12
- punpckhbw m5, m6 ; 45
- shufpd m0, m3, 0x0c
- punpcklbw m3, m6, m0 ; 23
- punpckhbw m6, m0 ; 56
-.v_w8_loop:
- movq m12, [srcq+strideq*1] ; 8
- lea srcq, [srcq+strideq*2]
- movq m13, [srcq+strideq*0] ; 9
- pmaddubsw m14, m1, subpel0 ; a0
- pmaddubsw m15, m2, subpel0 ; b0
- mova m1, m3
- mova m2, m4
- pmaddubsw m3, subpel1 ; a1
- pmaddubsw m4, subpel1 ; b1
- paddw m14, m3
- paddw m15, m4
- mova m3, m5
- mova m4, m6
- pmaddubsw m5, subpel2 ; a2
- pmaddubsw m6, subpel2 ; b2
- paddw m14, m5
- paddw m15, m6
- shufpd m6, m0, m12, 0x0d
- shufpd m0, m12, m13, 0x0c
- punpcklbw m5, m6, m0 ; 67
- punpckhbw m6, m0 ; 78
- pmaddubsw m12, m5, subpel3 ; a3
- pmaddubsw m13, m6, subpel3 ; b3
- paddw m14, m12
- paddw m15, m13
- pmulhrsw m14, m7
- pmulhrsw m15, m7
- movu [tmpq+wq*0], xm14
- movu [tmpq+wq*2], xm15
- lea tmpq, [tmpq+wq*4]
- sub hd, 2
- jg .v_w8_loop
- movzx hd, r5b ; reset vertical loop
- add r8, 16
- add r6, 8
- mov tmpq, r8
- mov srcq, r6
- sub r5d, 1<<8 ; horizontal--
- jg .v_w8_loop0
- RET
-%endif ;ARCH_X86_64
-%undef subpel0
-%undef subpel1
-%undef subpel2
-%undef subpel3
-
-.hv:
- %assign stack_offset org_stack_offset
- cmp wd, 4
- jg .hv_w8
- and mxd, 0x7f
- movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
-%if ARCH_X86_32
- mov mxd, myd
- shr myd, 16
- and mxd, 0x7f
- cmp hd, 6
- cmovs myd, mxd
- movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
- mov r5, r2; use as new base
- %define base_reg r5
- %assign regs_used 2
- ALLOC_STACK -mmsize*14
- %assign regs_used 7
- mov strideq, [rstk+stack_offset+gprsize*3]
- lea strideq, [strideq*3 + 1]
- sub [rstk+stack_offset+gprsize*2], strideq
- mov strideq, [rstk+stack_offset+gprsize*3]
- mov srcq, [rstk+stack_offset+gprsize*2]
- %define subpelv0 [rsp+mmsize*0]
- %define subpelv1 [rsp+mmsize*1]
- %define subpelv2 [rsp+mmsize*2]
- %define subpelv3 [rsp+mmsize*3]
- punpcklbw m0, m0
- psraw m0, 8 ; sign-extend
- pshufd m6, m0, q0000
- mova subpelv0, m6
- pshufd m6, m0, q1111
- mova subpelv1, m6
- pshufd m6, m0, q2222
- mova subpelv2, m6
- pshufd m6, m0, q3333
- mova subpelv3, m6
-%else
- movzx mxd, myb
- shr myd, 16
- cmp hd, 6
- cmovs myd, mxd
- movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
- ALLOC_STACK mmsize*14, 14
- lea stride3q, [strideq*3]
- sub srcq, stride3q
- dec srcq
- %define subpelv0 m10
- %define subpelv1 m11
- %define subpelv2 m12
- %define subpelv3 m13
- punpcklbw m0, m0
- psraw m0, 8 ; sign-extend
- mova m8, [base+pw_8192]
- mova m9, [base+pd_32]
- pshufd m10, m0, q0000
- pshufd m11, m0, q1111
- pshufd m12, m0, q2222
- pshufd m13, m0, q3333
-%endif
- pshufd m7, m1, q0000
-.hv_w4:
-%define hv4_line_0_0 4
-%define hv4_line_0_1 5
-%define hv4_line_0_2 6
-%define hv4_line_0_3 7
-%define hv4_line_0_4 8
-%define hv4_line_0_5 9
-%define hv4_line_1_0 10
-%define hv4_line_1_1 11
-%define hv4_line_1_2 12
-%define hv4_line_1_3 13
- ;
- ;
-%if ARCH_X86_32
- %define w8192reg [base+pw_8192]
- %define d32reg [base+pd_32]
-%else
- %define w8192reg m8
- %define d32reg m9
-%endif
- ; lower shuffle 0 1 2 3 4
- mova m6, [base+subpel_h_shuf4]
- movq m5, [srcq+strideq*0] ; 0 _ _ _
- movhps m5, [srcq+strideq*1] ; 0 _ 1 _
- movq m4, [srcq+strideq*2] ; 2 _ _ _
-%if ARCH_X86_32
- lea srcq, [srcq+strideq*2]
- add srcq, strideq
- movhps m4, [srcq+strideq*0] ; 2 _ 3 _
- add srcq, strideq
-%else
- movhps m4, [srcq+stride3q ] ; 2 _ 3 _
- lea srcq, [srcq+strideq*4]
-%endif
- pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
- pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
- pmaddubsw m2, m7 ;H subpel_filters
- pmaddubsw m0, m7 ;H subpel_filters
- phaddw m2, m0 ;H 0 1 2 3
- pmulhrsw m2, w8192reg ;H pw_8192
- SAVELINE_W4 m2, 2, 0
- ; upper shuffle 2 3 4 5 6
- mova m6, [base+subpel_h_shuf4+16]
- pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
- pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
- pmaddubsw m2, m7 ;H subpel_filters
- pmaddubsw m0, m7 ;H subpel_filters
- phaddw m2, m0 ;H 0 1 2 3
- pmulhrsw m2, w8192reg ;H pw_8192
- ;
- ; lower shuffle
- mova m6, [base+subpel_h_shuf4]
- movq m5, [srcq+strideq*0] ; 4 _ _ _
- movhps m5, [srcq+strideq*1] ; 4 _ 5 _
- movq m4, [srcq+strideq*2] ; 6 _ _ _
- pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
- pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
- pmaddubsw m3, m7 ;H subpel_filters
- pmaddubsw m0, m7 ;H subpel_filters
- phaddw m3, m0 ;H 4 5 6 7
- pmulhrsw m3, w8192reg ;H pw_8192
- SAVELINE_W4 m3, 3, 0
- ; upper shuffle
- mova m6, [base+subpel_h_shuf4+16]
- pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
- pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
- pmaddubsw m3, m7 ;H subpel_filters
- pmaddubsw m0, m7 ;H subpel_filters
- phaddw m3, m0 ;H 4 5 6 7
- pmulhrsw m3, w8192reg ;H pw_8192
- ;
-%if ARCH_X86_32
- lea srcq, [srcq+strideq*2]
- add srcq, strideq
-%else
- add srcq, stride3q
-%endif
- ;process high
- palignr m4, m3, m2, 4;V 1 2 3 4
- punpcklwd m1, m2, m4 ; V 01 12
- punpckhwd m2, m4 ; V 23 34
- pshufd m0, m3, q2121;V 5 6 5 6
- punpcklwd m3, m0 ; V 45 56
- SAVELINE_W4 m0, 0, 1
- SAVELINE_W4 m1, 1, 1
- SAVELINE_W4 m2, 2, 1
- SAVELINE_W4 m3, 3, 1
- ;process low
- RESTORELINE_W4 m2, 2, 0
- RESTORELINE_W4 m3, 3, 0
- palignr m4, m3, m2, 4;V 1 2 3 4
- punpcklwd m1, m2, m4 ; V 01 12
- punpckhwd m2, m4 ; V 23 34
- pshufd m0, m3, q2121;V 5 6 5 6
- punpcklwd m3, m0 ; V 45 56
-.hv_w4_loop:
- ;process low
- pmaddwd m5, m1, subpelv0 ; V a0 b0
- mova m1, m2
- pmaddwd m2, subpelv1; V a1 b1
- paddd m5, m2
- mova m2, m3
- pmaddwd m3, subpelv2; V a2 b2
- paddd m5, m3
- ;
- mova m6, [base+subpel_h_shuf4]
- movq m4, [srcq+strideq*0] ; 7
- movhps m4, [srcq+strideq*1] ; 7 _ 8 _
- pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
- pmaddubsw m4, m7 ;H subpel_filters
- phaddw m4, m4 ;H 7 8 7 8
- pmulhrsw m4, w8192reg ;H pw_8192
- palignr m3, m4, m0, 12 ; 6 7 8 7
- mova m0, m4
- punpcklwd m3, m4 ; 67 78
- pmaddwd m4, m3, subpelv3; a3 b3
- paddd m5, d32reg ; pd_32
- paddd m5, m4
- psrad m5, 6
- SAVELINE_W4 m0, 0, 0
- SAVELINE_W4 m1, 1, 0
- SAVELINE_W4 m2, 2, 0
- SAVELINE_W4 m3, 3, 0
- SAVELINE_W4 m5, 5, 0
- ;process high
- RESTORELINE_W4 m0, 0, 1
- RESTORELINE_W4 m1, 1, 1
- RESTORELINE_W4 m2, 2, 1
- RESTORELINE_W4 m3, 3, 1
- pmaddwd m5, m1, subpelv0; V a0 b0
- mova m1, m2
- pmaddwd m2, subpelv1; V a1 b1
- paddd m5, m2
- mova m2, m3
- pmaddwd m3, subpelv2; V a2 b2
- paddd m5, m3
- ;
- mova m6, [base+subpel_h_shuf4+16]
- movq m4, [srcq+strideq*0] ; 7
- movhps m4, [srcq+strideq*1] ; 7 _ 8 _
- pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
- pmaddubsw m4, m7 ;H subpel_filters
- phaddw m4, m4 ;H 7 8 7 8
- pmulhrsw m4, w8192reg ;H pw_8192
- palignr m3, m4, m0, 12 ; 6 7 8 7
- mova m0, m4
- punpcklwd m3, m4 ; 67 78
- pmaddwd m4, m3, subpelv3; a3 b3
- paddd m5, d32reg ; pd_32
- paddd m5, m4
- psrad m4, m5, 6
- ;
- RESTORELINE_W4 m5, 5, 0
- packssdw m5, m4
- pshufd m5, m5, q3120
- movu [tmpq], m5
- lea srcq, [srcq+strideq*2]
- add tmpq, 16
- sub hd, 2
- SAVELINE_W4 m0, 0, 1
- SAVELINE_W4 m1, 1, 1
- SAVELINE_W4 m2, 2, 1
- SAVELINE_W4 m3, 3, 1
- RESTORELINE_W4 m0, 0, 0
- RESTORELINE_W4 m1, 1, 0
- RESTORELINE_W4 m2, 2, 0
- RESTORELINE_W4 m3, 3, 0
- jg .hv_w4_loop
- RET
-%undef subpelv0
-%undef subpelv1
-%undef subpelv2
-%undef subpelv3
- ;
-
-
-.hv_w8:
- %assign stack_offset org_stack_offset
-%define hv8_line_1 0
-%define hv8_line_2 1
-%define hv8_line_3 2
-%define hv8_line_4 3
-%define hv8_line_6 4
- shr mxd, 16
-%if ARCH_X86_32
- %define base_reg r2
- %define subpelh0 [rsp+mmsize*5]
- %define subpelh1 [rsp+mmsize*6]
- %define subpelv0 [rsp+mmsize*7]
- %define subpelv1 [rsp+mmsize*8]
- %define subpelv2 [rsp+mmsize*9]
- %define subpelv3 [rsp+mmsize*10]
- %define accuv0 [rsp+mmsize*11]
- %define accuv1 [rsp+mmsize*12]
- movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
- mov mxd, myd
- shr myd, 16
- and mxd, 0x7f
- cmp hd, 6
- cmovs myd, mxd
- movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
- ALLOC_STACK -mmsize*13
-%if STACK_ALIGNMENT < mmsize
- mov rstk, r2m
- %define tmpm [rsp+mmsize*13+gprsize*1]
- %define srcm [rsp+mmsize*13+gprsize*2]
- %define stridem [rsp+mmsize*13+gprsize*3]
- mov stridem, rstk
-%endif
- mov r6, r2
-%define base_reg r6
- pshufd m0, m1, q0000
- pshufd m1, m1, q1111
- punpcklbw m5, m5
- psraw m5, 8 ; sign-extend
- pshufd m2, m5, q0000
- pshufd m3, m5, q1111
- pshufd m4, m5, q2222
- pshufd m5, m5, q3333
- mova subpelh0, m0
- mova subpelh1, m1
- mova subpelv0, m2
- mova subpelv1, m3
- mova subpelv2, m4
- mova subpelv3, m5
- W32_RESTORE_SSQ
- lea strided, [strided*3]
- sub srcd, strided
- sub srcd, 3
- mov srcm, srcd
- W32_RESTORE_SSQ
-%else
- ALLOC_STACK mmsize*5, 16
- %define subpelh0 m10
- %define subpelh1 m11
- %define subpelv0 m12
- %define subpelv1 m13
- %define subpelv2 m14
- %define subpelv3 m15
- %define accuv0 m8
- %define accuv1 m9
- movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
- movzx mxd, myb
- shr myd, 16
- cmp hd, 6
- cmovs myd, mxd
- movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
- pshufd subpelh0, m0, q0000
- pshufd subpelh1, m0, q1111
- punpcklbw m1, m1
- psraw m1, 8 ; sign-extend
- pshufd subpelv0, m1, q0000
- pshufd subpelv1, m1, q1111
- pshufd subpelv2, m1, q2222
- pshufd subpelv3, m1, q3333
- lea stride3q, [strideq*3]
- sub srcq, 3
- sub srcq, stride3q
- mov r6, srcq
-%endif
- lea r5d, [wq-4]
-%if ARCH_X86_64
- mov r8, tmpq
-%else
- mov tmpm, tmpq
-%endif
- shl r5d, (16 - 2)
- mov r5w, hw
-.hv_w8_loop0:
- movu m4, [srcq+strideq*0] ; 0 = _ _
- movu m5, [srcq+strideq*1] ; 1 = _ _
- lea srcq, [srcq+strideq*2]
-%if ARCH_X86_64
- mova m7, [base+subpel_h_shufA]
- mova m8, [base+subpel_h_shufB]
- mova m9, [base+subpel_h_shufC]
-%endif
- HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
- HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
- movu m6, [srcq+strideq*0] ; 2 = _ _
- movu m0, [srcq+strideq*1] ; 3 = _ _
- lea srcq, [srcq+strideq*2]
- HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
- HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
- ;
- mova m7, [base+pw_8192]
- pmulhrsw m4, m7 ; H pw_8192
- pmulhrsw m5, m7 ; H pw_8192
- pmulhrsw m6, m7 ; H pw_8192
- pmulhrsw m0, m7 ; H pw_8192
- punpcklwd m1, m4, m5 ; 0 1 ~
- punpcklwd m2, m5, m6 ; 1 2 ~
- punpcklwd m3, m6, m0 ; 2 3 ~
- SAVELINE_W8 1, m1
- SAVELINE_W8 2, m2
- SAVELINE_W8 3, m3
- ;
- mova m7, [base+subpel_h_shufA]
- movu m4, [srcq+strideq*0] ; 4 = _ _
- movu m5, [srcq+strideq*1] ; 5 = _ _
- lea srcq, [srcq+strideq*2]
- movu m6, [srcq+strideq*0] ; 6 = _ _
- HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
- HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
- HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
- mova m7, [base+pw_8192]
- pmulhrsw m1, m4, m7 ; H pw_8192 4 ~
- pmulhrsw m2, m5, m7 ; H pw_8192 5 ~
- pmulhrsw m3, m6, m7 ; H pw_8192 6 ~
- punpcklwd m4, m0, m1 ; 3 4 ~
- punpcklwd m5, m1, m2 ; 4 5 ~
- punpcklwd m6, m2, m3 ; 5 6 ~
- ;
- SAVELINE_W8 6, m3
- RESTORELINE_W8 1, m1
- RESTORELINE_W8 2, m2
- RESTORELINE_W8 3, m3
-.hv_w8_loop:
- ; m8 accu for V a
- ; m9 accu for V b
- SAVELINE_W8 1, m3
- SAVELINE_W8 2, m4
- SAVELINE_W8 3, m5
- SAVELINE_W8 4, m6
-%if ARCH_X86_32
- pmaddwd m0, m1, subpelv0 ; a0
- pmaddwd m7, m2, subpelv0 ; b0
- pmaddwd m3, subpelv1 ; a1
- pmaddwd m4, subpelv1 ; b1
- paddd m0, m3
- paddd m7, m4
- pmaddwd m5, subpelv2 ; a2
- pmaddwd m6, subpelv2 ; b2
- paddd m0, m5
- paddd m7, m6
- mova m5, [base+pd_32]
- paddd m0, m5 ; pd_32
- paddd m7, m5 ; pd_32
- mova accuv0, m0
- mova accuv1, m7
-%else
- pmaddwd m8, m1, subpelv0 ; a0
- pmaddwd m9, m2, subpelv0 ; b0
- pmaddwd m3, subpelv1 ; a1
- pmaddwd m4, subpelv1 ; b1
- paddd m8, m3
- paddd m9, m4
- pmaddwd m5, subpelv2 ; a2
- pmaddwd m6, subpelv2 ; b2
- paddd m8, m5
- paddd m9, m6
- mova m7, [base+pd_32]
- paddd m8, m7 ; pd_32
- paddd m9, m7 ; pd_32
- mova m7, [base+subpel_h_shufB]
- mova m6, [base+subpel_h_shufC]
- mova m5, [base+subpel_h_shufA]
-%endif
- movu m0, [srcq+strideq*1] ; 7
- movu m4, [srcq+strideq*2] ; 8
- lea srcq, [srcq+strideq*2]
- HV_H_W8 m0, m1, m2, m3, m5, m7, m6
- HV_H_W8 m4, m1, m2, m3, m5, m7, m6
- mova m5, [base+pw_8192]
- pmulhrsw m0, m5 ; H pw_8192
- pmulhrsw m4, m5 ; H pw_8192
- RESTORELINE_W8 6, m6
- punpcklwd m5, m6, m0 ; 6 7 ~
- punpcklwd m6, m0, m4 ; 7 8 ~
- pmaddwd m1, m5, subpelv3 ; a3
- paddd m2, m1, accuv0
- pmaddwd m1, m6, subpelv3 ; b3
- paddd m1, m1, accuv1 ; H + V
- psrad m2, 6
- psrad m1, 6
- packssdw m2, m1 ; d -> w
- movq [tmpq+wq*0], m2
- movhps [tmpq+wq*2], m2
- lea tmpq, [tmpq+wq*4]
- sub hd, 2
- jle .hv_w8_outer
- SAVELINE_W8 6, m4
- RESTORELINE_W8 1, m1
- RESTORELINE_W8 2, m2
- RESTORELINE_W8 3, m3
- RESTORELINE_W8 4, m4
- jmp .hv_w8_loop
-.hv_w8_outer:
- movzx hd, r5w
-%if ARCH_X86_32
- add dword tmpm, 8
- mov tmpq, tmpm
- mov srcq, srcm
- add srcq, 4
- mov srcm, srcq
-%else
- add r8, 8
- mov tmpq, r8
- add r6, 4
- mov srcq, r6
-%endif
- sub r5d, 1<<16
- jg .hv_w8_loop0
- RET
-
-%if ARCH_X86_32
- %macro SAVE_ALPHA_BETA 0
- mov alpham, alphad
- mov betam, betad
- %endmacro
-
- %macro SAVE_DELTA_GAMMA 0
- mov deltam, deltad
- mov gammam, gammad
- %endmacro
-
- %macro LOAD_ALPHA_BETA_MX 0
- mov mym, myd
- mov alphad, alpham
- mov betad, betam
- mov mxd, mxm
- %endmacro
-
- %macro LOAD_DELTA_GAMMA_MY 0
- mov mxm, mxd
- mov deltad, deltam
- mov gammad, gammam
- mov myd, mym
- %endmacro
-
- %define PIC_reg r2
- %define PIC_base_offset $$
- %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
-%else
- %define SAVE_ALPHA_BETA
- %define SAVE_DELTA_GAMMA
- %define PIC_sym(sym) sym
-%endif
-
-%if ARCH_X86_32
- %if STACK_ALIGNMENT < required_stack_alignment
- %assign copy_args 8*4
- %else
- %assign copy_args 0
- %endif
-%endif
-
-%macro RELOC_ARGS 0
- %if copy_args
- mov r0, r0m
- mov r1, r1m
- mov r2, r2m
- mov r3, r3m
- mov r5, r5m
- mov dstm, r0
- mov dsm, r1
- mov srcm, r2
- mov ssm, r3
- mov mxm, r5
- mov r0, r6m
- mov mym, r0
- %endif
-%endmacro
-
-%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
- %if cpuflag(sse4)
- pblendw %1, %2, 0xAA
- %else
- pand %2, m10
- por %1, %2
- %endif
-%endmacro
-
-%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
- ; Can be done using gathers, but that's terribly slow on many CPUs
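- ; Each of the 8 output columns has its own 8-tap vertical filter, fetched
- ; from mc_warp_filter at (my + n*delta) >> 10; the taps are interleaved,
- ; multiplied (pmaddwd) with the horizontally filtered rows %3-%10 and
- ; accumulated into two sets of four dwords (%1, %2); gamma then advances
- ; my to the next row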
- %if ARCH_X86_32
- %define m8 m4
- %define m9 m5
- %define m14 m6
- %define m15 m7
- %define m11 m7
- %endif
- %if notcpuflag(ssse3) || ARCH_X86_32
- pxor m11, m11
- %endif
- lea tmp1d, [myq+deltaq*4]
- lea tmp2d, [myq+deltaq*1]
- shr myd, 10
- shr tmp1d, 10
- movq m2, [filterq+myq *8] ; a
- movq m8, [filterq+tmp1q*8] ; e
- lea tmp1d, [tmp2q+deltaq*4]
- lea myd, [tmp2q+deltaq*1]
- shr tmp2d, 10
- shr tmp1d, 10
- movq m3, [filterq+tmp2q*8] ; b
- movq m0, [filterq+tmp1q*8] ; f
- punpcklwd m2, m3
- punpcklwd m8, m0
- lea tmp1d, [myq+deltaq*4]
- lea tmp2d, [myq+deltaq*1]
- shr myd, 10
- shr tmp1d, 10
- movq m0, [filterq+myq *8] ; c
- movq m9, [filterq+tmp1q*8] ; g
- lea tmp1d, [tmp2q+deltaq*4]
- lea myd, [tmp2q+gammaq] ; my += gamma
- shr tmp2d, 10
- shr tmp1d, 10
- movq m3, [filterq+tmp2q*8] ; d
- movq m1, [filterq+tmp1q*8] ; h
- punpcklwd m0, m3
- punpcklwd m9, m1
- punpckldq m1, m2, m0
- punpckhdq m2, m0
- punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
- punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
- punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
- punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
- pmaddwd m0, %3
- pmaddwd m3, %5
- pmaddwd m1, %7
- pmaddwd m14, %9
- paddd m0, m3
- paddd m1, m14
- paddd m0, m1
- mova %1, m0
- %if ARCH_X86_64
- SWAP m3, m14
- %endif
- punpckldq m0, m8, m9
- punpckhdq m8, m9
- punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
- punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
- punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
- punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
- pmaddwd m1, %4
- pmaddwd m14, %6
- pmaddwd m2, %8
- pmaddwd m15, %10
- paddd m1, m14
- paddd m2, m15
- paddd m1, m2
- mova %2, m1
- %if ARCH_X86_64
- SWAP m14, m3
- %endif
-%endmacro
-
-%if ARCH_X86_64
- %define counterd r4d
-%else
- %if copy_args == 0
- %define counterd dword r4m
- %else
- %define counterd dword [esp+stack_size-4*7]
- %endif
-%endif
-
-%macro WARP_AFFINE_8X8T 0
-%if ARCH_X86_64
-cglobal warp_affine_8x8t, 6, 14, 16, 0x90, tmp, ts
-%else
-cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts
- %if copy_args
- %define tmpm [esp+stack_size-4*1]
- %define tsm [esp+stack_size-4*2]
- %endif
-%endif
- call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main
-.loop:
-%if ARCH_X86_32
- %define m12 m4
- %define m13 m5
- %define m14 m6
- %define m15 m7
- mova m12, [esp+0xC0]
- mova m13, [esp+0xD0]
- mova m14, [esp+0xE0]
- mova m15, [esp+0xF0]
-%endif
-%if cpuflag(ssse3)
- psrad m12, 13
- psrad m13, 13
- psrad m14, 13
- psrad m15, 13
- packssdw m12, m13
- packssdw m14, m15
- mova m13, [PIC_sym(pw_8192)]
- pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7
- pmulhrsw m14, m13
-%else
- %if ARCH_X86_32
- %define m10 m0
- %endif
- mova m10, [PIC_sym(pd_16384)]
- paddd m12, m10
- paddd m13, m10
- paddd m14, m10
- paddd m15, m10
- psrad m12, 15
- psrad m13, 15
- psrad m14, 15
- psrad m15, 15
- packssdw m12, m13
- packssdw m14, m15
-%endif
- mova [tmpq+tsq*0], m12
- mova [tmpq+tsq*2], m14
- dec counterd
- jz mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).end
-%if ARCH_X86_32
- mov tmpm, tmpd
- mov r0, [esp+0x100]
- mov r1, [esp+0x104]
-%endif
- call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main2
- lea tmpq, [tmpq+tsq*4]
- jmp .loop
-%endmacro
-
-%macro WARP_AFFINE_8X8 0
-%if ARCH_X86_64
-cglobal warp_affine_8x8, 6, 14, 16, 0x90, \
- dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
- filter, tmp1, delta, my, gamma
-%else
-cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \
- dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
- filter, tmp1, delta, my, gamma
- %define alphaq r0
- %define alphad r0
- %define alpham [esp+gprsize+0x100]
- %define betaq r1
- %define betad r1
- %define betam [esp+gprsize+0x104]
- %define deltaq r0
- %define deltad r0
- %define deltam [esp+gprsize+0x108]
- %define gammaq r1
- %define gammad r1
- %define gammam [esp+gprsize+0x10C]
- %define filterq r3
- %define tmp1q r4
- %define tmp1d r4
- %define tmp1m [esp+gprsize+0x110]
- %define myq r5
- %define myd r5
- %define mym r6m
- %if copy_args
- %define dstm [esp+stack_size-4*1]
- %define dsm [esp+stack_size-4*2]
- %define srcm [esp+stack_size-4*3]
- %define ssm [esp+stack_size-4*4]
- %define mxm [esp+stack_size-4*5]
- %define mym [esp+stack_size-4*6]
- %endif
-%endif
- call .main
- jmp .start
-.loop:
-%if ARCH_X86_32
- mov dstm, dstd
- mov alphad, [esp+0x100]
- mov betad, [esp+0x104]
-%endif
- call .main2
- lea dstq, [dstq+dsq*2]
-.start:
-%if notcpuflag(sse4)
- %if cpuflag(ssse3)
- %define roundval pw_8192
- %else
- %define roundval pd_262144
- %endif
- %if ARCH_X86_64
- mova m10, [PIC_sym(roundval)]
- %else
- %define m10 [PIC_sym(roundval)]
- %endif
-%endif
-%if ARCH_X86_32
- %define m12 m5
- %define m13 m6
- mova m12, [esp+0xC0]
- mova m13, [esp+0xD0]
-%endif
-%if cpuflag(sse4)
- %if ARCH_X86_32
- %define m11 m4
- pxor m11, m11
- %endif
- psrad m12, 18
- psrad m13, 18
- packusdw m12, m13
- pavgw m12, m11 ; (x + (1 << 10)) >> 11
-%else
- %if cpuflag(ssse3)
- psrad m12, 17
- psrad m13, 17
- packssdw m12, m13
- pmulhrsw m12, m10
- %else
- paddd m12, m10
- paddd m13, m10
- psrad m12, 19
- psrad m13, 19
- packssdw m12, m13
- %endif
-%endif
-%if ARCH_X86_32
- %define m14 m6
- %define m15 m7
- mova m14, [esp+0xE0]
- mova m15, [esp+0xF0]
-%endif
-%if cpuflag(sse4)
- psrad m14, 18
- psrad m15, 18
- packusdw m14, m15
- pavgw m14, m11 ; (x + (1 << 10)) >> 11
-%else
- %if cpuflag(ssse3)
- psrad m14, 17
- psrad m15, 17
- packssdw m14, m15
- pmulhrsw m14, m10
- %else
- paddd m14, m10
- paddd m15, m10
- psrad m14, 19
- psrad m15, 19
- packssdw m14, m15
- %endif
-%endif
- packuswb m12, m14
- movq [dstq+dsq*0], m12
- movhps [dstq+dsq*1], m12
- dec counterd
- jg .loop
-.end:
- RET
-ALIGN function_align
-.main:
-%assign stack_offset stack_offset+gprsize
-%if ARCH_X86_32
- %assign stack_size stack_size+4
- %if copy_args
- %assign stack_offset stack_offset-4
- %endif
- RELOC_ARGS
- LEA PIC_reg, $$
- %define PIC_mem [esp+gprsize+0x114]
- mov abcdd, abcdm
- %if copy_args == 0
- mov ssd, ssm
- mov mxd, mxm
- %endif
- mov PIC_mem, PIC_reg
- mov srcd, srcm
-%endif
- movsx deltad, word [abcdq+2*2]
- movsx gammad, word [abcdq+2*3]
- lea tmp1d, [deltaq*3]
- sub gammad, tmp1d ; gamma -= delta*3
- SAVE_DELTA_GAMMA
-%if ARCH_X86_32
- mov abcdd, abcdm
-%endif
- movsx alphad, word [abcdq+2*0]
- movsx betad, word [abcdq+2*1]
- lea tmp1q, [ssq*3+3]
- add mxd, 512+(64<<10)
- lea tmp2d, [alphaq*3]
- sub srcq, tmp1q ; src -= src_stride*3 + 3
-%if ARCH_X86_32
- mov srcm, srcd
- mov PIC_reg, PIC_mem
-%endif
- sub betad, tmp2d ; beta -= alpha*3
- lea filterq, [PIC_sym(mc_warp_filter)]
-%if ARCH_X86_64
- mov myd, r6m
- %if cpuflag(ssse3)
- pxor m11, m11
- %endif
-%endif
- call .h
- psrld m2, m0, 16
- psrld m3, m1, 16
-%if ARCH_X86_32
- %if notcpuflag(ssse3)
- mova [esp+gprsize+0x00], m2
- %endif
- mova [esp+gprsize+0x10], m3
-%endif
- call .h
- psrld m4, m0, 16
- psrld m5, m1, 16
-%if ARCH_X86_32
- mova [esp+gprsize+0x20], m4
- mova [esp+gprsize+0x30], m5
-%endif
- call .h
-%if ARCH_X86_64
- %define blendmask [rsp+gprsize+0x80]
-%else
- %if notcpuflag(ssse3)
- mova m2, [esp+gprsize+0x00]
- %endif
- mova m3, [esp+gprsize+0x10]
- %define blendmask [esp+gprsize+0x120]
- %define m10 m7
-%endif
- pcmpeqd m10, m10
- pslld m10, 16
- mova blendmask, m10
- BLENDHWDW m2, m0 ; 0
- BLENDHWDW m3, m1 ; 2
- mova [rsp+gprsize+0x00], m2
- mova [rsp+gprsize+0x10], m3
- call .h
-%if ARCH_X86_32
- mova m4, [esp+gprsize+0x20]
- mova m5, [esp+gprsize+0x30]
-%endif
- mova m10, blendmask
- BLENDHWDW m4, m0 ; 1
- BLENDHWDW m5, m1 ; 3
- mova [rsp+gprsize+0x20], m4
- mova [rsp+gprsize+0x30], m5
- call .h
-%if ARCH_X86_32
- %if notcpuflag(ssse3)
- mova m2, [esp+gprsize+0x00]
- %endif
- mova m3, [esp+gprsize+0x10]
- %define m10 m5
-%endif
- psrld m6, m2, 16
- psrld m7, m3, 16
- mova m10, blendmask
- BLENDHWDW m6, m0 ; 2
- BLENDHWDW m7, m1 ; 4
- mova [rsp+gprsize+0x40], m6
- mova [rsp+gprsize+0x50], m7
- call .h
-%if ARCH_X86_32
- mova m4, [esp+gprsize+0x20]
- mova m5, [esp+gprsize+0x30]
-%endif
- psrld m2, m4, 16
- psrld m3, m5, 16
- mova m10, blendmask
- BLENDHWDW m2, m0 ; 3
- BLENDHWDW m3, m1 ; 5
- mova [rsp+gprsize+0x60], m2
- mova [rsp+gprsize+0x70], m3
- call .h
-%if ARCH_X86_32
- mova m6, [esp+gprsize+0x40]
- mova m7, [esp+gprsize+0x50]
- %define m10 m7
-%endif
- psrld m4, m6, 16
- psrld m5, m7, 16
- mova m10, blendmask
- BLENDHWDW m4, m0 ; 4
- BLENDHWDW m5, m1 ; 6
-%if ARCH_X86_64
- add myd, 512+(64<<10)
- mova m6, m2
- mova m7, m3
-%else
- mova [esp+gprsize+0x80], m4
- mova [esp+gprsize+0x90], m5
- add dword mym, 512+(64<<10)
-%endif
- mov counterd, 4
- SAVE_ALPHA_BETA
-.main2:
- call .h
-%if ARCH_X86_32
- mova m6, [esp+gprsize+0x60]
- mova m7, [esp+gprsize+0x70]
- %define m10 m5
-%endif
- psrld m6, 16
- psrld m7, 16
- mova m10, blendmask
- BLENDHWDW m6, m0 ; 5
- BLENDHWDW m7, m1 ; 7
-%if ARCH_X86_64
- WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
- m4, m5, \
- [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
- m6, m7
-%else
- mova [esp+gprsize+0xA0], m6
- mova [esp+gprsize+0xB0], m7
- LOAD_DELTA_GAMMA_MY
- WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
- [esp+gprsize+0x00], [esp+gprsize+0x10], \
- [esp+gprsize+0x80], [esp+gprsize+0x90], \
- [esp+gprsize+0x20], [esp+gprsize+0x30], \
- [esp+gprsize+0xA0], [esp+gprsize+0xB0]
- LOAD_ALPHA_BETA_MX
-%endif
- call .h
- mova m2, [rsp+gprsize+0x40]
- mova m3, [rsp+gprsize+0x50]
-%if ARCH_X86_32
- mova m4, [rsp+gprsize+0x80]
- mova m5, [rsp+gprsize+0x90]
- %define m10 m7
-%endif
- mova [rsp+gprsize+0x00], m2
- mova [rsp+gprsize+0x10], m3
- mova [rsp+gprsize+0x40], m4
- mova [rsp+gprsize+0x50], m5
- psrld m4, 16
- psrld m5, 16
- mova m10, blendmask
- BLENDHWDW m4, m0 ; 6
- BLENDHWDW m5, m1 ; 8
-%if ARCH_X86_64
- WARP_V m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
- m6, m7, \
- [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
- m4, m5
-%else
- mova [esp+gprsize+0x80], m4
- mova [esp+gprsize+0x90], m5
- LOAD_DELTA_GAMMA_MY
- WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
- [esp+gprsize+0x20], [esp+gprsize+0x30], \
- [esp+gprsize+0xA0], [esp+gprsize+0xB0], \
- [esp+gprsize+0x00], [esp+gprsize+0x10], \
- [esp+gprsize+0x80], [esp+gprsize+0x90]
- mov mym, myd
- mov dstd, dstm
- mov dsd, dsm
- mov mxd, mxm
-%endif
- mova m2, [rsp+gprsize+0x60]
- mova m3, [rsp+gprsize+0x70]
-%if ARCH_X86_32
- mova m6, [esp+gprsize+0xA0]
- mova m7, [esp+gprsize+0xB0]
-%endif
- mova [rsp+gprsize+0x20], m2
- mova [rsp+gprsize+0x30], m3
- mova [rsp+gprsize+0x60], m6
- mova [rsp+gprsize+0x70], m7
- ret
-ALIGN function_align
-.h:
-%if ARCH_X86_32
- %define m8 m3
- %define m9 m4
- %define m10 m5
- %define m14 m6
- %define m15 m7
-%endif
- lea tmp1d, [mxq+alphaq*4]
- lea tmp2d, [mxq+alphaq*1]
-%if ARCH_X86_32
- %assign stack_offset stack_offset+4
- %assign stack_size stack_size+4
- %define PIC_mem [esp+gprsize*2+0x114]
- mov PIC_mem, PIC_reg
- mov srcd, srcm
-%endif
- movu m10, [srcq]
-%if ARCH_X86_32
- add srcd, ssm
- mov srcm, srcd
- mov PIC_reg, PIC_mem
-%else
- add srcq, ssq
-%endif
- shr mxd, 10
- shr tmp1d, 10
- movq m1, [filterq+mxq *8] ; 0 X
- movq m8, [filterq+tmp1q*8] ; 4 X
- lea tmp1d, [tmp2q+alphaq*4]
- lea mxd, [tmp2q+alphaq*1]
- shr tmp2d, 10
- shr tmp1d, 10
- movhps m1, [filterq+tmp2q*8] ; 0 1
- movhps m8, [filterq+tmp1q*8] ; 4 5
- lea tmp1d, [mxq+alphaq*4]
- lea tmp2d, [mxq+alphaq*1]
- shr mxd, 10
- shr tmp1d, 10
-%if cpuflag(ssse3)
- movq m14, [filterq+mxq *8] ; 2 X
- movq m9, [filterq+tmp1q*8] ; 6 X
- lea tmp1d, [tmp2q+alphaq*4]
- lea mxd, [tmp2q+betaq] ; mx += beta
- shr tmp2d, 10
- shr tmp1d, 10
- movhps m14, [filterq+tmp2q*8] ; 2 3
- movhps m9, [filterq+tmp1q*8] ; 6 7
- pshufb m0, m10, [PIC_sym(warp_8x8_shufA)]
- pmaddubsw m0, m1
- pshufb m1, m10, [PIC_sym(warp_8x8_shufB)]
- pmaddubsw m1, m8
- pshufb m15, m10, [PIC_sym(warp_8x8_shufC)]
- pmaddubsw m15, m14
- pshufb m10, m10, [PIC_sym(warp_8x8_shufD)]
- pmaddubsw m10, m9
- phaddw m0, m15
- phaddw m1, m10
-%else
- %if ARCH_X86_32
- %define m11 m2
- %endif
- pcmpeqw m0, m0
- psrlw m14, m0, 8
- psrlw m15, m10, 8 ; 01 03 05 07 09 11 13 15
- pand m14, m10 ; 00 02 04 06 08 10 12 14
- packuswb m14, m15 ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15
- psrldq m9, m0, 4
- pshufd m0, m14, q0220
- pand m0, m9
- psrldq m14, 1 ; 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __
- pslldq m15, m14, 12
- por m0, m15 ; shufA
- psrlw m15, m0, 8
- psraw m11, m1, 8
- psllw m0, 8
- psllw m1, 8
- psrlw m0, 8
- psraw m1, 8
- pmullw m15, m11
- pmullw m0, m1
- paddw m0, m15 ; pmaddubsw m0, m1
- pshufd m15, m14, q0220
- pand m15, m9
- psrldq m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __
- pslldq m1, m14, 12
- por m15, m1 ; shufC
- pshufd m1, m14, q0220
- pand m1, m9
- psrldq m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __
- pslldq m11, m14, 12
- por m1, m11 ; shufB
- pshufd m10, m14, q0220
- pand m10, m9
- psrldq m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __
- pslldq m14, m14, 12
- por m10, m14 ; shufD
- psrlw m9, m1, 8
- psraw m11, m8, 8
- psllw m1, 8
- psllw m8, 8
- psrlw m1, 8
- psraw m8, 8
- pmullw m9, m11
- pmullw m1, m8
- paddw m1, m9 ; pmaddubsw m1, m8
- movq m14, [filterq+mxq *8] ; 2 X
- movq m9, [filterq+tmp1q*8] ; 6 X
- lea tmp1d, [tmp2q+alphaq*4]
- lea mxd, [tmp2q+betaq] ; mx += beta
- shr tmp2d, 10
- shr tmp1d, 10
- movhps m14, [filterq+tmp2q*8] ; 2 3
- movhps m9, [filterq+tmp1q*8] ; 6 7
- psrlw m8, m15, 8
- psraw m11, m14, 8
- psllw m15, 8
- psllw m14, 8
- psrlw m15, 8
- psraw m14, 8
- pmullw m8, m11
- pmullw m15, m14
- paddw m15, m8 ; pmaddubsw m15, m14
- psrlw m8, m10, 8
- psraw m11, m9, 8
- psllw m10, 8
- psllw m9, 8
- psrlw m10, 8
- psraw m9, 8
- pmullw m8, m11
- pmullw m10, m9
- paddw m10, m8 ; pmaddubsw m10, m9
- pslld m8, m0, 16
- pslld m9, m1, 16
- pslld m14, m15, 16
- pslld m11, m10, 16
- paddw m0, m8
- paddw m1, m9
- paddw m15, m14
- paddw m10, m11
- psrad m0, 16
- psrad m1, 16
- psrad m15, 16
- psrad m10, 16
- packssdw m0, m15 ; phaddw m0, m15
- packssdw m1, m10 ; phaddw m1, m10
-%endif
- mova m14, [PIC_sym(pw_8192)]
- mova m9, [PIC_sym(pd_32768)]
- pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
- pmaddwd m1, m14
- paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword
- paddd m1, m9
- ret
-%endmacro
-
-INIT_XMM sse4
-WARP_AFFINE_8X8
-WARP_AFFINE_8X8T
-
-INIT_XMM ssse3
-WARP_AFFINE_8X8
-WARP_AFFINE_8X8T
-
-INIT_XMM sse2
-WARP_AFFINE_8X8
-WARP_AFFINE_8X8T
-
-INIT_XMM ssse3
-
-%if WIN64
-DECLARE_REG_TMP 6, 4
-%else
-DECLARE_REG_TMP 6, 7
-%endif
-
-%macro BIDIR_FN 1 ; op
- %1 0
- lea stride3q, [strideq*3]
- jmp wq
-.w4_loop:
- %1_INC_PTR 2
- %1 0
- lea dstq, [dstq+strideq*4]
-.w4: ; tile 4x
- movd [dstq ], m0 ; copy dw[0]
- pshuflw m1, m0, q1032 ; swap dw[1] and dw[0]
- movd [dstq+strideq*1], m1 ; copy dw[1]
- punpckhqdq m0, m0 ; swap dw[3,2] with dw[1,0]
- movd [dstq+strideq*2], m0 ; dw[2]
- psrlq m0, 32 ; shift right in dw[3]
- movd [dstq+stride3q ], m0 ; copy
- sub hd, 4
- jg .w4_loop
- RET
-.w8_loop:
- %1_INC_PTR 2
- %1 0
- lea dstq, [dstq+strideq*2]
-.w8:
- movq [dstq ], m0
- movhps [dstq+strideq*1], m0
- sub hd, 2
- jg .w8_loop
- RET
-.w16_loop:
- %1_INC_PTR 2
- %1 0
- lea dstq, [dstq+strideq]
-.w16:
- mova [dstq ], m0
- dec hd
- jg .w16_loop
- RET
-.w32_loop:
- %1_INC_PTR 4
- %1 0
- lea dstq, [dstq+strideq]
-.w32:
- mova [dstq ], m0
- %1 2
- mova [dstq + 16 ], m0
- dec hd
- jg .w32_loop
- RET
-.w64_loop:
- %1_INC_PTR 8
- %1 0
- add dstq, strideq
-.w64:
- %assign i 0
- %rep 4
- mova [dstq + i*16 ], m0
- %assign i i+1
- %if i < 4
- %1 2*i
- %endif
- %endrep
- dec hd
- jg .w64_loop
- RET
-.w128_loop:
- %1_INC_PTR 16
- %1 0
- add dstq, strideq
-.w128:
- %assign i 0
- %rep 8
- mova [dstq + i*16 ], m0
- %assign i i+1
- %if i < 8
- %1 2*i
- %endif
- %endrep
- dec hd
- jg .w128_loop
- RET
-%endmacro
-
-%macro AVG 1 ; src_offset
- ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel
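- ; i.e. (tmp1 + tmp2 + 16) >> 5 via pmulhrsw with pw_1024, clipped to
- ; 8 bits by packuswb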
- mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1
- paddw m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2
- mova m1, [tmp1q+(%1+1)*mmsize]
- paddw m1, [tmp2q+(%1+1)*mmsize]
- pmulhrsw m0, m2
- pmulhrsw m1, m2
- packuswb m0, m1 ; pack/trunc 16 bits from m0 & m1 to 8 bit
-%endmacro
-
-%macro AVG_INC_PTR 1
- add tmp1q, %1*mmsize
- add tmp2q, %1*mmsize
-%endmacro
-
-cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
- LEA r6, avg_ssse3_table
- tzcnt wd, wm ; trailing zeros = log2(w), since w is a power of two
- movifnidn hd, hm ; move h(stack) to h(register) if not already that register
- movsxd wq, dword [r6+wq*4] ; load the jump table offset for this width (log2)
- mova m2, [pw_1024+r6-avg_ssse3_table] ; pw_1024: rounding constant for pmulhrsw
- add wq, r6
- BIDIR_FN AVG
-
-%macro W_AVG 1 ; src_offset
- ; (a * weight + b * (16 - weight) + 128) >> 8
- ; = ((a - b) * weight + (b << 4) + 128) >> 8
- ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
- ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
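- ; e.g. a=100, b=40, weight=10:
- ; (100*10 + 40*6 + 128) >> 8 = 5
- ; ((((100-40) * ((10-16)<<12)) >> 16) + 100 + 8) >> 4 = (-23 + 108) >> 4 = 5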
- mova m2, [tmp1q+(%1+0)*mmsize]
- mova m0, m2
- psubw m2, [tmp2q+(%1+0)*mmsize]
- mova m3, [tmp1q+(%1+1)*mmsize]
- mova m1, m3
- psubw m3, [tmp2q+(%1+1)*mmsize]
- pmulhw m2, m4
- pmulhw m3, m4
- paddw m0, m2
- paddw m1, m3
- pmulhrsw m0, m5
- pmulhrsw m1, m5
- packuswb m0, m1
-%endmacro
-
-%define W_AVG_INC_PTR AVG_INC_PTR
-
-cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
- LEA r6, w_avg_ssse3_table
- tzcnt wd, wm
- movd m4, r6m
- movifnidn hd, hm
- pxor m0, m0
- movsxd wq, dword [r6+wq*4]
- mova m5, [pw_2048+r6-w_avg_ssse3_table]
- pshufb m4, m0
- psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
- add wq, r6
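- ; weights 8..16 wrap to (weight-16) << 12 above and fit in 16 bits; for
- ; weight <= 7 swap tmp1/tmp2 and negate (-weight << 12) so the pmulhw
- ; multiplier stays in range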
- cmp dword r6m, 7
- jg .weight_gt7
- mov r6, tmp1q
- psubw m0, m4
- mov tmp1q, tmp2q
- mova m4, m0 ; -weight
- mov tmp2q, r6
-.weight_gt7:
- BIDIR_FN W_AVG
-
-%macro MASK 1 ; src_offset
- ; (a * m + b * (64 - m) + 512) >> 10
- ; = ((a - b) * m + (b << 6) + 512) >> 10
- ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
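- ; e.g. a=100, b=40, m=48:
- ; (100*48 + 40*16 + 512) >> 10 = 5
- ; ((((40-100) * (-48<<10)) >> 16) + 40 + 8) >> 4 = (45 + 48) >> 4 = 5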
- mova m3, [maskq+(%1+0)*(mmsize/2)]
- mova m0, [tmp2q+(%1+0)*mmsize] ; b
- psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
- mova m6, m3 ; m
- psubb m3, m4, m6 ; -m
- paddw m1, m1 ; (b - a) << 1
- paddb m3, m3 ; -m << 1
- punpcklbw m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16)
- pmulhw m1, m2 ; (-m * (b - a)) << 10
- paddw m0, m1 ; + b
- mova m1, [tmp2q+(%1+1)*mmsize] ; b
- psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
- paddw m2, m2 ; (b - a) << 1
- mova m6, m3 ; (-m << 1)
- punpckhbw m3, m4, m6 ; (-m << 9)
- pmulhw m2, m3 ; (-m * (b - a)) << 10
- paddw m1, m2 ; + b
- pmulhrsw m0, m5 ; round
- pmulhrsw m1, m5 ; round
- packuswb m0, m1 ; interleave 16 -> 8
-%endmacro
-
-%macro MASK_INC_PTR 1
- add maskq, %1*mmsize/2
- add tmp1q, %1*mmsize
- add tmp2q, %1*mmsize
-%endmacro
-
-%if ARCH_X86_64
-cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
- movifnidn hd, hm
-%else
-cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
-%define hd dword r5m
-%endif
-%define base r6-mask_ssse3_table
- LEA r6, mask_ssse3_table
- tzcnt wd, wm
- movsxd wq, dword [r6+wq*4]
- pxor m4, m4
- mova m5, [base+pw_2048]
- add wq, r6
- mov maskq, r6m
- BIDIR_FN MASK
-%undef hd
-
-%macro W_MASK_420_B 2 ; src_offset in bytes, mask_out
- ;**** do m0 = u16.dst[7..0], m%2 = u16.m[7..0] ****
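- ; 64 - m is obtained as (reg_pw_6903 - abs(tmp1 - tmp2)) >> 8 (saturating),
- ; i.e. m = min(38 + ((abs(tmp1 - tmp2) + 8) >> 8), 64)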
- mova m0, [tmp1q+(%1)]
- mova m1, [tmp2q+(%1)]
- mova m2, reg_pw_6903
- psubw m1, m0
- pabsw m%2, m1 ; abs(tmp1 - tmp2)
- mova m3, m2
- psubusw m2, m%2
- psrlw m2, 8 ; 64 - m
- mova m%2, m2
- psllw m2, 10
- pmulhw m1, m2 ; ((tmp2 - tmp1) * (64 - m)) >> 6
- paddw m0, m1 ; tmp1 + (((tmp2 - tmp1) * (64 - m)) >> 6)
- ;**** do m1 = u16.dst[7..0], m%2 = u16.m[7..0] ****
- mova m1, [tmp1q+(%1)+mmsize]
- mova m2, [tmp2q+(%1)+mmsize]
- psubw m2, m1
- pabsw m7, m2 ; abs(tmp1 - tmp2)
- psubusw m3, m7
- psrlw m3, 8 ; 64 - m
- phaddw m%2, m3 ; pack both u16.m[7..0] runs as u8.m[15..0]
- psllw m3, 10
- pmulhw m2, m3
-%if ARCH_X86_32
- mova reg_pw_2048, [base+pw_2048]
-%endif
- paddw m1, m2
- pmulhrsw m0, reg_pw_2048 ; round/scale 2048
- pmulhrsw m1, reg_pw_2048 ; round/scale 2048
- packuswb m0, m1 ; concat m0 = u8.dst[15..0]
-%endmacro
-
-%macro W_MASK_420 2
- W_MASK_420_B (%1*16), %2
-%endmacro
-
-%define base r6-w_mask_420_ssse3_table
-%if ARCH_X86_64
-%define reg_pw_6903 m8
-%define reg_pw_2048 m9
-; args: dst, stride, tmp1, tmp2, w, h, mask, sign
-cglobal w_mask_420, 4, 8, 10, dst, stride, tmp1, tmp2, w, h, mask
- lea r6, [w_mask_420_ssse3_table]
- mov wd, wm
- tzcnt r7d, wd
- movd m0, r7m ; sign
- movifnidn hd, hm
- movsxd r7, [r6+r7*4]
- mova reg_pw_6903, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
- mova reg_pw_2048, [base+pw_2048]
- movd m6, [base+pw_258] ; 64 * 4 + 2
- add r7, r6
- mov maskq, maskmp
- psubw m6, m0
- pshuflw m6, m6, q0000
- punpcklqdq m6, m6
- W_MASK_420 0, 4
- jmp r7
- %define loop_w r7d
-%else
-%define reg_pw_6903 [base+pw_6903]
-%define reg_pw_2048 m3
-cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
- tzcnt wd, wm
- LEA r6, w_mask_420_ssse3_table
- movd m0, r7m ; sign
- mov maskq, r6mp
- mov wd, [r6+wq*4]
- movd m6, [base+pw_258]
- add wq, r6
- psubw m6, m0
- pshuflw m6, m6, q0000
- punpcklqdq m6, m6
- W_MASK_420 0, 4
- jmp wd
- %define loop_w dword r0m
- %define hd dword r5m
-%endif
-.w4_loop:
- add tmp1q, 2*16
- add tmp2q, 2*16
- W_MASK_420 0, 4
- lea dstq, [dstq+strideq*2]
- add maskq, 4
-.w4:
- movd [dstq ], m0 ; copy m0[0]
- pshuflw m1, m0, q1032
- movd [dstq+strideq*1], m1 ; copy m0[1]
- lea dstq, [dstq+strideq*2]
- punpckhqdq m0, m0
- movd [dstq+strideq*0], m0 ; copy m0[2]
- psrlq m0, 32
- movd [dstq+strideq*1], m0 ; copy m0[3]
- psubw m1, m6, m4 ; a _ c _
- psrlq m4, 32 ; b _ d _
- psubw m1, m4
- psrlw m1, 2
- packuswb m1, m1
- pshuflw m1, m1, q2020
- movd [maskq], m1
- sub hd, 4
- jg .w4_loop
- RET
-.w8_loop:
- add tmp1q, 2*16
- add tmp2q, 2*16
- W_MASK_420 0, 4
- lea dstq, [dstq+strideq*2]
- add maskq, 4
-.w8:
- movq [dstq ], m0
- movhps [dstq+strideq*1], m0
- psubw m0, m6, m4
- punpckhqdq m4, m4
- psubw m0, m4
- psrlw m0, 2
- packuswb m0, m0
- movd [maskq], m0
- sub hd, 2
- jg .w8_loop
- RET
-.w16: ; w32/64/128
-%if ARCH_X86_32
- mov wd, wm ; because we altered it in 32bit setup
-%endif
- mov loop_w, wd ; use width as counter
- jmp .w16ge_inner_loop_first
-.w16ge_loop:
- lea tmp1q, [tmp1q+wq*2] ; skip even line pixels
- lea tmp2q, [tmp2q+wq*2] ; skip even line pixels
- sub dstq, wq
- mov loop_w, wd
- lea dstq, [dstq+strideq*2]
-.w16ge_inner_loop:
- W_MASK_420_B 0, 4
-.w16ge_inner_loop_first:
- mova [dstq ], m0
- W_MASK_420_B wq*2, 5 ; load matching even line (offset = widthpx * (16+16))
- mova [dstq+strideq*1], m0
- psubw m1, m6, m4 ; m6 = 64 * 4 + 2 - sign
- psubw m1, m5 ; - odd line mask
- psrlw m1, 2 ; >> 2
- packuswb m1, m1
- movq [maskq], m1
- add tmp1q, 2*16
- add tmp2q, 2*16
- add maskq, 8
- add dstq, 16
- sub loop_w, 16
- jg .w16ge_inner_loop
- sub hd, 2
- jg .w16ge_loop
- RET
-
-%undef reg_pw_6903
-%undef reg_pw_2048
-%undef dst_bak
-%undef loop_w
-%undef orig_w
-%undef hd
-
-%macro BLEND_64M 4; a, b, mask1, mask2
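- ; interleaving {a;b} pixels with {64-m;m} weights makes each pmaddubsw
- ; compute b*m + (64-m)*a, and pmulhrsw with pw_512 is in effect a rounded
- ; shift right by 6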
- punpcklbw m0, %1, %2; {b;a}[7..0]
- punpckhbw %1, %2 ; {b;a}[15..8]
- pmaddubsw m0, %3 ; {b*m[0] + (64-m[0])*a}[7..0] u16
- pmaddubsw %1, %4 ; {b*m[1] + (64-m[1])*a}[15..8] u16
- pmulhrsw m0, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
- pmulhrsw %1, m5 ; {((b*m[1] + (64-m[0])*a) + 1) / 32}[15..8] u16
- packuswb m0, %1 ; {blendpx}[15..0] u8
-%endmacro
-
-%macro BLEND 2; a, b
- psubb m3, m4, m0 ; m3 = (64 - m)
- punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0]
- punpckhbw m3, m0 ; {m;(64-m)}[15..8]
- BLEND_64M %1, %2, m2, m3
-%endmacro
-
-cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
-%define base r6-blend_ssse3_table
- LEA r6, blend_ssse3_table
- tzcnt wd, wm
- movifnidn hd, hm
- movifnidn maskq, maskmp
- movsxd wq, dword [r6+wq*4]
- mova m4, [base+pb_64]
- mova m5, [base+pw_512]
- add wq, r6
- lea r6, [dsq*3]
- jmp wq
-.w4:
- movq m0, [maskq]; m
- movd m1, [dstq+dsq*0] ; a
- movd m6, [dstq+dsq*1]
- punpckldq m1, m6
- movq m6, [tmpq] ; b
- psubb m3, m4, m0 ; m3 = (64 - m)
- punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0]
- punpcklbw m1, m6 ; {b;a}[7..0]
- pmaddubsw m1, m2 ; {b*m[0] + (64-m[0])*a}[7..0] u16
- pmulhrsw m1, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
- packuswb m1, m0 ; {blendpx}[15..0] u8
- movd [dstq+dsq*0], m1
- psrlq m1, 32
- movd [dstq+dsq*1], m1
- add maskq, 8
- add tmpq, 8
- lea dstq, [dstq+dsq*2] ; dst_stride * 2
- sub hd, 2
- jg .w4
- RET
-.w8:
- mova m0, [maskq]; m
- movq m1, [dstq+dsq*0] ; a
- movhps m1, [dstq+dsq*1]
- mova m6, [tmpq] ; b
- BLEND m1, m6
- movq [dstq+dsq*0], m0
- movhps [dstq+dsq*1], m0
- add maskq, 16
- add tmpq, 16
- lea dstq, [dstq+dsq*2] ; dst_stride * 2
- sub hd, 2
- jg .w8
- RET
-.w16:
- mova m0, [maskq]; m
- mova m1, [dstq] ; a
- mova m6, [tmpq] ; b
- BLEND m1, m6
- mova [dstq], m0
- add maskq, 16
- add tmpq, 16
- add dstq, dsq ; dst_stride
- dec hd
- jg .w16
- RET
-.w32:
- %assign i 0
- %rep 2
- mova m0, [maskq+16*i]; m
- mova m1, [dstq+16*i] ; a
- mova m6, [tmpq+16*i] ; b
- BLEND m1, m6
- mova [dstq+i*16], m0
- %assign i i+1
- %endrep
- add maskq, 32
- add tmpq, 32
- add dstq, dsq ; dst_stride
- dec hd
- jg .w32
- RET
-
-cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
-%define base r5-blend_v_ssse3_table
- LEA r5, blend_v_ssse3_table
- tzcnt wd, wm
- movifnidn hd, hm
- movsxd wq, dword [r5+wq*4]
- mova m5, [base+pw_512]
- add wq, r5
- add maskq, obmc_masks-blend_v_ssse3_table
- jmp wq
-.w2:
- movd m3, [maskq+4]
- punpckldq m3, m3
- ; 2 mask blend is provided for 4 pixels / 2 lines
-.w2_loop:
- movd m1, [dstq+dsq*0] ; a {..;a;a}
- pinsrw m1, [dstq+dsq*1], 1
- movd m2, [tmpq] ; b
- punpcklbw m0, m1, m2; {b;a}[7..0]
- pmaddubsw m0, m3 ; {b*m + (64-m)*a}[7..0] u16
- pmulhrsw m0, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
- packuswb m0, m1 ; {blendpx}[8..0] u8
- movd r3d, m0
- mov [dstq+dsq*0], r3w
- shr r3d, 16
- mov [dstq+dsq*1], r3w
- add tmpq, 2*2
- lea dstq, [dstq + dsq * 2]
- sub hd, 2
- jg .w2_loop
- RET
-.w4:
- movddup m3, [maskq+8]
- ; 4 mask blend is provided for 8 pixels / 2 lines
-.w4_loop:
- movd m1, [dstq+dsq*0] ; a
- movd m2, [dstq+dsq*1] ;
- punpckldq m1, m2
- movq m2, [tmpq] ; b
- punpcklbw m1, m2 ; {b;a}[7..0]
- pmaddubsw m1, m3 ; {b*m + (64-m)*a}[7..0] u16
- pmulhrsw m1, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
- packuswb m1, m1 ; {blendpx}[8..0] u8
- movd [dstq], m1
- psrlq m1, 32
- movd [dstq+dsq*1], m1
- add tmpq, 2*4
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .w4_loop
- RET
-.w8:
- mova m3, [maskq+16]
- ; 8 mask blend is provided for 16 pixels
-.w8_loop:
- movq m1, [dstq+dsq*0] ; a
- movhps m1, [dstq+dsq*1]
- mova m2, [tmpq]; b
- BLEND_64M m1, m2, m3, m3
- movq [dstq+dsq*0], m0
- movhps [dstq+dsq*1], m0
- add tmpq, 16
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .w8_loop
- RET
-.w16:
- ; 16 mask blend is provided for 32 pixels
- mova m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
- mova m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
-.w16_loop:
- mova m1, [dstq] ; a
- mova m2, [tmpq] ; b
- BLEND_64M m1, m2, m3, m4
- mova [dstq], m0
- add tmpq, 16
- add dstq, dsq
- dec hd
- jg .w16_loop
- RET
-.w32:
-%if WIN64
- mova [rsp+8], xmm6
-%endif
- mova m3, [maskq+64] ; obmc_masks_32[0] (64-m[0])
- mova m4, [maskq+80] ; obmc_masks_32[1] (64-m[1])
- mova m6, [maskq+96] ; obmc_masks_32[2] (64-m[2])
- ; 16 mask blend is provided for 64 pixels
-.w32_loop:
- mova m1, [dstq+16*0] ; a
- mova m2, [tmpq+16*0] ; b
- BLEND_64M m1, m2, m3, m4
- movq m1, [dstq+16*1] ; a
- punpcklbw m1, [tmpq+16*1] ; b
- pmaddubsw m1, m6
- pmulhrsw m1, m5
- packuswb m1, m1
- mova [dstq+16*0], m0
- movq [dstq+16*1], m1
- add tmpq, 32
- add dstq, dsq
- dec hd
- jg .w32_loop
-%if WIN64
- mova xmm6, [rsp+8]
-%endif
- RET
-
-cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask
-%define base t0-blend_h_ssse3_table
-%if ARCH_X86_32
- ; We need to keep the PIC pointer for w4, reload wd from stack instead
- DECLARE_REG_TMP 6
-%else
- DECLARE_REG_TMP 5
- mov r6d, wd
-%endif
- LEA t0, blend_h_ssse3_table
- tzcnt wd, wm
- mov hd, hm
- movsxd wq, dword [t0+wq*4]
- mova m5, [base+pw_512]
- add wq, t0
- lea maskq, [base+obmc_masks+hq*2]
- lea hd, [hq*3]
- shr hd, 2 ; h * 3/4
- lea maskq, [maskq+hq*2]
- neg hq
- jmp wq
-.w2:
- movd m0, [dstq+dsq*0]
- pinsrw m0, [dstq+dsq*1], 1
- movd m2, [maskq+hq*2]
- movd m1, [tmpq]
- punpcklwd m2, m2
- punpcklbw m0, m1
- pmaddubsw m0, m2
- pmulhrsw m0, m5
- packuswb m0, m0
- movd r3d, m0
- mov [dstq+dsq*0], r3w
- shr r3d, 16
- mov [dstq+dsq*1], r3w
- lea dstq, [dstq+dsq*2]
- add tmpq, 2*2
- add hq, 2
- jl .w2
- RET
-.w4:
-%if ARCH_X86_32
- mova m3, [base+blend_shuf]
-%else
- mova m3, [blend_shuf]
-%endif
-.w4_loop:
- movd m0, [dstq+dsq*0]
- movd m2, [dstq+dsq*1]
- punpckldq m0, m2 ; a
- movq m1, [tmpq] ; b
- movq m2, [maskq+hq*2] ; m
- pshufb m2, m3
- punpcklbw m0, m1
- pmaddubsw m0, m2
- pmulhrsw m0, m5
- packuswb m0, m0
- movd [dstq+dsq*0], m0
- psrlq m0, 32
- movd [dstq+dsq*1], m0
- lea dstq, [dstq+dsq*2]
- add tmpq, 4*2
- add hq, 2
- jl .w4_loop
- RET
-.w8:
- movd m4, [maskq+hq*2]
- punpcklwd m4, m4
- pshufd m3, m4, q0000
- pshufd m4, m4, q1111
- movq m1, [dstq+dsq*0] ; a
- movhps m1, [dstq+dsq*1]
- mova m2, [tmpq]
- BLEND_64M m1, m2, m3, m4
- movq [dstq+dsq*0], m0
- movhps [dstq+dsq*1], m0
- lea dstq, [dstq+dsq*2]
- add tmpq, 8*2
- add hq, 2
- jl .w8
- RET
-; w16/w32/w64/w128
-.w16:
-%if ARCH_X86_32
- mov r6d, wm
-%endif
- sub dsq, r6
-.w16_loop0:
- movd m3, [maskq+hq*2]
- pshuflw m3, m3, q0000
- punpcklqdq m3, m3
- mov wd, r6d
-.w16_loop:
- mova m1, [dstq] ; a
- mova m2, [tmpq] ; b
- BLEND_64M m1, m2, m3, m3
- mova [dstq], m0
- add dstq, 16
- add tmpq, 16
- sub wd, 16
- jg .w16_loop
- add dstq, dsq
- inc hq
- jl .w16_loop0
- RET
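-; (the .w16 path above serves w16/w32/w64/w128: one mask word per row is
-;  broadcast across the row, the inner loop blends 16 pixels at a time, and
-;  dsq was reduced by the row width up front so "add dstq, dsq" steps to the
-;  next row)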
-
-; emu_edge args:
-; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
-; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
-; const pixel *ref, const ptrdiff_t ref_stride
-;
-; bw, bh: total filled size
-; iw, ih: copied block -> fill bottom, right
-; x, y:   offset in bw/bh -> fill top, left
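-; in scalar terms, the prologue below computes (cf. the inline comments):
-;   src        += iclip(y, 0, ih - 1) * sstride + iclip(x, 0, iw - 1)
-;   top_ext     = iclip(-y,          0, bh - 1)
-;   bottom_ext  = iclip(y + bh - ih, 0, bh - 1)
-;   left_ext    = iclip(-x,          0, bw - 1)
-;   right_ext   = iclip(x + bw - iw, 0, bw - 1)
-;   center_w    = bw - left_ext - right_ext, and center_h likewise for bh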
-cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
- y, dst, dstride, src, sstride, \
- bottomext, rightext, blk
- ; we assume that the buffer (stride) is larger than width, so we can
- ; safely overwrite by a few bytes
- pxor m1, m1
-
-%if ARCH_X86_64
- %define reg_zero r12q
- %define reg_tmp r10
- %define reg_src srcq
- %define reg_bottomext bottomextq
- %define reg_rightext rightextq
- %define reg_blkm r9m
-%else
- %define reg_zero r6
- %define reg_tmp r0
- %define reg_src r1
- %define reg_bottomext r0
- %define reg_rightext r1
- %define reg_blkm r2m
-%endif
- ;
- ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
- xor reg_zero, reg_zero
- lea reg_tmp, [ihq-1]
- cmp yq, ihq
- cmovs reg_tmp, yq
- test yq, yq
- cmovs reg_tmp, reg_zero
-%if ARCH_X86_64
- imul reg_tmp, sstrideq
- add srcq, reg_tmp
-%else
- imul reg_tmp, sstridem
- mov reg_src, srcm
- add reg_src, reg_tmp
-%endif
- ;
- ; ref += iclip(x, 0, iw - 1)
- lea reg_tmp, [iwq-1]
- cmp xq, iwq
- cmovs reg_tmp, xq
- test xq, xq
- cmovs reg_tmp, reg_zero
- add reg_src, reg_tmp
-%if ARCH_X86_32
- mov srcm, reg_src
-%endif
- ;
- ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
-%if ARCH_X86_32
- mov r1, r1m ; restore bh
-%endif
- lea reg_bottomext, [yq+bhq]
- sub reg_bottomext, ihq
- lea r3, [bhq-1]
- cmovs reg_bottomext, reg_zero
- ;
-
- DEFINE_ARGS bw, bh, iw, ih, x, \
- topext, dst, dstride, src, sstride, \
- bottomext, rightext, blk
-
- ; top_ext = iclip(-y, 0, bh - 1)
- neg topextq
- cmovs topextq, reg_zero
- cmp reg_bottomext, bhq
- cmovns reg_bottomext, r3
- cmp topextq, bhq
- cmovg topextq, r3
- %if ARCH_X86_32
- mov r4m, reg_bottomext
- ;
- ; right_ext = iclip(x + bw - iw, 0, bw - 1)
- mov r0, r0m ; restore bw
- %endif
- lea reg_rightext, [xq+bwq]
- sub reg_rightext, iwq
- lea r2, [bwq-1]
- cmovs reg_rightext, reg_zero
-
- DEFINE_ARGS bw, bh, iw, ih, leftext, \
- topext, dst, dstride, src, sstride, \
- bottomext, rightext, blk
-
- ; left_ext = iclip(-x, 0, bw - 1)
- neg leftextq
- cmovs leftextq, reg_zero
- cmp reg_rightext, bwq
- cmovns reg_rightext, r2
- %if ARCH_X86_32
- mov r3m, r1
- %endif
- cmp leftextq, bwq
- cmovns leftextq, r2
-
-%undef reg_zero
-%undef reg_tmp
-%undef reg_src
-%undef reg_bottomext
-%undef reg_rightext
-
- DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
- topext, dst, dstride, src, sstride, \
- bottomext, rightext, blk
-
- ; center_h = bh - top_ext - bottom_ext
-%if ARCH_X86_64
- lea r3, [bottomextq+topextq]
- sub centerhq, r3
-%else
- mov r1, centerhm ; restore r1
- sub centerhq, topextq
- sub centerhq, r4m
- mov r1m, centerhq
-%endif
- ;
- ; blk += top_ext * PXSTRIDE(dst_stride)
- mov r2, topextq
-%if ARCH_X86_64
- imul r2, dstrideq
-%else
- mov r6, r6m ; restore dstq
- imul r2, dstridem
-%endif
- add dstq, r2
- mov reg_blkm, dstq ; save pointer for ext
- ;
- ; center_w = bw - left_ext - right_ext
- mov centerwq, bwq
-%if ARCH_X86_64
- lea r3, [rightextq+leftextq]
- sub centerwq, r3
-%else
- sub centerwq, r3m
- sub centerwq, leftextq
-%endif
-
-; v_loop macro
-%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
- %if ARCH_X86_64
- %define reg_tmp r12
- %else
- %define reg_tmp r0
- %endif
-.v_loop_%3:
- %if ARCH_X86_32
- mov r0, r0m
- mov r1, r1m
- %endif
-%if %1
- ; left extension
- %if ARCH_X86_64
- movd m0, [srcq]
- %else
- mov r3, srcm
- movd m0, [r3]
- %endif
- pshufb m0, m1
- xor r3, r3
-.left_loop_%3:
- mova [dstq+r3], m0
- add r3, mmsize
- cmp r3, leftextq
- jl .left_loop_%3
- ; body
- lea reg_tmp, [dstq+leftextq]
-%endif
- xor r3, r3
-.body_loop_%3:
- %if ARCH_X86_64
- movu m0, [srcq+r3]
- %else
- mov r1, srcm
- movu m0, [r1+r3]
- %endif
-%if %1
- movu [reg_tmp+r3], m0
-%else
- movu [dstq+r3], m0
-%endif
- add r3, mmsize
- cmp r3, centerwq
- jl .body_loop_%3
-%if %2
- ; right extension
-%if %1
- add reg_tmp, centerwq
-%else
- lea reg_tmp, [dstq+centerwq]
-%endif
- %if ARCH_X86_64
- movd m0, [srcq+centerwq-1]
- %else
- mov r3, srcm
- movd m0, [r3+centerwq-1]
- %endif
- pshufb m0, m1
- xor r3, r3
-.right_loop_%3:
- movu [reg_tmp+r3], m0
- add r3, mmsize
- %if ARCH_X86_64
- cmp r3, rightextq
- %else
- cmp r3, r3m
- %endif
- jl .right_loop_%3
-%endif
- %if ARCH_X86_64
- add dstq, dstrideq
- add srcq, sstrideq
- dec centerhq
- jg .v_loop_%3
- %else
- add dstq, dstridem
- mov r0, sstridem
- add srcm, r0
- sub dword centerhm, 1
- jg .v_loop_%3
- mov r0, r0m ; restore r0
- %endif
-%endmacro ; v_loop macro
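-; (the four v_loop instantiations below cover the body copy cases: no
-;  extension, left only, left+right, and right only; the left/right cases
-;  replicate the edge pixel across a register via pshufb with the zeroed m1
-;  before/after copying the center columns)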
-
- test leftextq, leftextq
- jnz .need_left_ext
- %if ARCH_X86_64
- test rightextq, rightextq
- jnz .need_right_ext
- %else
- cmp leftextq, r3m ; leftextq is 0 here, so this tests rightext (r3m) != 0
- jne .need_right_ext
- %endif
- v_loop 0, 0, 0
- jmp .body_done
-
- ; left/right extension variants
-.need_left_ext:
- %if ARCH_X86_64
- test rightextq, rightextq
- %else
- mov r3, r3m
- test r3, r3
- %endif
- jnz .need_left_right_ext
- v_loop 1, 0, 1
- jmp .body_done
-
-.need_left_right_ext:
- v_loop 1, 1, 2
- jmp .body_done
-
-.need_right_ext:
- v_loop 0, 1, 3
-
-.body_done:
-; r0: bw
-; r1: x loop counter
-; r4: y loop counter
-; r5: topextq
-; r6: dstq
-; r7: dstrideq
-; r8: srcq
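-; (bottom/top extension below: the bottom loops replicate the last written
-;  row, dstq - dstride, bottomext times downwards; the top loops replicate
-;  the first written row, saved in reg_blkm, topext times starting from the
-;  original dst pointer)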
-%if ARCH_X86_64
- %define reg_dstride dstrideq
-%else
- %define reg_dstride r2
-%endif
- ;
- ; bottom edge extension
- %if ARCH_X86_64
- test bottomextq, bottomextq
- jz .top
- %else
- xor r1, r1
- cmp r1, r4m
- je .top
- %endif
- ;
- %if ARCH_X86_64
- mov srcq, dstq
- sub srcq, dstrideq
- xor r1, r1
- %else
- mov r3, dstq
- mov reg_dstride, dstridem
- sub r3, reg_dstride
- mov srcm, r3
- %endif
- ;
-.bottom_x_loop:
- %if ARCH_X86_64
- mova m0, [srcq+r1]
- lea r3, [dstq+r1]
- mov r4, bottomextq
- %else
- mov r3, srcm
- mova m0, [r3+r1]
- lea r3, [dstq+r1]
- mov r4, r4m
- %endif
- ;
-.bottom_y_loop:
- mova [r3], m0
- add r3, reg_dstride
- dec r4
- jg .bottom_y_loop
- add r1, mmsize
- cmp r1, bwq
- jl .bottom_x_loop
-
-.top:
- ; top edge extension
- test topextq, topextq
- jz .end
-%if ARCH_X86_64
- mov srcq, reg_blkm
-%else
- mov r3, reg_blkm
- mov reg_dstride, dstridem
-%endif
- mov dstq, dstm
- xor r1, r1
- ;
-.top_x_loop:
-%if ARCH_X86_64
- mova m0, [srcq+r1]
-%else
- mov r3, reg_blkm
- mova m0, [r3+r1]
-%endif
- lea r3, [dstq+r1]
- mov r4, topextq
- ;
-.top_y_loop:
- mova [r3], m0
- add r3, reg_dstride
- dec r4
- jg .top_y_loop
- add r1, mmsize
- cmp r1, bwq
- jl .top_x_loop
-
-.end:
- RET
-
-%undef reg_dstride
-%undef reg_blkm
-%undef reg_tmp
-
-cextern resize_filter
-
-%macro SCRATCH 3
-%if ARCH_X86_32
- mova [rsp+%3*mmsize], m%1
-%define m%2 [rsp+%3*mmsize]
-%else
- SWAP %1, %2
-%endif
-%endmacro
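-; SCRATCH m%1 -> m%2, stack slot %3: on x86_32 the register is spilled to the
-; stack and m%2 becomes a memory operand; on x86_64 it is simply renamed via
-; SWAP so everything stays in registers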
-
-INIT_XMM ssse3
-%if ARCH_X86_64
-cglobal resize, 0, 14, 16, dst, dst_stride, src, src_stride, \
- dst_w, h, src_w, dx, mx0
-%elif STACK_ALIGNMENT >= 16
-cglobal resize, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
- dst_w, h, src_w, dx, mx0
-%else
-cglobal resize, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
- dst_w, h, src_w, dx, mx0
-%endif
- movifnidn dstq, dstmp
- movifnidn srcq, srcmp
-%if STACK_ALIGNMENT >= 16
- movifnidn dst_wd, dst_wm
-%endif
-%if ARCH_X86_64
- movifnidn hd, hm
-%endif
- sub dword mx0m, 4<<14
- sub dword src_wm, 8
- movd m7, dxm
- movd m6, mx0m
- movd m5, src_wm
- pshufd m7, m7, q0000
- pshufd m6, m6, q0000
- pshufd m5, m5, q0000
-
-%if ARCH_X86_64
- DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
- LEA r7, $$
-%define base r7-$$
-%else
- DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
-%if STACK_ALIGNMENT >= 16
- LEA r6, $$
-%define base r6-$$
-%else
- LEA r4, $$
-%define base r4-$$
-%endif
-%endif
-
-%if ARCH_X86_64
- mova m12, [base+pw_m256]
- mova m11, [base+pd_63]
- mova m10, [base+pb_8x0_8x8]
-%else
-%define m12 [base+pw_m256]
-%define m11 [base+pd_63]
-%define m10 [base+pb_8x0_8x8]
-%endif
- pmaddwd m4, m7, [base+resize_mul] ; dx*[0,1,2,3]
- pslld m7, 2 ; dx*4
- pslld m5, 14
- paddd m6, m4 ; mx+[0..3]*dx
- SCRATCH 7, 15, 0
- SCRATCH 6, 14, 1
- SCRATCH 5, 13, 2
-
- ; m10 = pb_8x0_8x8, m11 = 0x3f (pd_63), m12 = pmulhrsw constant for x=(x+64)>>7
- ; m13 = (src_w-8)<<14, m14 = mx+[0..3]*dx, m15 = dx*4
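- ; (mx is tracked in 14-bit fixed point: psrad by 14 below recovers the
- ;  integer src_x, (mx >> 8) & 63 selects one of 64 filter phases, and adding
- ;  m15 = dx*4 advances mx by four output pixels per .loop_x iteration)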
-
-.loop_y:
- xor xd, xd
- mova m0, m14 ; per-line working version of mx
-
-.loop_x:
- pxor m1, m1
- pcmpgtd m1, m0
- pandn m1, m0
- psrad m2, m0, 8 ; filter offset (unmasked)
- pcmpgtd m3, m13, m1
- pand m1, m3
- pandn m3, m13
- por m1, m3
- psubd m3, m0, m1 ; pshufb offset
- psrad m1, 14 ; clipped src_x offset
- psrad m3, 14 ; pshufb edge_emu offset
- pand m2, m11 ; filter offset (masked)
-
- ; load source pixels
-%if ARCH_X86_64
- movd r8d, xm1
- pshuflw xm1, xm1, q3232
- movd r9d, xm1
- punpckhqdq xm1, xm1
- movd r10d, xm1
- psrlq xm1, 32
- movd r11d, xm1
- movq xm4, [srcq+r8]
- movq xm5, [srcq+r10]
- movhps xm4, [srcq+r9]
- movhps xm5, [srcq+r11]
-%else
- movd r3d, xm1
- pshufd xm1, xm1, q3312
- movd r1d, xm1
- pshuflw xm1, xm1, q3232
- movq xm4, [srcq+r3]
- movq xm5, [srcq+r1]
- movd r3d, xm1
- punpckhqdq xm1, xm1
- movd r1d, xm1
- movhps xm4, [srcq+r3]
- movhps xm5, [srcq+r1]
-%endif
-
- ; if no emulation is required, we don't need to shuffle or emulate edges
- ; this also saves 2 quasi-vpgatherdqs
- pxor m6, m6
- pcmpeqb m6, m3
-%if ARCH_X86_64
- pmovmskb r8d, m6
- cmp r8d, 0xffff
-%else
- pmovmskb r3d, m6
- cmp r3d, 0xffff
-%endif
- je .filter
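- ; (edge-emulation path: at least one of the four source positions was
- ;  clamped, so m3 holds nonzero pshufb offsets; the resize_shuf entries,
- ;  offset by pb_8x0_8x8 for the high halves, replicate edge pixels within
- ;  the qwords loaded above)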
-
-%if ARCH_X86_64
- movd r8d, xm3
- pshuflw xm3, xm3, q3232
- movd r9d, xm3
- punpckhqdq xm3, xm3
- movd r10d, xm3
- psrlq xm3, 32
- movd r11d, xm3
- movsxd r8, r8d
- movsxd r9, r9d
- movsxd r10, r10d
- movsxd r11, r11d
- movq xm6, [base+resize_shuf+4+r8]
- movq xm7, [base+resize_shuf+4+r10]
- movhps xm6, [base+resize_shuf+4+r9]
- movhps xm7, [base+resize_shuf+4+r11]
-%else
- movd r3d, xm3
- pshufd xm3, xm3, q3312
- movd r1d, xm3
- pshuflw xm3, xm3, q3232
- movq xm6, [base+resize_shuf+4+r3]
- movq xm7, [base+resize_shuf+4+r1]
- movd r3d, xm3
- punpckhqdq xm3, xm3
- movd r1d, xm3
- movhps xm6, [base+resize_shuf+4+r3]
- movhps xm7, [base+resize_shuf+4+r1]
-%endif
-
- paddb m6, m10
- paddb m7, m10
- pshufb m4, m6
- pshufb m5, m7
-
-.filter:
-%if ARCH_X86_64
- movd r8d, xm2
- pshuflw xm2, xm2, q3232
- movd r9d, xm2
- punpckhqdq xm2, xm2
- movd r10d, xm2
- psrlq xm2, 32
- movd r11d, xm2
- movq xm6, [base+resize_filter+r8*8]
- movq xm7, [base+resize_filter+r10*8]
- movhps xm6, [base+resize_filter+r9*8]
- movhps xm7, [base+resize_filter+r11*8]
-%else
- movd r3d, xm2
- pshufd xm2, xm2, q3312
- movd r1d, xm2
- pshuflw xm2, xm2, q3232
- movq xm6, [base+resize_filter+r3*8]
- movq xm7, [base+resize_filter+r1*8]
- movd r3d, xm2
- punpckhqdq xm2, xm2
- movd r1d, xm2
- movhps xm6, [base+resize_filter+r3*8]
- movhps xm7, [base+resize_filter+r1*8]
-%endif
-
- pmaddubsw m4, m6
- pmaddubsw m5, m7
- phaddw m4, m5
- phaddsw m4, m4
- pmulhrsw m4, m12 ; x=(x+64)>>7
- packuswb m4, m4
- movd [dstq+xq], m4
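- ; (each of the four output pixels above is an 8-tap horizontal filter:
- ;  pmaddubsw gives four partial sums per pixel and the two horizontal adds
- ;  reduce them to a single word before the (x+64)>>7 rounding and byte pack)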
-
- paddd m0, m15
- add xd, 4
-%if STACK_ALIGNMENT >= 16
- cmp xd, dst_wd
-%else
- cmp xd, dst_wm
-%endif
- jl .loop_x
-
-%if ARCH_X86_64
- add dstq, dst_strideq
- add srcq, src_strideq
- dec hd
-%else
- add dstq, dst_stridem
- add srcq, src_stridem
- dec dword r5m
-%endif
- jg .loop_y
- RET