ref: ee58d65dd59d424f840c04079ef3c3153f9d9576
parent: aaba9f8e770eefd4daede06bf01bf80c04a4f9b3
author: Francois Cartegnie <fcvlcdev@free.fr>
date: Mon Dec 24 10:54:49 EST 2018
Add SSSE3 put_bilin
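
For reference, here is a scalar model of what the new SSSE3 code computes
(illustrative only: the function and variable names below are not dav1d's,
and dav1d's real C fallback is not reproduced here). mx and my are the
4-bit subpel fractions in 0..15, w and h are at most 128:

    #include <stdint.h>
    #include <stddef.h>

    static void put_bilin_scalar(uint8_t *dst, ptrdiff_t dst_stride,
                                 const uint8_t *src, ptrdiff_t src_stride,
                                 int w, int h, int mx, int my)
    {
        if (!mx && !my) {               /* .put: plain copy */
            for (int y = 0; y < h; y++, dst += dst_stride, src += src_stride)
                for (int x = 0; x < w; x++)
                    dst[x] = src[x];
        } else if (!my) {               /* .h: horizontal only */
            for (int y = 0; y < h; y++, dst += dst_stride, src += src_stride)
                for (int x = 0; x < w; x++)
                    dst[x] = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4;
        } else if (!mx) {               /* .v: vertical only */
            for (int y = 0; y < h; y++, dst += dst_stride, src += src_stride)
                for (int x = 0; x < w; x++)
                    dst[x] = ((16 - my) * src[x] +
                              my * src[x + src_stride] + 8) >> 4;
        } else {                        /* .hv: h pass kept at 16x precision,
                                         * then v pass with 8-bit rounding */
            int mid[2][128], *top = mid[0], *bot = mid[1];
            for (int x = 0; x < w; x++)
                top[x] = (16 - mx) * src[x] + mx * src[x + 1];
            for (int y = 0; y < h; y++, dst += dst_stride) {
                src += src_stride;
                for (int x = 0; x < w; x++)
                    bot[x] = (16 - mx) * src[x] + mx * src[x + 1];
                for (int x = 0; x < w; x++)
                    dst[x] = (16 * top[x] + my * (bot[x] - top[x]) + 128) >> 8;
                int *t = top;
                top = bot;
                bot = t;
            }
        }
    }

The SSSE3 code computes the same values, using pmaddubsw on packed
(16 - mx, mx) byte pairs for the horizontal and pure-vertical taps, and
pmulhw/pmulhrsw for the hv blend and rounding.
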
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -38,6 +38,7 @@
decl_mc_fn(dav1d_put_8tap_sharp_regular_avx2);
decl_mc_fn(dav1d_put_8tap_sharp_smooth_avx2);
decl_mc_fn(dav1d_put_bilin_avx2);
+decl_mc_fn(dav1d_put_bilin_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
@@ -83,6 +84,8 @@
return;
#if BITDEPTH == 8
+ init_mc_fn (FILTER_2D_BILINEAR, bilin, ssse3);
+
c->avg = dav1d_avg_ssse3;
c->w_avg = dav1d_w_avg_ssse3;
c->mask = dav1d_mask_ssse3;
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -45,8 +45,11 @@
db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
-blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
+bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+
pb_64: times 16 db 64
pw_8: times 8 dw 8
pw_26: times 8 dw 26
@@ -76,9 +79,649 @@
BIDIR_JMP_TABLE blend_v_ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16
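+; Emit a table of 16-bit offsets from the %1_%2 base label to its per-width
+; entry points; the table address is biased by the smallest width (%3) so
+; that indexing with wq*2 (wq = tzcnt(w)) lands on the first entry for w = %3.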
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put)
+
+BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
+
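+; Same idea for the filtered paths: %4 is a bitmask of which tables to emit
+; (1 = .h_w*, 2 = .v_w*, 4 = .hv_w*), so 7 emits all three; the offsets are
+; again relative to the %1_%3 base label.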
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
+
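+; table_offset() gives the distance between one of the jump tables above and
+; its base label (put_ssse3 here), so a single base register (t0) can both
+; index the tables and turn their 16-bit entries back into full addresses.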
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
SECTION .text
INIT_XMM ssse3
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 1
+cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
+%define base t0-put_ssse3
+%else
+DECLARE_REG_TMP 7
+%define base 0
+cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+%endif
+;
+%macro RESTORE_DSQ_32 1
+ %if ARCH_X86_32
+ mov %1, dsm ; restore dsq
+ %endif
+%endmacro
+;
+ movifnidn mxyd, r6m ; mx
+ LEA t0, put_ssse3
+ tzcnt wd, wm
+ mov hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [t0+wq*2+table_offset(put,)]
+ add wq, t0
+ lea r6, [ssq*3]
+ RESTORE_DSQ_32 t0
+ jmp wq
+.put_w2:
+ movzx r4d, word [srcq+ssq*0]
+ movzx r6d, word [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r4w
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r4d, [srcq+ssq*0]
+ mov r6d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r4d
+ mov [dstq+dsq*1], r6d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movq m0, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq [dstq+dsq*0], m0
+ movq [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ lea r4, [dsq*3]
+.put_w16_in:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+r6 ]
+ lea srcq, [srcq+ssq*4]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ mova [dstq+dsq*2], m2
+ mova [dstq+r4 ], m3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .put_w16_in
+ RET
+.put_w32:
+ movu m0, [srcq+ssq*0+16*0]
+ movu m1, [srcq+ssq*0+16*1]
+ movu m2, [srcq+ssq*1+16*0]
+ movu m3, [srcq+ssq*1+16*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+16*0], m0
+ mova [dstq+dsq*0+16*1], m1
+ mova [dstq+dsq*1+16*0], m2
+ mova [dstq+dsq*1+16*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ add srcq, ssq
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ mova [dstq+16*6], m2
+ mova [dstq+16*7], m3
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+ ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
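+ ; imul by 0xff01 plus the (16 << 8) add packs (16 - mx, mx) into every word
+ ; of m5, so one pmaddubsw over the {src[x + 1], src[x]} byte pairs produced
+ ; by the bilin_h shuffles computes (16 - mx) * src[x] + mx * src[x + 1] per
+ ; pixel, and pmulhrsw against pw_2048 performs the final (+ 8) >> 4 rounding.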
+ imul mxyd, 0xff01
+ mova m4, [base+bilin_h_shuf8]
+ mova m0, [base+bilin_h_shuf4]
+ WIN64_SPILL_XMM 7
+ add mxyd, 16 << 8
+ movd m5, mxyd
+ mov mxyd, r7m ; my
+ pshuflw m5, m5, q0000
+ punpcklqdq m5, m5
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)]
+ mova m6, [base+pw_2048]
+ add wq, t0
+ RESTORE_DSQ_32 t0
+ jmp wq
+.h_w2:
+ pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
+.h_w2_loop:
+ movd m0, [srcq+ssq*0]
+ movd m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq m0, m1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ pmulhrsw m0, m6
+ packuswb m0, m0
+ movd r6d, m0
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movq m4, [srcq+ssq*0]
+ movhps m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m4, m0
+ pmaddubsw m4, m5
+ pmulhrsw m4, m6
+ packuswb m4, m4
+ movd [dstq+dsq*0], m4
+ pshufd m4, m4, q0101
+ movd [dstq+dsq*1], m4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ packuswb m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w16
+ RET
+.h_w32:
+ movu m0, [srcq+mmsize*0+8*0]
+ movu m1, [srcq+mmsize*0+8*1]
+ movu m2, [srcq+mmsize*1+8*0]
+ movu m3, [srcq+mmsize*1+8*1]
+ add srcq, ssq
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ pmulhrsw m2, m6
+ pmulhrsw m3, m6
+ packuswb m0, m1
+ packuswb m2, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m2
+ add dstq, dsq
+ dec hd
+ jg .h_w32
+ RET
+.h_w64:
+ mov r6, -16*3
+.h_w64_loop:
+ movu m0, [srcq+r6+16*3+8*0]
+ movu m1, [srcq+r6+16*3+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ packuswb m0, m1
+ mova [dstq+r6+16*3], m0
+ add r6, 16
+ jle .h_w64_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ mov r6, -16*7
+.h_w128_loop:
+ movu m0, [srcq+r6+16*7+8*0]
+ movu m1, [srcq+r6+16*7+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ packuswb m0, m1
+ mova [dstq+r6+16*7], m0
+ add r6, 16
+ jle .h_w128_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)]
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 8
+ imul mxyd, 0xff01
+ mova m7, [base+pw_2048]
+ add mxyd, 16 << 8
+ add wq, t0
+ movd m6, mxyd
+ pshuflw m6, m6, q0000
+ punpcklqdq m6, m6
+ RESTORE_DSQ_32 t0
+ jmp wq
+.v_w2:
+ movd m0, [srcq+ssq*0]
+.v_w2_loop:
+ pinsrw m0, [srcq+ssq*1], 1 ; 0 1
+ lea srcq, [srcq+ssq*2]
+ pshuflw m2, m0, q2301
+ pinsrw m0, [srcq+ssq*0], 0 ; 2 1
+ punpcklbw m1, m0, m2
+ pmaddubsw m1, m6
+ pmulhrsw m1, m7
+ packuswb m1, m1
+ movd r6d, m1
+ mov [dstq+dsq*1], r6w
+ shr r6d, 16
+ mov [dstq+dsq*0], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd m0, [srcq+ssq*0]
+.v_w4_loop:
+ movd m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq m2, m0, m1 ; 0 1
+ movd m0, [srcq+ssq*0]
+ punpckldq m1, m0 ; 1 2
+ punpcklbw m1, m2
+ pmaddubsw m1, m6
+ pmulhrsw m1, m7
+ packuswb m1, m1
+ movd [dstq+dsq*0], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ ;
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq m0, [srcq+ssq*0]
+.v_w8_loop:
+ movddup m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq m3, m0, m2 ; 0 1 m2qh:m0ql
+ movddup m0, [srcq+ssq*0]
+ punpcklqdq m4, m2, m0 ; 1 2 m0qh:m2ql
+ punpcklbw m1, m4, m3
+ punpckhbw m4, m3
+ pmaddubsw m1, m6
+ pmaddubsw m4, m6
+ pmulhrsw m1, m7
+ pmulhrsw m4, m7
+ packuswb m1, m4
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+ ;
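+; Vertically filter one 16-pixel-wide column, two output rows per iteration;
+; the newest source row is left in m0 so the next pair can reuse it.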
+%macro PUT_BILIN_V_W16 0
+ movu m0, [srcq+ssq*0]
+%%loop:
+ movu m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m1, m4, m0
+ punpckhbw m3, m4, m0
+ movu m0, [srcq+ssq*0]
+ punpcklbw m2, m0, m4
+ pmaddubsw m1, m6
+ pmaddubsw m3, m6
+ pmulhrsw m1, m7
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ mova [dstq+dsq*0], m1
+ punpckhbw m3, m0, m4
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmulhrsw m2, m7
+ pmulhrsw m3, m7
+ packuswb m2, m3
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg %%loop
+%endmacro
+ ;
+.v_w16:
+ PUT_BILIN_V_W16
+ RET
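+; Widths above 16 are handled as 16-pixel columns: t0d carries the row count
+; in its low word and the number of remaining columns minus one in its high
+; word, so h is reloaded from t0w after each column.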
+.v_w16gt:
+ mov r4, dstq
+ mov r6, srcq
+.v_w16gt_loop:
+%if ARCH_X86_32
+ mov bakm, t0q
+ RESTORE_DSQ_32 t0
+ PUT_BILIN_V_W16
+ mov t0q, bakm
+%else
+ PUT_BILIN_V_W16
+%endif
+ mov hw, t0w
+ add r4, mmsize
+ add r6, mmsize
+ mov dstq, r4
+ mov srcq, r6
+ sub t0d, 1<<16
+ jg .v_w16gt
+ RET
+.v_w32:
+ lea t0d, [hq+(1<<16)]
+ jmp .v_w16gt
+.v_w64:
+ lea t0d, [hq+(3<<16)]
+ jmp .v_w16gt
+.v_w128:
+ lea t0d, [hq+(7<<16)]
+ jmp .v_w16gt
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+ ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
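+ ; my is kept as my << 11: pmulhw of the doubled difference (paddw m1, m1)
+ ; by my << 11 gives (my * diff) >> 4, and pmulhrsw against pw_2048 adds the
+ ; final (+ 8) >> 4 rounding, matching the second form above.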
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11 ; can't shift by 12 due to signed overflow
+ mova m7, [base+pw_2048]
+ movd m6, mxyd
+ add wq, t0
+ pshuflw m6, m6, q0000
+ punpcklqdq m6, m6
+ jmp wq
+.hv_w2:
+ RESTORE_DSQ_32 t0
+ movd m0, [srcq+ssq*0]
+ pshufd m0, m0, q0000 ; src[x - src_stride]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w2_loop:
+ movd m1, [srcq+ssq*1] ; src[x]
+ lea srcq, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*0] ; src[x + src_stride]
+ pshufd m1, m1, q3120
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 _ 2 _
+ shufps m2, m0, m1, q1032 ; 0 _ 1 _
+ mova m0, m1
+ psubw m1, m2 ; src[x + src_stride] - src[x]
+ paddw m1, m1
+ pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x])) >> 4
+ paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4)
+ pmulhrsw m1, m7
+ packuswb m1, m1
+ pshuflw m1, m1, q2020
+ movd r6d, m1
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova m4, [base+bilin_h_shuf4]
+ RESTORE_DSQ_32 t0
+ movddup m0, [srcq+ssq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w4_loop:
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*0]
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2
+ shufps m2, m0, m1, q1032 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ packuswb m1, m1
+ movd [dstq+dsq*0], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ RESTORE_DSQ_32 t0
+ movu m0, [srcq+ssq*0+8*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu m2, [srcq+ssq*1+8*0]
+ lea srcq, [srcq+ssq*2]
+ movu m3, [srcq+ssq*0+8*0]
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ psubw m1, m2, m0
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m0
+ pmaddubsw m0, m3, m5
+ psubw m3, m0, m2
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m2
+ pmulhrsw m1, m7
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+ ;
+ ; 32bit has ssq, dsq free
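+ ; Both 8-pixel halves of a 16-pixel column stay in flight: on x86-32 (only
+ ; 8 XMM registers) the second half's result is spilled to [dstq], which the
+ ; final store overwrites anyway, while x86-64 keeps it in m8 (xmm8 is
+ ; callee-saved on WIN64, hence the movaps spill and restore).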
+%macro PUT_BILIN_HV_W16 0
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ %if WIN64
+ movaps r4m, xmm8
+ %endif
+%%loop:
+%if ARCH_X86_32
+ %define m3back [dstq]
+ %define dsqval dsm
+%else
+ %define m3back m8
+ %define dsqval dsq
+%endif
+ add srcq, ssq
+ movu m2, [srcq+8*1]
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m3, m2, m1
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m1
+ mova m1, m2
+ pmulhrsw m3, m7
+ mova m3back, m3
+ movu m2, [srcq+8*0]
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m3, m2, m0
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m0
+ mova m0, m2
+ pmulhrsw m3, m7
+ packuswb m3, m3back
+ mova [dstq], m3
+ add dstq, dsqval
+ dec hd
+ jg %%loop
+ %if WIN64
+ movaps xmm8, r4m
+ %endif
+ %undef m3back
+ %undef dsqval
+%endmacro
+ ;
+.hv_w16:
+ PUT_BILIN_HV_W16
+ RET
+.hv_w16gt:
+ mov r4, dstq
+ mov r6, srcq
+.hv_w16gt_loop:
+ PUT_BILIN_HV_W16
+ mov hw, t0w
+ add r4, mmsize
+ add r6, mmsize
+ mov dstq, r4
+ mov srcq, r6
+ sub t0d, 1<<16
+ jg .hv_w16gt_loop
+ RET
+.hv_w32:
+ lea t0d, [hq+(1<<16)]
+ jmp .hv_w16gt
+.hv_w64:
+ lea t0d, [hq+(3<<16)]
+ jmp .hv_w16gt
+.hv_w128:
+ lea t0d, [hq+(7<<16)]
+ jmp .hv_w16gt
%if WIN64
DECLARE_REG_TMP 6, 4