ref: e4fbbbce672937dfa53a9783803776b6d8c76a44
parent: 8676fda34cfd0c0cb92b39b0a8ad203b906e062c
author: Henrik Gramner <gramner@twoorioles.com>
date: Wed Nov 7 16:42:38 EST 2018
Add blend AVX2 asm
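
For reference, the operation implemented here is a weighted blend of dst and
tmp with 6-bit mask weights, which the asm evaluates as pmaddubsw on
interleaved (64-m, m) pairs followed by a pmulhrsw rounding step with 512.
A scalar 8bpc sketch is given below; the function name and exact signature
are illustrative only, not the dav1d C reference:

    #include <stddef.h>
    #include <stdint.h>

    /* mask_stride is 0 (one row of per-column weights reused for every row),
     * 1 (one weight per row) or w (per-pixel weights); the asm has one code
     * path per case for each block width. */
    static void blend_sketch(uint8_t *dst, const ptrdiff_t dst_stride,
                             const uint8_t *tmp, const int w, const int h,
                             const uint8_t *mask, const ptrdiff_t mask_stride)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                const int m = mask[mask_stride == 1 ? 0 : x];
                dst[x] = (dst[x] * (64 - m) + tmp[x] * m + 32) >> 6;
            }
            dst += dst_stride;
            tmp += w;            /* tmp rows are packed with stride w */
            mask += mask_stride;
        }
    }
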
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -43,6 +43,9 @@
bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
+blend_shuf: db 0, 0, 0, 0, 1, 1, 1, 1
+
+pb_64: times 4 db 64
pw_8: times 2 dw 8
pw_26: times 2 dw 26
pw_34: times 2 dw 34
@@ -58,12 +61,13 @@
cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
-%macro BIDIR_JMP_TABLE 1-7 4, 8, 16, 32, 64, 128
- %xdefine %1_table (%%table - 2*4)
+%macro BIDIR_JMP_TABLE 1-* 4, 8, 16, 32, 64, 128
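+ ; %1 = function name, remaining args = supported block widths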
+ %xdefine %1_table (%%table - 2*%2)
+ %xdefine %%base %1_table
%xdefine %%prefix mangle(private_prefix %+ _%1)
%%table:
- %rep 6
- dd %%prefix %+ .w%2 - (%%table - 2*4)
+ %rep %0 - 1
+ dd %%prefix %+ .w%2 - %%base
%rotate 1
%endrep
%endmacro
@@ -72,6 +76,7 @@
BIDIR_JMP_TABLE w_avg_avx2
BIDIR_JMP_TABLE mask_avx2
BIDIR_JMP_TABLE w_mask_420_avx2
+BIDIR_JMP_TABLE blend_avx2, 2, 4, 8, 16, 32, 64, 128
%macro BASE_JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - %3)
@@ -3281,7 +3286,410 @@
jg .w128_loop
RET
-INIT_YMM avx2
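+; Each width below has three code paths selected on the mask stride (msq):
+; 0 = one row of weights reused for every row, 1 = one weight per row,
+; w = per-pixel weights. The blend itself is
+; dst = (dst*(64-m) + tmp*m + 32) >> 6, computed with pmaddubsw on
+; interleaved (64-m, m) pairs followed by pmulhrsw with pw_512.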
+cglobal blend, 3, 7, 6, dst, ds, tmp, w, h, mask, ms
+%define base r6-blend_avx2_table
+ lea r6, [blend_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movifnidn maskq, maskmp
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m4, [base+pb_64]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r6
+ mov msq, msmp
+ jmp wq
+.w2:
+ cmp msq, 1
+ jb .w2_s0
+ je .w2_s1
+.w2_s2:
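+ ; per-pixel mask: interleave (64-m, m) weight pairs to feed pmaddubsw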
+ movd xm1, [maskq]
+ movd xm0, [dstq+dsq*0]
+ pinsrw xm0, [dstq+dsq*1], 1
+ psubb xm2, xm4, xm1
+ punpcklbw xm2, xm1
+ movd xm1, [tmpq]
+ add maskq, 2*2
+ add tmpq, 2*2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_s2
+ RET
+.w2_s1:
+ movd xm1, [maskq]
+ movd xm0, [dstq+dsq*0]
+ psubb xm2, xm4, xm1
+ punpcklbw xm2, xm1
+ pinsrw xm0, [dstq+dsq*1], 1
+ movd xm1, [tmpq]
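+ ; duplicate each row's (64-m, m) pair so it covers both pixels of the row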
+ punpcklwd xm2, xm2
+ add maskq, 2
+ add tmpq, 2*2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_s1
+ RET
+.w2_s0:
+ vpbroadcastw xm0, [maskq]
+ psubb xm4, xm0
+ punpcklbw xm4, xm0
+.w2_s0_loop:
+ movd xm0, [dstq+dsq*0]
+ pinsrw xm0, [dstq+dsq*1], 1
+ movd xm1, [tmpq]
+ add tmpq, 2*2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm4
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_s0_loop
+ RET
+ALIGN function_align
+.w4:
+ cmp msq, 1
+ jb .w4_s0
+ je .w4_s1
+.w4_s4:
+ movq xm1, [maskq]
+ movd xm0, [dstq+dsq*0]
+ pinsrd xm0, [dstq+dsq*1], 1
+ psubb xm2, xm4, xm1
+ punpcklbw xm2, xm1
+ movq xm1, [tmpq]
+ add maskq, 4*2
+ add tmpq, 4*2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_s4
+ RET
+.w4_s1:
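+ ; blend_shuf replicates mask[0] over row 0's pixels and mask[1] over row 1's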
+ movq xm3, [blend_shuf]
+.w4_s1_loop:
+ movd xm1, [maskq]
+ movd xm0, [dstq+dsq*0]
+ pshufb xm1, xm3
+ psubb xm2, xm4, xm1
+ pinsrd xm0, [dstq+dsq*1], 1
+ punpcklbw xm2, xm1
+ movq xm1, [tmpq]
+ add maskq, 2
+ add tmpq, 4*2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_s1_loop
+ RET
+.w4_s0:
+ vpbroadcastd xm0, [maskq]
+ psubb xm4, xm0
+ punpcklbw xm4, xm0
+.w4_s0_loop:
+ movd xm0, [dstq+dsq*0]
+ pinsrd xm0, [dstq+dsq*1], 1
+ movq xm1, [tmpq]
+ add tmpq, 4*2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm4
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_s0_loop
+ RET
+ALIGN function_align
+.w8:
+ cmp msq, 1
+ jb .w8_s0
+ je .w8_s1
+.w8_s8:
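+ ; row 0 is kept in the upper 128-bit lane and row 1 in the lower lane,
+ ; matching the movhps/movq stores below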
+ movq xm1, [maskq+8*1]
+ vinserti128 m1, [maskq+8*0], 1
+ vpbroadcastq m2, [dstq+dsq*0]
+ movq xm0, [dstq+dsq*1]
+ vpblendd m0, m2, 0x30
+ psubb m2, m4, m1
+ punpcklbw m2, m1
+ movq xm1, [tmpq+8*1]
+ vinserti128 m1, [tmpq+8*0], 1
+ add maskq, 8*2
+ add tmpq, 8*2
+ punpcklbw m0, m1
+ pmaddubsw m0, m2
+ pmulhrsw m0, m5
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movhps [dstq+dsq*0], xm0
+ movq [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_s8
+ RET
+.w8_s1:
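+ ; build a shuffle so pshufb broadcasts mask[1] in the lower lane and
+ ; mask[0] in the upper lane (same row layout as the s8 path)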
+ vpbroadcastd m0, [blend_shuf+0]
+ vpbroadcastd xm3, [blend_shuf+4]
+ vpblendd m3, m0, 0xf0
+.w8_s1_loop:
+ vpbroadcastd m0, [maskq]
+ vpbroadcastq m1, [dstq+dsq*0]
+ pshufb m0, m3
+ psubb m2, m4, m0
+ punpcklbw m2, m0
+ movq xm0, [dstq+dsq*1]
+ vpblendd m0, m1, 0x30
+ movq xm1, [tmpq+8*1]
+ vinserti128 m1, [tmpq+8*0], 1
+ add maskq, 2
+ add tmpq, 8*2
+ punpcklbw m0, m1
+ pmaddubsw m0, m2
+ pmulhrsw m0, m5
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movhps [dstq+dsq*0], xm0
+ movq [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_s1_loop
+ RET
+.w8_s0:
+ vpbroadcastq m0, [maskq]
+ psubb m4, m0
+ punpcklbw m4, m0
+.w8_s0_loop:
+ vpbroadcastq m2, [dstq+dsq*0]
+ movq xm0, [dstq+dsq*1]
+ vpblendd m0, m2, 0x30
+ movq xm1, [tmpq+8*1]
+ vinserti128 m1, [tmpq+8*0], 1
+ add tmpq, 8*2
+ punpcklbw m0, m1
+ pmaddubsw m0, m4
+ pmulhrsw m0, m5
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movhps [dstq+dsq*0], xm0
+ movq [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_s0_loop
+ RET
+ALIGN function_align
+.w16:
+ cmp msq, 1
+ jb .w16_s0
+ WIN64_SPILL_XMM 7
+ je .w16_s1
+.w16_s16:
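+ ; two rows per iteration; punpckl/hbw split each row into two pmaddubsw halves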
+ mova m0, [maskq]
+ mova xm1, [dstq+dsq*0]
+ vinserti128 m1, [dstq+dsq*1], 1
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ mova m6, [tmpq]
+ add maskq, 16*2
+ add tmpq, 16*2
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_s16
+ RET
+.w16_s1:
+ vpbroadcastd xm6, [blend_shuf]
+ vpbroadcastd m0, [blend_shuf+4]
+ vpblendd m6, m0, 0xf0
+.w16_s1_loop:
+ vpbroadcastd m2, [maskq]
+ mova xm1, [dstq+dsq*0]
+ pshufb m2, m6
+ psubb m3, m4, m2
+ vinserti128 m1, [dstq+dsq*1], 1
+ punpcklbw m3, m2
+ mova m2, [tmpq]
+ add maskq, 2
+ add tmpq, 16*2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_s1_loop
+ RET
+.w16_s0:
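+ ; reached before the WIN64_SPILL_XMM 7 above, so redo the stack accounting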
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 6
+ vbroadcasti128 m0, [maskq]
+ psubb m4, m0
+ punpcklbw m3, m4, m0
+ punpckhbw m4, m0
+.w16_s0_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti128 m1, [dstq+dsq*1], 1
+ mova m2, [tmpq]
+ add tmpq, 16*2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_s0_loop
+ RET
+ALIGN function_align
+.w32:
+ mov wd, 32
+ jmp .w32_start
+.w64:
+ mov wd, 64
+ jmp .w32_start
+.w128:
+ mov wd, 128
+.w32_start:
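+ ; w32/w64/w128 share the loops below and are processed 32 pixels at a time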
+ WIN64_SPILL_XMM 7
+ cmp msq, 1
+ jb .w32_s0
+ je .w32_s1
+ sub dsq, wq
+.w32_s32:
+ mov r6d, wd
+.w32_s32_loop:
+ mova m0, [maskq]
+ mova m1, [dstq]
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ mova m6, [tmpq]
+ add maskq, 32
+ add tmpq, 32
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, 32
+ sub r6d, 32
+ jg .w32_s32_loop
+ add dstq, dsq
+ dec hd
+ jg .w32_s32
+ RET
+.w32_s1:
+ sub dsq, wq
+.w32_s1_loop0:
+ vpbroadcastb m0, [maskq]
+ mov r6d, wd
+ inc maskq
+ psubb m3, m4, m0
+ punpcklbw m3, m0
+.w32_s1_loop:
+ mova m1, [dstq]
+ mova m2, [tmpq]
+ add tmpq, 32
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, 32
+ sub r6d, 32
+ jg .w32_s1_loop
+ add dstq, dsq
+ dec hd
+ jg .w32_s1_loop0
+ RET
+.w32_s0:
+%if WIN64
+ PUSH r7
+ PUSH r8
+ %define regs_used 9
+%endif
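+ ; mask stride 0 reuses one row of per-column weights for every row, so
+ ; walk the block one 32-pixel column at a time over the full height;
+ ; r6d packs h in its low byte and the remaining column count above it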
+ lea r6d, [hq+wq*8-256]
+ mov r7, dstq
+ mov r8, tmpq
+.w32_s0_loop0:
+ mova m0, [maskq]
+ add maskq, 32
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+.w32_s0_loop:
+ mova m1, [dstq]
+ mova m6, [tmpq]
+ add tmpq, wq
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .w32_s0_loop
+ add r7, 32
+ add r8, 32
+ mov dstq, r7
+ mov tmpq, r8
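+ ; reload h from the low byte of the packed counter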
+ mov hb, r6b
+ sub r6d, 256
+ jg .w32_s0_loop0
+ RET
+
cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
bottomext, rightext
; we assume that the buffer (stride) is larger than width, so we can
@@ -3475,4 +3883,5 @@
.end:
RET
+
%endif ; ARCH_X86_64
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -54,6 +54,7 @@
decl_w_avg_fn(dav1d_w_avg_avx2);
decl_mask_fn(dav1d_mask_avx2);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
+decl_blend_fn(dav1d_blend_avx2);
decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
@@ -96,6 +97,7 @@
c->w_avg = dav1d_w_avg_avx2;
c->mask = dav1d_mask_avx2;
c->w_mask[2] = dav1d_w_mask_420_avx2;
+ c->blend = dav1d_blend_avx2;
c->warp8x8 = dav1d_warp_affine_8x8_avx2;
c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
--- a/tests/checkasm/mc.c
+++ b/tests/checkasm/mc.c
@@ -237,12 +237,12 @@
}
static void check_blend(Dav1dMCDSPContext *const c) {
- ALIGN_STK_32(pixel, tmp, 128 * 128,);
- ALIGN_STK_32(pixel, c_dst, 128 * 128,);
- ALIGN_STK_32(pixel, a_dst, 128 * 128,);
- ALIGN_STK_32(uint8_t, mask, 128 * 128,);
+ ALIGN_STK_32(pixel, tmp, 128 * 32,);
+ ALIGN_STK_32(pixel, c_dst, 128 * 32,);
+ ALIGN_STK_32(pixel, a_dst, 128 * 32,);
+ ALIGN_STK_32(uint8_t, mask, 128 * 32,);
- for (int i = 0; i < 128 * 128; i++) {
+ for (int i = 0; i < 128 * 32; i++) {
tmp[i] = rand() & ((1 << BITDEPTH) - 1);
mask[i] = rand() % 65;
}
@@ -252,9 +252,11 @@
for (int w = 2; w <= 128; w <<= 1) {
const ptrdiff_t dst_stride = w * sizeof(pixel);
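+ // keep w * h within the 128 * 32 scratch buffers above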
+ const int h_min = (w == 128) ? 4 : 2;
+ const int h_max = (w > 32) ? 32 : (w == 2) ? 64 : 128;
for (int ms = 0; ms <= w; ms += ms ? w - 1 : 1)
if (check_func(c->blend, "blend_w%d_ms%d_%dbpc", w, ms, BITDEPTH))
- for (int h = imax(w / 8, 2); h <= imin(w * 8, 128); h <<= 1) {
+ for (int h = h_min; h <= h_max; h <<= 1) {
for (int i = 0; i < w * h; i++)
c_dst[i] = a_dst[i] = rand() & ((1 << BITDEPTH) - 1);