ref: dab82163cceb61a2b734189488f5da9e90e98f6f
parent: bf8d64004d1f830a8e739a9d0a69781d7a393665
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Tue Mar 24 12:12:40 EDT 2020
x86: add AVX2 SIMD for generate_grain_uv_{422,444} gen_grain_uv_ar0_8bpc_420_c: 72275.4 gen_grain_uv_ar0_8bpc_420_avx2: 7253.4 gen_grain_uv_ar0_8bpc_422_c: 111742.9 gen_grain_uv_ar0_8bpc_422_avx2: 13704.1 gen_grain_uv_ar0_8bpc_444_c: 205688.5 gen_grain_uv_ar0_8bpc_444_avx2: 25007.5 gen_grain_uv_ar1_8bpc_420_c: 100682.5 gen_grain_uv_ar1_8bpc_420_avx2: 18434.4 gen_grain_uv_ar1_8bpc_422_c: 167931.4 gen_grain_uv_ar1_8bpc_422_avx2: 37817.9 gen_grain_uv_ar1_8bpc_444_c: 323812.2 gen_grain_uv_ar1_8bpc_444_avx2: 74049.6 gen_grain_uv_ar2_8bpc_420_c: 159545.7 gen_grain_uv_ar2_8bpc_420_avx2: 23994.0 gen_grain_uv_ar2_8bpc_422_c: 295959.9 gen_grain_uv_ar2_8bpc_422_avx2: 48103.5 gen_grain_uv_ar2_8bpc_444_c: 571862.2 gen_grain_uv_ar2_8bpc_444_avx2: 93044.6 gen_grain_uv_ar3_8bpc_420_c: 243445.9 gen_grain_uv_ar3_8bpc_420_avx2: 27698.3 gen_grain_uv_ar3_8bpc_422_c: 458189.9 gen_grain_uv_ar3_8bpc_422_avx2: 54183.1 gen_grain_uv_ar3_8bpc_444_c: 883627.3 gen_grain_uv_ar3_8bpc_444_avx2: 103296.7 Also contains slight fixes to generate_grain_uv.ar0 to not pack before adding the current grain value. Fixes overflows in e.g. seed=1115072968.
--- a/src/x86/film_grain.asm
+++ b/src/x86/film_grain.asm
@@ -60,6 +60,8 @@
ALIGN 4
JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422_avx2, 0, 1, 2, 3 ; read by the "jmp r5" dispatch of generate_grain_uv_fn 422, indexed by FGData.ar_coeff_lag
+JMP_TABLE generate_grain_uv_444_avx2, 0, 1, 2, 3 ; same, for the 444 instantiation
struc FGData
.seed: resd 1
@@ -413,8 +415,9 @@
jg .y_loop_ar3
RET
+%macro generate_grain_uv_fn 3 ; %1 = ss_name (suffix of the function and jump-table names), %2 = ss_x, %3 = ss_y (0/1 flags; the macro is invoked with 420,1,1 / 422,1,0 / 444,0,0)
INIT_XMM avx2
-cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
+cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv
lea r4, [pb_mask]
%define base r4-pb_mask
movq xm1, [base+rnd_next_upperbit_mask]
@@ -428,11 +431,17 @@
pxor xm0, xm9
vpbroadcastd xm9, [base+pd_m65536]
lea r6, [gaussian_sequence]
- mov r7d, 38
+%if %2 ; ss_x: fill row by row (.loop_y/.loop_x)
+ mov r7d, 73-35*%3 ; row counter: 38 when ss_y is set (the old constant), otherwise 73
add bufq, 44
.loop_y:
mov r5, -44
.loop_x:
+%else ; no ss_x: a single flat loop over 73*82 bytes
+ mov r5, -73*82 ; negative byte offset, counted up towards zero
+ sub bufq, r5 ; bufq += 73*82, so [bufq+r5] starts at the original bufq
+.loop:
+%endif
pand xm2, xm0, xm1
psrlw xm3, xm2, 10
por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
@@ -455,15 +464,19 @@
packsswb xm2, xm2
movd [bufq+r5], xm2
add r5, 4
+%if %2
jl .loop_x
add bufq, 82
dec r7d
jg .loop_y
+%else
+ jl .loop ; keep storing 4 bytes per iteration while the offset is still negative
+%endif
; auto-regression code
movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
- movsxd r5, [base+generate_grain_uv_420_avx2_table+r5*4]
- lea r5, [r5+base+generate_grain_uv_420_avx2_table]
+ movsxd r5, [base+generate_grain_uv_%1_avx2_table+r5*4] ; r5 = 32-bit table entry selected by ar_coeff_lag, from this layout's own table
+ lea r5, [r5+base+generate_grain_uv_%1_avx2_table] ; entry is relative to the table, so add the table address before jumping
jmp r5
.ar0:
@@ -475,63 +488,126 @@
movd xm3, [base+hmul_bits+shiftq*2]
DEFINE_ARGS buf, bufy, h
pmovsxbw xm4, xm4
+%if %2 ; m7/m6 are only used by the pmaddubsw/pmulhrsw steps of the ss_x paths below
vpbroadcastd m7, [pb_1]
- vpbroadcastw m6, [hmul_bits+4]
+ vpbroadcastw m6, [hmul_bits+2+%3*2] ; hmul_bits word at byte offset 4 when ss_y is set (as before), at offset 2 otherwise
+%endif
vpbroadcastw m4, xm4
vpbroadcastw m3, xm3
- sub bufq, 82*38+82-(82*3+41)
+ pxor m12, m12 ; all-zero register, compared against with pcmpgtb to build byte sign masks
+%if %2
+ sub bufq, 82*(73-35*%3)+82-(82*3+41) ; equals the old 82*38+82-(82*3+41) when ss_y=1; uses 73 rows when ss_y=0
+%else
+ sub bufq, 82*70-3 ; no ss_x: NOTE(review) assuming bufq is still start+73*82 from the fill loop, this lands on start+82*3+3 - confirm against the lines hidden by the hunk gap
+%endif
add bufyq, 3+82*3
- mov hd, 35
+ mov hd, 70-35*%3 ; row count: 35 when ss_y is set, otherwise 70
.y_loop_ar0:
+%if %2 ; ss_x: bufy is read at twice the width of buf and reduced with pmaddubsw
; first 32 pixels
movu xm8, [bufyq]
+%if %3 ; ss_y: also load the next bufy row (+82)
movu xm9, [bufyq+82]
+%endif
movu xm10, [bufyq+16]
+%if %3
movu xm11, [bufyq+82+16]
+%endif
vinserti128 m8, [bufyq+32], 1
+%if %3
vinserti128 m9, [bufyq+82+32], 1
+%endif
vinserti128 m10, [bufyq+48], 1
+%if %3
vinserti128 m11, [bufyq+82+48], 1
+%endif
pmaddubsw m8, m7, m8
+%if %3
pmaddubsw m9, m7, m9
+%endif
pmaddubsw m10, m7, m10
+%if %3 ; ss_y: fold the second row's pair sums into the first row's
pmaddubsw m11, m7, m11
paddw m8, m9
paddw m10, m11
+%endif
pmulhrsw m8, m6
pmulhrsw m10, m6
+%else ; no ss_x: bufy is read 1:1, 32 bytes per .x_loop_ar0 iteration
+ xor r3d, r3d ; r3 = byte offset within the current row
+ ; first 32x2 pixels
+.x_loop_ar0:
+ movu m8, [bufyq+r3]
+ pcmpgtb m9, m12, m8 ; m9 = 0xff in every byte where the bufy value is negative
+ punpckhbw m10, m8, m9 ; m10 = high 8 bytes of each lane, sign-extended to words
+ punpcklbw m8, m9 ; m8 = low 8 bytes of each lane, sign-extended to words
+%endif
pmullw m8, m4
pmullw m10, m4
pmulhrsw m8, m3
pmulhrsw m10, m3
- packsswb m8, m10
+%if %2
movu m0, [bufq]
- punpckhbw m1, m0, m8
- punpcklbw m0, m8
- pmaddubsw m1, m7, m1
- pmaddubsw m0, m7, m0
- packsswb m0, m1
+%else
+ movu m0, [bufq+r3]
+%endif
+ pcmpgtb m1, m12, m0 ; sign mask of the current grain bytes
+ punpckhbw m9, m0, m1 ; widen the grain to words, same low/high split as m8/m10
+ punpcklbw m0, m1
+ paddw m0, m8 ; add the scaled bufy term at 16-bit width
+ paddw m9, m10
+ packsswb m0, m9 ; saturate to int8 only once, after the addition
+%if %2
movu [bufq], m0
+%else
+ movu [bufq+r3], m0
+ add r3d, 32
+ cmp r3d, 64
+ jl .x_loop_ar0 ; two iterations per row: offsets 0 and 32
+%endif
- ; last 6 pixels
+ ; last 6/12 pixels
movu xm8, [bufyq+32*2]
+%if %2
+%if %3
movu xm9, [bufyq+32*2+82]
+%endif
pmaddubsw xm8, xm7, xm8
+%if %3
pmaddubsw xm9, xm7, xm9
paddw xm8, xm9
+%endif
pmulhrsw xm8, xm6
pmullw xm8, xm4
pmulhrsw xm8, xm3
- packsswb xm8, xm8
movq xm0, [bufq+32]
- punpcklbw xm8, xm0
- pmaddubsw xm8, xm7, xm8
+ pcmpgtb xm9, xm12, xm0 ; sign mask of the 8 grain bytes just loaded
+ punpcklbw xm9, xm0, xm9 ; xm9 = those grain bytes sign-extended to words
+ paddw xm8, xm9 ; add at 16-bit width; the packsswb below does the only saturation
packsswb xm8, xm8
vpblendw xm0, xm8, xm0, 1000b
movq [bufq+32], xm0
+%else ; no ss_x tail: 16 bytes at offset 64, of which the low 12 are replaced
+ pcmpgtb xm9, xm12, xm8 ; sign mask of the bufy bytes
+ punpckhbw xm10, xm8, xm9 ; widen high 8 bufy bytes to words
+ punpcklbw xm8, xm9 ; widen low 8 bufy bytes to words
+ pmullw xm10, xm4
+ pmullw xm8, xm4
+ pmulhrsw xm10, xm3
+ pmulhrsw xm8, xm3
+ movu xm0, [bufq+64]
+ pcmpgtb xm9, xm12, xm0 ; sign mask of the grain bytes
+ punpcklbw xm1, xm0, xm9 ; xm1 = low 8 grain bytes as words
+ punpckhbw xm9, xm0, xm9 ; xm9 = high 8 grain bytes as words
+ paddw xm1, xm8
+ paddw xm9, xm10
+ packsswb xm1, xm9 ; saturate back to int8 after the add
+ vpblendw xm0, xm1, xm0, 11000000b ; keep words 6-7 (the last 4 bytes) from the original buf contents
+ movu [bufq+64], xm0
+%endif
add bufq, 82
- add bufyq, 82*2
+ add bufyq, 82<<%3 ; bufy advance per output row: 164 when ss_y is set, otherwise 82
dec hd
jg .y_loop_ar0
RET
@@ -549,27 +625,43 @@
pshufd xm5, xm4, q1111
pshufd xm4, xm4, q0000
pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd
+%if %2 ; xm7/xm6 are only needed by the ss_x bufy reduction in the loop
vpbroadcastd xm7, [pb_1]
- vpbroadcastw xm6, [hmul_bits+4]
+ vpbroadcastw xm6, [hmul_bits+2+%3*2] ; hmul_bits word at byte offset 4 when ss_y is set (as before), at offset 2 otherwise
+%endif
vpbroadcastd xm3, xm3
- sub bufq, 82*38+44-(82*3+41)
+%if %2
+ sub bufq, 82*(73-35*%3)+44-(82*3+41) ; equals the old 82*38+44-(82*3+41) when ss_y=1; uses 73 rows when ss_y=0
+%else
+ sub bufq, 82*70-(82-3) ; no ss_x: rewind constant for the flat 73*82 fill (evaluates to 82*70-79)
+%endif
add bufyq, 79+82*3
- mov hd, 35
+ mov hd, 70-35*%3 ; row count: 35 when ss_y is set, otherwise 70
mov mind, -128
mov maxd, 127
.y_loop_ar1:
- mov xq, -38
+ mov xq, -(76>>%2) ; negative column counter: -38 when ss_x is set, otherwise -76
movsx val3d, byte [bufq+xq-1]
.x_loop_ar1:
pmovsxbw xm0, [bufq+xq-82-1] ; top/left
+%if %2 ; ss_x: bufy is indexed at xq*2
movq xm8, [bufyq+xq*2]
+%if %3 ; ss_y: also read the next bufy row (+82)
movq xm9, [bufyq+xq*2+82]
+%endif
+%endif
psrldq xm2, xm0, 2 ; top
psrldq xm1, xm0, 4 ; top/right
+%if %2
pmaddubsw xm8, xm7, xm8
+%if %3
pmaddubsw xm9, xm7, xm9
paddw xm8, xm9
+%endif
pmulhrsw xm8, xm6
+%else
+ pmovsxbw xm8, [bufyq+xq] ; no ss_x: take bufy bytes 1:1, sign-extended to words (no pmaddubsw/pmulhrsw step)
+%endif
punpcklwd xm0, xm2
punpcklwd xm1, xm8
pmaddwd xm0, xm4
@@ -598,7 +690,7 @@
.x_loop_ar1_end:
add bufq, 82
- add bufyq, 82*2
+ add bufyq, 82<<%3 ; bufy advance per output row: 164 when ss_y is set, otherwise 82
dec hd
jg .y_loop_ar1
RET
@@ -611,8 +703,10 @@
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
pinsrw xm9, [base+pw_1], 5
- vpbroadcastw xm7, [base+hmul_bits+4]
+%if %2 ; xm7/xm6 are only needed by the ss_x bufy reduction in the loop
+ vpbroadcastw xm7, [base+hmul_bits+2+%3*2] ; hmul_bits word at byte offset 4 when ss_y is set (as before), at offset 2 otherwise
vpbroadcastd xm6, [base+pb_1]
+%endif
DEFINE_ARGS buf, bufy, fg_data, h, unused, x
pshufd xm12, xm9, q0000
pshufd xm13, xm9, q1111
@@ -621,11 +715,15 @@
pshufd xm10, xm8, q2222
pshufd xm9, xm8, q1111
pshufd xm8, xm8, q0000
- sub bufq, 82*38+44-(82*3+41)
+%if %2
+ sub bufq, 82*(73-35*%3)+44-(82*3+41) ; equals the old 82*38+44-(82*3+41) when ss_y=1; uses 73 rows when ss_y=0
+%else
+ sub bufq, 82*70-(82-3) ; no ss_x: rewind constant for the flat 73*82 fill (evaluates to 82*70-79)
+%endif
add bufyq, 79+82*3
- mov hd, 35
+ mov hd, 70-35*%3 ; row count: 35 when ss_y is set, otherwise 70
.y_loop_ar2:
- mov xq, -38
+ mov xq, -(76>>%2) ; negative column counter: -38 when ss_x is set, otherwise -76
.x_loop_ar2:
pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
@@ -654,12 +752,20 @@
paddd xm2, xm3
paddd xm2, xm4
+%if %2 ; ss_x: bufy is indexed at xq*2 and reduced with pmaddubsw/pmulhrsw
movq xm0, [bufyq+xq*2]
+%if %3 ; ss_y: also read the next bufy row (+82)
movq xm3, [bufyq+xq*2+82]
+%endif
pmaddubsw xm0, xm6, xm0
+%if %3
pmaddubsw xm3, xm6, xm3
paddw xm0, xm3
+%endif
pmulhrsw xm0, xm7
+%else
+ pmovsxbw xm0, [bufyq+xq] ; no ss_x: take bufy bytes 1:1, sign-extended to words
+%endif
punpcklwd xm0, xm15
pmaddwd xm0, xm14
paddd xm2, xm0
@@ -685,7 +791,7 @@
.x_loop_ar2_end:
add bufq, 82
- add bufyq, 82*2
+ add bufyq, 82<<%3 ; bufy advance per output row: 164 when ss_y is set, otherwise 82
dec hd
jg .y_loop_ar2
RET
@@ -730,14 +836,20 @@
mova [rsp+ 9*16], xm3
mova [rsp+10*16], xm4
mova [rsp+11*16], xm5
+%if %2 ; xm13/xm15 are only needed by the ss_x bufy reduction in the loop
vpbroadcastd xm13, [base+pb_1]
- vpbroadcastw xm15, [base+hmul_bits+4]
+ vpbroadcastw xm15, [base+hmul_bits+2+%3*2] ; hmul_bits word at byte offset 4 when ss_y is set (as before), at offset 2 otherwise
+%endif
DEFINE_ARGS buf, bufy, fg_data, h, unused, x
- sub bufq, 82*38+44-(82*3+41)
+%if %2
+ sub bufq, 82*(73-35*%3)+44-(82*3+41) ; equals the old 82*38+44-(82*3+41) when ss_y=1; uses 73 rows when ss_y=0
+%else
+ sub bufq, 82*70-(82-3) ; no ss_x: rewind constant for the flat 73*82 fill (evaluates to 82*70-79)
+%endif
add bufyq, 79+82*3
- mov hd, 35
+ mov hd, 70-35*%3 ; row count: 35 when ss_y is set, otherwise 70
.y_loop_ar3:
- mov xq, -38
+ mov xq, -(76>>%2) ; negative column counter: -38 when ss_x is set, otherwise -76
.x_loop_ar3:
movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
@@ -800,12 +912,20 @@
palignr xm9, xm5, xm2, 10
palignr xm5, xm5, xm2, 12
+%if %2 ; ss_x: bufy is indexed at xq*2 and reduced with pmaddubsw/pmulhrsw
movq xm1, [bufyq+xq*2]
+%if %3 ; ss_y: also read the next bufy row (+82)
movq xm2, [bufyq+xq*2+82]
+%endif
pmaddubsw xm1, xm13, xm1
+%if %3
pmaddubsw xm2, xm13, xm2
paddw xm1, xm2
+%endif
pmulhrsw xm1, xm15
+%else
+ pmovsxbw xm1, [bufyq+xq] ; no ss_x: take bufy bytes 1:1, sign-extended to words
+%endif
punpcklwd xm6, xm7
punpcklwd xm8, xm9
@@ -841,10 +961,15 @@
.x_loop_ar3_end:
add bufq, 82
- add bufyq, 82*2
+ add bufyq, 82<<%3 ; bufy advance per output row: 164 when ss_y is set, otherwise 82
dec hd
jg .y_loop_ar3
RET
+%endmacro ; generate_grain_uv_fn
+
+generate_grain_uv_fn 420, 1, 1 ; ss_x=1, ss_y=1
+generate_grain_uv_fn 422, 1, 0 ; ss_x=1, ss_y=0
+generate_grain_uv_fn 444, 0, 0 ; ss_x=0, ss_y=0
INIT_YMM avx2
cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
--- a/src/x86/film_grain_init_tmpl.c
+++ b/src/x86/film_grain_init_tmpl.c
@@ -35,6 +35,8 @@
decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_avx2); // presumably the symbol emitted by "generate_grain_uv_fn 422, 1, 0" in film_grain.asm - confirm name mangling
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_avx2); // presumably the symbol emitted by "generate_grain_uv_fn 444, 0, 0" in film_grain.asm - confirm name mangling
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
@@ -55,6 +57,8 @@
#if BITDEPTH == 8 && ARCH_X86_64
c->generate_grain_y = dav1d_generate_grain_y_avx2;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_avx2;
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_avx2; // slot index is the layout enum value minus 1
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_avx2; // only generate_grain_uv gains I422/I444 entries here; fguv_32x32xn below is still assigned for I420 only
c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
#endif