ref: 562f1958a61c6e2ab0e416889fbb0aaa3cae1942
parent: 940eee483a852ec54349ef36f19713bb2b895b57
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Sun Nov 3 02:52:55 EST 2019
Simplify shifting in generate_grain_y/uv AVX2
--- a/include/dav1d/headers.h
+++ b/include/dav1d/headers.h
@@ -319,7 +319,7 @@
int ar_coeff_lag;
int8_t ar_coeffs_y[24];
int8_t ar_coeffs_uv[2][25];
- int ar_coeff_shift;
+ uint64_t ar_coeff_shift;
int grain_scale_shift;
int uv_mult[2];
int uv_luma_mult[2];
--- a/meson.build
+++ b/meson.build
@@ -30,7 +30,7 @@
'b_ndebug=if-release'],
meson_version: '>= 0.47.0')
-dav1d_soname_version = '3.1.0'
+dav1d_soname_version = '4.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
--- a/src/film_grain_tmpl.c
+++ b/src/film_grain_tmpl.c
@@ -43,7 +43,7 @@
return (*state >> (16 - bits)) & ((1 << bits) - 1);
}
-static inline int round2(const int x, const int shift) {
+static inline int round2(const int x, const uint64_t shift) {
return (x + ((1 << shift) >> 1)) >> shift;
}
--- a/src/x86/film_grain.asm
+++ b/src/x86/film_grain.asm
@@ -44,6 +44,7 @@
max: dw 255, 240, 235
min: dw 0, 16
pb_27_17_17_27: db 27, 17, 17, 27
+pw_1: dw 1
%macro JMP_TABLE 1-*
%xdefine %1_table %%table
@@ -56,6 +57,7 @@
%endrep
%endmacro
+ALIGN 4
JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3
@@ -69,8 +71,8 @@
.scaling_shift: resd 1
.ar_coeff_lag: resd 1
.ar_coeffs_y: resb 24
- .ar_coeffs_uv: resb 2 * 26 ; includes padding
- .ar_coeff_shift: resd 1
+ .ar_coeffs_uv: resb 2 * 28 ; includes padding
+ .ar_coeff_shift: resq 1
.grain_scale_shift: resd 1
.uv_mult: resd 2
.uv_luma_mult: resd 2
@@ -190,12 +192,12 @@
.ar2:
DEFINE_ARGS buf, fg_data, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
- movd xm14, [base+hmul_bits-10+shiftq*2]
+ vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
movq xm15, [base+byte_blend+1]
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
pmovsxbw xm9, xm9
- DEFINE_ARGS buf, h, x
+ DEFINE_ARGS buf, fg_data, h, x
pshufd xm12, xm9, q0000
pshufd xm13, xm9, q1111
pshufd xm11, xm8, q3333
@@ -202,6 +204,7 @@
pshufd xm10, xm8, q2222
pshufd xm9, xm8, q1111
pshufd xm8, xm8, q0000
+ pmovzxwd xm14, xm14
sub bufq, 82*73-(82*3+79)
mov hd, 70
.y_loop_ar2:
@@ -233,6 +236,7 @@
paddd xm4, xm6
paddd xm2, xm7
paddd xm2, xm4
+ paddd xm2, xm14
movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
@@ -241,9 +245,8 @@
paddd xm3, xm2
psrldq xm1, 4 ; y=0,x=0
psrldq xm2, 4 ; shift top to next pixel
- psrad xm3, 5
- packssdw xm3, xm3
- pmulhrsw xm3, xm14
+ psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw since we only care about one value
paddw xm3, xm1
packsswb xm3, xm3
pextrb [bufq+xq], xm3, 0
@@ -274,7 +277,7 @@
ALLOC_STACK 16*12
%endif
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
- movd xm14, [base+hmul_bits-10+shiftq*2]
+ vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
movq xm15, [base+byte_blend]
pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-7
pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_y+ 8] ; cf8-15
@@ -288,10 +291,11 @@
pshufd xm8, xm1, q3333
pshufd xm1, xm1, q0000
pshufd xm3, xm2, q1111
+ psrldq xm13, xm2, 10
+ pinsrw xm2, [pw_1], 5
pshufd xm4, xm2, q2222
- psrldq xm5, xm2, 10
pshufd xm2, xm2, q0000
- pinsrw xm5, [base+round_vals+shiftq*2-10], 3
+ pinsrw xm13, [base+round_vals+shiftq*2-10], 3
mova [rsp+ 0*16], xm0
mova [rsp+ 1*16], xm9
mova [rsp+ 2*16], xm10
@@ -303,9 +307,7 @@
mova [rsp+ 8*16], xm2
mova [rsp+ 9*16], xm3
mova [rsp+10*16], xm4
- mova [rsp+11*16], xm5
- pxor xm13, xm13
- DEFINE_ARGS buf, h, x
+ DEFINE_ARGS buf, fg_data, h, x
sub bufq, 82*73-(82*3+79)
mov hd, 70
.y_loop_ar3:
@@ -374,7 +376,7 @@
punpcklwd xm6, xm7
punpcklwd xm8, xm9
- punpcklwd xm5, xm13
+ punpcklwd xm5, xm14
pmaddwd xm6, [rsp+ 8*16]
pmaddwd xm8, [rsp+ 9*16]
pmaddwd xm5, [rsp+10*16]
@@ -385,14 +387,13 @@
movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
pmovsxbw xm2, xm1
- pmaddwd xm2, [rsp+16*11]
+ pmaddwd xm2, xm13
pshufd xm3, xm2, q1111
paddd xm2, xm3 ; left+cur
paddd xm2, xm0 ; add top
psrldq xm0, 4
- psrad xm2, 5
- packssdw xm2, xm2
- pmulhrsw xm2, xm14
+ psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw since we only care about one value
packsswb xm2, xm2
pextrb [bufq+xq], xm2, 0
pslldq xm2, 3
@@ -606,17 +607,16 @@
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
imul uvd, 25
- movd xm15, [base+hmul_bits-10+shiftq*2]
+ vpbroadcastw xm15, [base+round_vals-12+shiftq*2]
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
+ pinsrw xm9, [base+pw_1], 5
vpbroadcastw xm7, [base+hmul_bits+4]
vpbroadcastd xm6, [base+pb_1]
- DEFINE_ARGS buf, bufy, h, x
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
pshufd xm12, xm9, q0000
pshufd xm13, xm9, q1111
pshufd xm14, xm9, q2222
- pxor xm10, xm10
- vpblendw xm14, xm10, 10101010b
pshufd xm11, xm8, q3333
pshufd xm10, xm8, q2222
pshufd xm9, xm8, q1111
@@ -660,7 +660,7 @@
pmaddubsw xm3, xm6, xm3
paddw xm0, xm3
pmulhrsw xm0, xm7
- punpcklwd xm0, xm0
+ punpcklwd xm0, xm15
pmaddwd xm0, xm14
paddd xm2, xm0
@@ -670,9 +670,7 @@
pmaddwd xm3, xm0, xm13
paddd xm3, xm2
psrldq xm2, 4 ; shift top to next pixel
- psrad xm3, 5
- packssdw xm3, xm3
- pmulhrsw xm3, xm15
+ psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
pslldq xm3, 2
psrldq xm0, 2
paddw xm3, xm0
@@ -699,7 +697,7 @@
%assign stack_size (stack_size+16*12)
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
imul uvd, 25
- movd xm14, [base+hmul_bits-10+shiftq*2]
+ vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7
pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15
pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23
@@ -719,6 +717,7 @@
psrldq xm5, xm2, 10
pshufd xm2, xm2, q0000
pinsrw xm5, [base+round_vals+shiftq*2-10], 3
+ pmovzxwd xm14, xm14
mova [rsp+ 0*16], xm0
mova [rsp+ 1*16], xm9
mova [rsp+ 2*16], xm10
@@ -733,7 +732,7 @@
mova [rsp+11*16], xm5
vpbroadcastd xm13, [base+pb_1]
vpbroadcastw xm15, [base+hmul_bits+4]
- DEFINE_ARGS buf, bufy, h, x
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
sub bufq, 82*38+44-(82*3+41)
add bufyq, 79+82*3
mov hd, 35
@@ -817,6 +816,7 @@
paddd xm0, xm6
paddd xm8, xm5
paddd xm0, xm8
+ paddd xm0, xm14
movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
@@ -826,9 +826,8 @@
paddd xm2, xm3 ; left+cur
paddd xm2, xm0 ; add top
psrldq xm0, 4
- psrad xm2, 5
- packssdw xm2, xm2
- pmulhrsw xm2, xm14
+ psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw, we only care about one value
pslldq xm2, 6
vpblendw xm1, xm2, 1000b
packsswb xm1, xm1