shithub: dav1d

Download patch

ref: dab82163cceb61a2b734189488f5da9e90e98f6f
parent: bf8d64004d1f830a8e739a9d0a69781d7a393665
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Tue Mar 24 12:12:40 EDT 2020

x86: add AVX2 SIMD for generate_grain_uv_{422,444}

gen_grain_uv_ar0_8bpc_420_c: 72275.4
gen_grain_uv_ar0_8bpc_420_avx2: 7253.4
gen_grain_uv_ar0_8bpc_422_c: 111742.9
gen_grain_uv_ar0_8bpc_422_avx2: 13704.1
gen_grain_uv_ar0_8bpc_444_c: 205688.5
gen_grain_uv_ar0_8bpc_444_avx2: 25007.5
gen_grain_uv_ar1_8bpc_420_c: 100682.5
gen_grain_uv_ar1_8bpc_420_avx2: 18434.4
gen_grain_uv_ar1_8bpc_422_c: 167931.4
gen_grain_uv_ar1_8bpc_422_avx2: 37817.9
gen_grain_uv_ar1_8bpc_444_c: 323812.2
gen_grain_uv_ar1_8bpc_444_avx2: 74049.6
gen_grain_uv_ar2_8bpc_420_c: 159545.7
gen_grain_uv_ar2_8bpc_420_avx2: 23994.0
gen_grain_uv_ar2_8bpc_422_c: 295959.9
gen_grain_uv_ar2_8bpc_422_avx2: 48103.5
gen_grain_uv_ar2_8bpc_444_c: 571862.2
gen_grain_uv_ar2_8bpc_444_avx2: 93044.6
gen_grain_uv_ar3_8bpc_420_c: 243445.9
gen_grain_uv_ar3_8bpc_420_avx2: 27698.3
gen_grain_uv_ar3_8bpc_422_c: 458189.9
gen_grain_uv_ar3_8bpc_422_avx2: 54183.1
gen_grain_uv_ar3_8bpc_444_c: 883627.3
gen_grain_uv_ar3_8bpc_444_avx2: 103296.7

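Also contains a small fix to the ar0 path of generate_grain_uv: the scaled
luma contribution is no longer packed to 8 bits before the current grain
value is added. The addition is now done on sign-extended 16-bit words and
the result is packed with signed saturation once, afterwards. This fixes
overflows seen with e.g. seed=1115072968.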

--- a/src/x86/film_grain.asm
+++ b/src/x86/film_grain.asm
@@ -60,6 +60,8 @@
 ALIGN 4
 JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
 JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444_avx2, 0, 1, 2, 3
 
 struc FGData
     .seed:                      resd 1
@@ -413,8 +415,9 @@
     jg .y_loop_ar3
     RET
 
+%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
 INIT_XMM avx2
-cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
+cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv
     lea              r4, [pb_mask]
 %define base r4-pb_mask
     movq            xm1, [base+rnd_next_upperbit_mask]
@@ -428,11 +431,17 @@
     pxor            xm0, xm9
     vpbroadcastd    xm9, [base+pd_m65536]
     lea              r6, [gaussian_sequence]
-    mov             r7d, 38
+%if %2
+    mov             r7d, 73-35*%3
     add            bufq, 44
 .loop_y:
     mov              r5, -44
 .loop_x:
+%else
+    mov              r5, -73*82
+    sub            bufq, r5
+.loop:
+%endif
     pand            xm2, xm0, xm1
     psrlw           xm3, xm2, 10
     por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
@@ -455,15 +464,19 @@
     packsswb        xm2, xm2
     movd      [bufq+r5], xm2
     add              r5, 4
+%if %2
     jl .loop_x
     add            bufq, 82
     dec             r7d
     jg .loop_y
+%else
+    jl .loop
+%endif
 
     ; auto-regression code
     movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
-    movsxd           r5, [base+generate_grain_uv_420_avx2_table+r5*4]
-    lea              r5, [r5+base+generate_grain_uv_420_avx2_table]
+    movsxd           r5, [base+generate_grain_uv_%1_avx2_table+r5*4]
+    lea              r5, [r5+base+generate_grain_uv_%1_avx2_table]
     jmp              r5
 
 .ar0:
@@ -475,63 +488,126 @@
     movd            xm3, [base+hmul_bits+shiftq*2]
     DEFINE_ARGS buf, bufy, h
     pmovsxbw        xm4, xm4
+%if %2
     vpbroadcastd     m7, [pb_1]
-    vpbroadcastw     m6, [hmul_bits+4]
+    vpbroadcastw     m6, [hmul_bits+2+%3*2]
+%endif
     vpbroadcastw     m4, xm4
     vpbroadcastw     m3, xm3
-    sub            bufq, 82*38+82-(82*3+41)
+    pxor            m12, m12
+%if %2
+    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
+%else
+    sub            bufq, 82*70-3
+%endif
     add           bufyq, 3+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
 .y_loop_ar0:
+%if %2
     ; first 32 pixels
     movu            xm8, [bufyq]
+%if %3
     movu            xm9, [bufyq+82]
+%endif
     movu           xm10, [bufyq+16]
+%if %3
     movu           xm11, [bufyq+82+16]
+%endif
     vinserti128      m8, [bufyq+32], 1
+%if %3
     vinserti128      m9, [bufyq+82+32], 1
+%endif
     vinserti128     m10, [bufyq+48], 1
+%if %3
     vinserti128     m11, [bufyq+82+48], 1
+%endif
     pmaddubsw        m8, m7, m8
+%if %3
     pmaddubsw        m9, m7, m9
+%endif
     pmaddubsw       m10, m7, m10
+%if %3
     pmaddubsw       m11, m7, m11
     paddw            m8, m9
     paddw           m10, m11
+%endif
     pmulhrsw         m8, m6
     pmulhrsw        m10, m6
+%else
+    xor             r3d, r3d
+    ; first 32x2 pixels
+.x_loop_ar0:
+    movu             m8, [bufyq+r3]
+    pcmpgtb          m9, m12, m8
+    punpckhbw       m10, m8, m9
+    punpcklbw        m8, m9
+%endif
     pmullw           m8, m4
     pmullw          m10, m4
     pmulhrsw         m8, m3
     pmulhrsw        m10, m3
-    packsswb         m8, m10
+%if %2
     movu             m0, [bufq]
-    punpckhbw        m1, m0, m8
-    punpcklbw        m0, m8
-    pmaddubsw        m1, m7, m1
-    pmaddubsw        m0, m7, m0
-    packsswb         m0, m1
+%else
+    movu             m0, [bufq+r3]
+%endif
+    pcmpgtb          m1, m12, m0
+    punpckhbw        m9, m0, m1
+    punpcklbw        m0, m1
+    paddw            m0, m8
+    paddw            m9, m10
+    packsswb         m0, m9
+%if %2
     movu         [bufq], m0
+%else
+    movu      [bufq+r3], m0
+    add             r3d, 32
+    cmp             r3d, 64
+    jl .x_loop_ar0
+%endif
 
-    ; last 6 pixels
+    ; last 6/12 pixels
     movu            xm8, [bufyq+32*2]
+%if %2
+%if %3
     movu            xm9, [bufyq+32*2+82]
+%endif
     pmaddubsw       xm8, xm7, xm8
+%if %3
     pmaddubsw       xm9, xm7, xm9
     paddw           xm8, xm9
+%endif
     pmulhrsw        xm8, xm6
     pmullw          xm8, xm4
     pmulhrsw        xm8, xm3
-    packsswb        xm8, xm8
     movq            xm0, [bufq+32]
-    punpcklbw       xm8, xm0
-    pmaddubsw       xm8, xm7, xm8
+    pcmpgtb         xm9, xm12, xm0
+    punpcklbw       xm9, xm0, xm9
+    paddw           xm8, xm9
     packsswb        xm8, xm8
     vpblendw        xm0, xm8, xm0, 1000b
     movq      [bufq+32], xm0
+%else
+    pcmpgtb         xm9, xm12, xm8
+    punpckhbw      xm10, xm8, xm9
+    punpcklbw       xm8, xm9
+    pmullw         xm10, xm4
+    pmullw          xm8, xm4
+    pmulhrsw       xm10, xm3
+    pmulhrsw        xm8, xm3
+    movu            xm0, [bufq+64]
+    pcmpgtb         xm9, xm12, xm0
+    punpcklbw       xm1, xm0, xm9
+    punpckhbw       xm9, xm0, xm9
+    paddw           xm1, xm8
+    paddw           xm9, xm10
+    packsswb        xm1, xm9
+    vpblendw        xm0, xm1, xm0, 11000000b
+    movu      [bufq+64], xm0
+%endif
 
     add            bufq, 82
-    add           bufyq, 82*2
+    add           bufyq, 82<<%3
     dec              hd
     jg .y_loop_ar0
     RET
@@ -549,27 +625,43 @@
     pshufd          xm5, xm4, q1111
     pshufd          xm4, xm4, q0000
     pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
+%if %2
     vpbroadcastd    xm7, [pb_1]
-    vpbroadcastw    xm6, [hmul_bits+4]
+    vpbroadcastw    xm6, [hmul_bits+2+%3*2]
+%endif
     vpbroadcastd    xm3, xm3
-    sub            bufq, 82*38+44-(82*3+41)
+%if %2
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*70-(82-3)
+%endif
     add           bufyq, 79+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
     mov            mind, -128
     mov            maxd, 127
 .y_loop_ar1:
-    mov              xq, -38
+    mov              xq, -(76>>%2)
     movsx         val3d, byte [bufq+xq-1]
 .x_loop_ar1:
     pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
+%if %2
     movq            xm8, [bufyq+xq*2]
+%if %3
     movq            xm9, [bufyq+xq*2+82]
+%endif
+%endif
     psrldq          xm2, xm0, 2             ; top
     psrldq          xm1, xm0, 4             ; top/right
+%if %2
     pmaddubsw       xm8, xm7, xm8
+%if %3
     pmaddubsw       xm9, xm7, xm9
     paddw           xm8, xm9
+%endif
     pmulhrsw        xm8, xm6
+%else
+    pmovsxbw        xm8, [bufyq+xq]
+%endif
     punpcklwd       xm0, xm2
     punpcklwd       xm1, xm8
     pmaddwd         xm0, xm4
@@ -598,7 +690,7 @@
 
 .x_loop_ar1_end:
     add            bufq, 82
-    add           bufyq, 82*2
+    add           bufyq, 82<<%3
     dec              hd
     jg .y_loop_ar1
     RET
@@ -611,8 +703,10 @@
     pmovsxbw        xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-7
     pmovsxbw        xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8]   ; cf8-12
     pinsrw          xm9, [base+pw_1], 5
-    vpbroadcastw    xm7, [base+hmul_bits+4]
+%if %2
+    vpbroadcastw    xm7, [base+hmul_bits+2+%3*2]
     vpbroadcastd    xm6, [base+pb_1]
+%endif
     DEFINE_ARGS buf, bufy, fg_data, h, unused, x
     pshufd         xm12, xm9, q0000
     pshufd         xm13, xm9, q1111
@@ -621,11 +715,15 @@
     pshufd         xm10, xm8, q2222
     pshufd          xm9, xm8, q1111
     pshufd          xm8, xm8, q0000
-    sub            bufq, 82*38+44-(82*3+41)
+%if %2
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*70-(82-3)
+%endif
     add           bufyq, 79+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
 .y_loop_ar2:
-    mov              xq, -38
+    mov              xq, -(76>>%2)
 
 .x_loop_ar2:
     pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
@@ -654,12 +752,20 @@
     paddd           xm2, xm3
     paddd           xm2, xm4
 
+%if %2
     movq            xm0, [bufyq+xq*2]
+%if %3
     movq            xm3, [bufyq+xq*2+82]
+%endif
     pmaddubsw       xm0, xm6, xm0
+%if %3
     pmaddubsw       xm3, xm6, xm3
     paddw           xm0, xm3
+%endif
     pmulhrsw        xm0, xm7
+%else
+    pmovsxbw        xm0, [bufyq+xq]
+%endif
     punpcklwd       xm0, xm15
     pmaddwd         xm0, xm14
     paddd           xm2, xm0
@@ -685,7 +791,7 @@
 
 .x_loop_ar2_end:
     add            bufq, 82
-    add           bufyq, 82*2
+    add           bufyq, 82<<%3
     dec              hd
     jg .y_loop_ar2
     RET
@@ -730,14 +836,20 @@
     mova    [rsp+ 9*16], xm3
     mova    [rsp+10*16], xm4
     mova    [rsp+11*16], xm5
+%if %2
     vpbroadcastd   xm13, [base+pb_1]
-    vpbroadcastw   xm15, [base+hmul_bits+4]
+    vpbroadcastw   xm15, [base+hmul_bits+2+%3*2]
+%endif
     DEFINE_ARGS buf, bufy, fg_data, h, unused, x
-    sub            bufq, 82*38+44-(82*3+41)
+%if %2
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*70-(82-3)
+%endif
     add           bufyq, 79+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
 .y_loop_ar3:
-    mov              xq, -38
+    mov              xq, -(76>>%2)
 
 .x_loop_ar3:
     movu            xm0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
@@ -800,12 +912,20 @@
     palignr         xm9, xm5, xm2, 10
     palignr         xm5, xm5, xm2, 12
 
+%if %2
     movq            xm1, [bufyq+xq*2]
+%if %3
     movq            xm2, [bufyq+xq*2+82]
+%endif
     pmaddubsw       xm1, xm13, xm1
+%if %3
     pmaddubsw       xm2, xm13, xm2
     paddw           xm1, xm2
+%endif
     pmulhrsw        xm1, xm15
+%else
+    pmovsxbw        xm1, [bufyq+xq]
+%endif
 
     punpcklwd       xm6, xm7
     punpcklwd       xm8, xm9
@@ -841,10 +961,15 @@
 
 .x_loop_ar3_end:
     add            bufq, 82
-    add           bufyq, 82*2
+    add           bufyq, 82<<%3
     dec              hd
     jg .y_loop_ar3
     RET
+%endmacro
+
+generate_grain_uv_fn 420, 1, 1
+generate_grain_uv_fn 422, 1, 0
+generate_grain_uv_fn 444, 0, 0
 
 INIT_YMM avx2
 cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
--- a/src/x86/film_grain_init_tmpl.c
+++ b/src/x86/film_grain_init_tmpl.c
@@ -35,6 +35,8 @@
 
 decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
 decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_avx2);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_avx2);
 decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
 decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
 
@@ -55,6 +57,8 @@
 #if BITDEPTH == 8 && ARCH_X86_64
     c->generate_grain_y = dav1d_generate_grain_y_avx2;
     c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_avx2;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_avx2;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_avx2;
     c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
     c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
 #endif