shithub: dav1d

ref: 4e22ef3a82e89c18927f3a810c7d585e5bd038a6
parent: ff41197bc89fe06311cb07d0acf7e3cac76c6946
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Mon Oct 7 05:21:33 EDT 2019

Add AVX2 version of generate_grain_uv (4:2:0)

gen_grain_uv_ar0_8bpc_420_c: 30131.8
gen_grain_uv_ar0_8bpc_420_avx2: 6600.4
gen_grain_uv_ar1_8bpc_420_c: 46110.5
gen_grain_uv_ar1_8bpc_420_avx2: 17887.2
gen_grain_uv_ar2_8bpc_420_c: 73593.2
gen_grain_uv_ar2_8bpc_420_avx2: 26918.6
gen_grain_uv_ar3_8bpc_420_c: 114499.3
gen_grain_uv_ar3_8bpc_420_avx2: 29804.6
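
For context, the 4:2:0 chroma grain derivation that the new .ar0 path in film_grain.asm below vectorizes boils down to the following. This is a minimal C sketch only; the helper names round2()/clip8(), the standalone signature, and the loop bounds are assumptions drawn from this patch, not dav1d's reference code verbatim.

#include <stdint.h>

#define GRAIN_WIDTH 82  /* row stride of the grain LUTs, cf. "add bufq, 82" below */

/* round to nearest with a power-of-two divisor (matches pmulhrsw rounding) */
static inline int round2(const int x, const int shift) {
    return (x + (1 << shift >> 1)) >> shift;
}

/* saturate to signed 8-bit, as packsswb does in the asm */
static inline int8_t clip8(const int v) {
    return v < -128 ? -128 : v > 127 ? 127 : v;
}

/* AR lag-0 pass for one chroma plane (4:2:0): average each co-located
 * 2x2 block of luma grain, scale it by the single chroma AR coefficient,
 * and add the result to the pre-filled 44x38 chroma grain LUT. */
static void gen_grain_uv_420_ar0_sketch(int8_t buf[][GRAIN_WIDTH],
                                        const int8_t buf_y[][GRAIN_WIDTH],
                                        const int8_t coeff,   /* ar_coeffs_uv[uv][0] */
                                        const int ar_coeff_shift)
{
    for (int y = 0; y < 38; y++) {
        for (int x = 0; x < 44; x++) {
            int luma = 0;
            for (int dy = 0; dy < 2; dy++)
                for (int dx = 0; dx < 2; dx++)
                    luma += buf_y[y * 2 + dy][x * 2 + dx];
            luma = round2(luma, 2);  /* rounded 2x2 average of luma grain */

            const int delta = round2(luma * coeff, ar_coeff_shift);
            buf[y][x] = clip8(buf[y][x] + delta);
        }
    }
}

The higher AR orders (.ar1/.ar2/.ar3) additionally fold in previously generated chroma neighbours, which is why their inner loops below stay partly scalar.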

--- a/src/obu.c
+++ b/src/obu.c
@@ -1098,6 +1098,8 @@
                     const int num_uv_pos = num_y_pos + !!fgd->num_y_points;
                     for (int i = 0; i < num_uv_pos; i++)
                         fgd->ar_coeffs_uv[pl][i] = dav1d_get_bits(gb, 8) - 128;
+                    if (!fgd->num_y_points)
+                        fgd->ar_coeffs_uv[pl][num_uv_pos] = 0;
                 }
             fgd->ar_coeff_shift = dav1d_get_bits(gb, 2) + 6;
             fgd->grain_scale_shift = dav1d_get_bits(gb, 2);
--- a/src/x86/film_grain.asm
+++ b/src/x86/film_grain.asm
@@ -32,6 +32,8 @@
 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
 byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
+pw_seed_xor: times 2 dw 0xb524
+             times 2 dw 0x49d8
 pd_m65536: dd ~0xffff
 pb_23_22: times 2 db 23, 22
 pb_1: times 4 db 1
@@ -55,6 +57,7 @@
 %endmacro
 
 JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3
 
 struc FGData
     .seed:                      resd 1
@@ -405,6 +408,443 @@
 
 .x_loop_ar3_end:
     add            bufq, 82
+    dec              hd
+    jg .y_loop_ar3
+    RET
+
+INIT_XMM avx2
+cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
+    lea              r4, [pb_mask]
+%define base r4-pb_mask
+    movq            xm1, [base+rnd_next_upperbit_mask]
+    movq            xm4, [base+mul_bits]
+    movq            xm7, [base+hmul_bits]
+    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
+    vpbroadcastw    xm8, [base+round+r5*2]
+    mova            xm5, [base+pb_mask]
+    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
+    vpbroadcastw    xm9, [base+pw_seed_xor+uvq*4]
+    pxor            xm0, xm9
+    vpbroadcastd    xm9, [base+pd_m65536]
+    lea              r6, [gaussian_sequence]
+    mov             r7d, 38
+    add            bufq, 44
+.loop_y:
+    mov              r5, -44
+.loop_x:
+    pand            xm2, xm0, xm1
+    psrlw           xm3, xm2, 10
+    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+    pmullw          xm2, xm4            ; bits 0x0f00 are set
+    pshufb          xm2, xm5, xm2       ; set 15th bit for next 4 seeds
+    psllq           xm6, xm2, 30
+    por             xm2, xm6
+    psllq           xm6, xm2, 15
+    por             xm2, xm6            ; aggregate each bit into next seed's high bit
+    pmulhuw         xm3, xm0, xm7
+    por             xm2, xm3            ; 4 next output seeds
+    pshuflw         xm0, xm2, q3333
+    psrlw           xm2, 5
+    pmovzxwd        xm3, xm2
+    mova            xm6, xm9
+    vpgatherdd      xm2, [r6+xm3*2], xm6
+    pandn           xm2, xm9, xm2
+    packusdw        xm2, xm2
+    pmulhrsw        xm2, xm8
+    packsswb        xm2, xm2
+    movd      [bufq+r5], xm2
+    add              r5, 4
+    jl .loop_x
+    add            bufq, 82
+    dec             r7d
+    jg .loop_y
+
+    ; auto-regression code
+    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
+    movsxd           r5, [base+generate_grain_uv_420_avx2_table+r5*4]
+    lea              r5, [r5+base+generate_grain_uv_420_avx2_table]
+    jmp              r5
+
+.ar0:
+    INIT_YMM avx2
+    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+    imul            uvd, 25
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+    movd            xm3, [base+hmul_bits+shiftq*2]
+    DEFINE_ARGS buf, bufy, h
+    pmovsxbw        xm4, xm4
+    vpbroadcastd     m7, [pb_1]
+    vpbroadcastw     m6, [hmul_bits+4]
+    vpbroadcastw     m4, xm4
+    vpbroadcastw     m3, xm3
+    sub            bufq, 82*38+82-(82*3+41)
+    add           bufyq, 3+82*3
+    mov              hd, 35
+.y_loop_ar0:
+    ; first 32 pixels
+    movu            xm8, [bufyq]
+    movu            xm9, [bufyq+82]
+    movu           xm10, [bufyq+16]
+    movu           xm11, [bufyq+82+16]
+    vinserti128      m8, [bufyq+32], 1
+    vinserti128      m9, [bufyq+82+32], 1
+    vinserti128     m10, [bufyq+48], 1
+    vinserti128     m11, [bufyq+82+48], 1
+    pmaddubsw        m8, m7, m8
+    pmaddubsw        m9, m7, m9
+    pmaddubsw       m10, m7, m10
+    pmaddubsw       m11, m7, m11
+    paddw            m8, m9
+    paddw           m10, m11
+    pmulhrsw         m8, m6
+    pmulhrsw        m10, m6
+    pmullw           m8, m4
+    pmullw          m10, m4
+    pmulhrsw         m8, m3
+    pmulhrsw        m10, m3
+    packsswb         m8, m10
+    movu             m0, [bufq]
+    punpckhbw        m1, m0, m8
+    punpcklbw        m0, m8
+    pmaddubsw        m1, m7, m1
+    pmaddubsw        m0, m7, m0
+    packsswb         m0, m1
+    movu         [bufq], m0
+
+    ; last 6 pixels
+    movu            xm8, [bufyq+32*2]
+    movu            xm9, [bufyq+32*2+82]
+    pmaddubsw       xm8, xm7, xm8
+    pmaddubsw       xm9, xm7, xm9
+    paddw           xm8, xm9
+    pmulhrsw        xm8, xm6
+    pmullw          xm8, xm4
+    pmulhrsw        xm8, xm3
+    packsswb        xm8, xm8
+    movq            xm0, [bufq+32]
+    punpcklbw       xm8, xm0
+    pmaddubsw       xm8, xm7, xm8
+    packsswb        xm8, xm8
+    vpblendw        xm0, xm8, xm0, 1000b
+    movq      [bufq+32], xm0
+
+    add            bufq, 82
+    add           bufyq, 82*2
+    dec              hd
+    jg .y_loop_ar0
+    RET
+
+.ar1:
+    INIT_XMM avx2
+    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
+    imul            uvd, 25
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
+    DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
+    pmovsxbw        xm4, xm4
+    pshufd          xm5, xm4, q1111
+    pshufd          xm4, xm4, q0000
+    pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
+    vpbroadcastd    xm7, [pb_1]
+    vpbroadcastw    xm6, [hmul_bits+4]
+    vpbroadcastd    xm3, xm3
+    sub            bufq, 82*38+44-(82*3+41)
+    add           bufyq, 79+82*3
+    mov              hd, 35
+    mov            mind, -128
+    mov            maxd, 127
+.y_loop_ar1:
+    mov              xq, -38
+    movsx         val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+    pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
+    movq            xm8, [bufyq+xq*2]
+    movq            xm9, [bufyq+xq*2+82]
+    psrldq          xm2, xm0, 2             ; top
+    psrldq          xm1, xm0, 4             ; top/right
+    pmaddubsw       xm8, xm7, xm8
+    pmaddubsw       xm9, xm7, xm9
+    paddw           xm8, xm9
+    pmulhrsw        xm8, xm6
+    punpcklwd       xm0, xm2
+    punpcklwd       xm1, xm8
+    pmaddwd         xm0, xm4
+    pmaddwd         xm1, xm5
+    paddd           xm0, xm1
+    paddd           xm0, xm3
+.x_loop_ar1_inner:
+    movd          val0d, xm0
+    psrldq          xm0, 4
+    imul          val3d, cf3d
+    add           val3d, val0d
+    sarx          val3d, val3d, shiftd
+    movsx         val0d, byte [bufq+xq]
+    add           val3d, val0d
+    cmp           val3d, maxd
+    cmovg         val3d, maxd
+    cmp           val3d, mind
+    cmovl         val3d, mind
+    mov  byte [bufq+xq], val3b
+    ; keep val3d in-place as left for next x iteration
+    inc              xq
+    jz .x_loop_ar1_end
+    test             xq, 3
+    jnz .x_loop_ar1_inner
+    jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+    add            bufq, 82
+    add           bufyq, 82*2
+    dec              hd
+    jg .y_loop_ar1
+    RET
+
+.ar2:
+    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    imul            uvd, 25
+    movd           xm15, [base+hmul_bits-10+shiftq*2]
+    pmovsxbw        xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-7
+    pmovsxbw        xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8]   ; cf8-12
+    DEFINE_ARGS buf, bufy, h, x
+    pshufd         xm12, xm9, q0000
+    pshufd         xm13, xm9, q1111
+    pshufd         xm14, xm9, q2222
+    pxor           xm10, xm10
+    vpblendw       xm14, xm10, 10101010b
+    pshufd         xm11, xm8, q3333
+    pshufd         xm10, xm8, q2222
+    pshufd          xm9, xm8, q1111
+    pshufd          xm8, xm8, q0000
+    sub            bufq, 82*38+44-(82*3+41)
+    add           bufyq, 79+82*3
+    mov              hd, 35
+.y_loop_ar2:
+    mov              xq, -38
+
+.x_loop_ar2:
+    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
+    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
+    psrldq          xm2, xm0, 2             ; y=-2,x=[-1,+5]
+    psrldq          xm3, xm1, 2             ; y=-1,x=[-1,+5]
+    psrldq          xm4, xm1, 4             ; y=-1,x=[+0,+5]
+    punpcklwd       xm2, xm0, xm2
+    punpcklwd       xm3, xm4
+    pmaddwd         xm2, xm8
+    pmaddwd         xm3, xm11
+    paddd           xm2, xm3
+
+    psrldq          xm4, xm0, 4             ; y=-2,x=[+0,+5]
+    psrldq          xm5, xm0, 6             ; y=-2,x=[+1,+5]
+    psrldq          xm6, xm0, 8             ; y=-2,x=[+2,+5]
+    punpcklwd       xm4, xm5
+    punpcklwd       xm6, xm1
+    psrldq          xm7, xm1, 6             ; y=-1,x=[+1,+5]
+    psrldq          xm1, xm1, 8             ; y=-1,x=[+2,+5]
+    punpcklwd       xm7, xm1
+    pmaddwd         xm4, xm9
+    pmaddwd         xm6, xm10
+    pmaddwd         xm7, xm12
+    paddd           xm4, xm6
+    paddd           xm2, xm7
+    paddd           xm2, xm4
+
+    vpbroadcastd    xm4, [base+pb_1]
+    movq            xm6, [bufyq+xq*2]
+    movq            xm7, [bufyq+xq*2+82]
+    pmaddubsw       xm6, xm4, xm6
+    pmaddubsw       xm7, xm4, xm7
+    vpbroadcastw    xm4, [base+hmul_bits+4]
+    paddw           xm6, xm7
+    pmulhrsw        xm6, xm4
+    pxor            xm7, xm7
+    punpcklwd       xm6, xm7
+    pmaddwd         xm6, xm14
+    paddd           xm2, xm6
+
+    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
+.x_loop_ar2_inner:
+    pmovsxbw        xm0, xm0
+    pmaddwd         xm3, xm0, xm13
+    paddd           xm3, xm2
+    psrldq          xm2, 4                  ; shift top to next pixel
+    psrad           xm3, 5
+    packssdw        xm3, xm3
+    pmulhrsw        xm3, xm15
+    pslldq          xm3, 2
+    psrldq          xm0, 2
+    paddw           xm3, xm0
+    vpblendw        xm0, xm3, 00000010b
+    packsswb        xm0, xm0
+    pextrb    [bufq+xq], xm0, 1
+    inc              xq
+    jz .x_loop_ar2_end
+    test             xq, 3
+    jnz .x_loop_ar2_inner
+    jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+    add            bufq, 82
+    add           bufyq, 82*2
+    dec              hd
+    jg .y_loop_ar2
+    RET
+
+.ar3:
+    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+    SUB             rsp, 16*12
+%assign stack_size_padded (stack_size_padded+16*12)
+%assign stack_size (stack_size+16*12)
+    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    imul            uvd, 25
+    movd           xm14, [base+hmul_bits-10+shiftq*2]
+    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-7
+    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8]   ; cf8-15
+    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]   ; cf16-23
+    pmovsxbw        xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24]   ; cf24 [luma]
+    pshufd          xm9, xm0, q1111
+    pshufd         xm10, xm0, q2222
+    pshufd         xm11, xm0, q3333
+    pshufd          xm0, xm0, q0000
+    pshufd          xm6, xm1, q1111
+    pshufd          xm7, xm1, q2222
+    pshufd          xm8, xm1, q3333
+    pshufd          xm1, xm1, q0000
+    pshufd          xm3, xm2, q1111
+    pshufd          xm4, xm2, q2222
+    vpbroadcastw    xm5, xm5
+    vpblendw        xm4, xm5, 10101010b                     ; interleave luma cf
+    psrldq          xm5, xm2, 10
+    pshufd          xm2, xm2, q0000
+    pinsrw          xm5, [base+round_vals+shiftq*2-10], 3
+    mova    [rsp+ 0*16], xm0
+    mova    [rsp+ 1*16], xm9
+    mova    [rsp+ 2*16], xm10
+    mova    [rsp+ 3*16], xm11
+    mova    [rsp+ 4*16], xm1
+    mova    [rsp+ 5*16], xm6
+    mova    [rsp+ 6*16], xm7
+    mova    [rsp+ 7*16], xm8
+    mova    [rsp+ 8*16], xm2
+    mova    [rsp+ 9*16], xm3
+    mova    [rsp+10*16], xm4
+    mova    [rsp+11*16], xm5
+    vpbroadcastd   xm13, [base+pb_1]
+    vpbroadcastw   xm15, [base+hmul_bits+4]
+    DEFINE_ARGS buf, bufy, h, x
+    sub            bufq, 82*38+44-(82*3+41)
+    add           bufyq, 79+82*3
+    mov              hd, 35
+.y_loop_ar3:
+    mov              xq, -38
+
+.x_loop_ar3:
+    movu            xm0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
+    movu            xm1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
+    movu            xm2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
+    pxor            xm3, xm3
+    pcmpgtb         xm6, xm3, xm2
+    pcmpgtb         xm5, xm3, xm1
+    pcmpgtb         xm4, xm3, xm0
+    punpckhbw       xm3, xm0, xm4
+    punpcklbw       xm0, xm4
+    punpckhbw       xm4, xm1, xm5
+    punpcklbw       xm1, xm5
+    punpckhbw       xm5, xm2, xm6
+    punpcklbw       xm2, xm6
+
+    psrldq          xm6, xm0, 2
+    psrldq          xm7, xm0, 4
+    psrldq          xm8, xm0, 6
+    psrldq          xm9, xm0, 8
+    palignr        xm10, xm3, xm0, 10
+    palignr        xm11, xm3, xm0, 12
+
+    punpcklwd       xm0, xm6
+    punpcklwd       xm7, xm8
+    punpcklwd       xm9, xm10
+    punpcklwd      xm11, xm1
+    pmaddwd         xm0, [rsp+ 0*16]
+    pmaddwd         xm7, [rsp+ 1*16]
+    pmaddwd         xm9, [rsp+ 2*16]
+    pmaddwd        xm11, [rsp+ 3*16]
+    paddd           xm0, xm7
+    paddd           xm9, xm11
+    paddd           xm0, xm9
+
+    psrldq          xm6, xm1, 2
+    psrldq          xm7, xm1, 4
+    psrldq          xm8, xm1, 6
+    psrldq          xm9, xm1, 8
+    palignr        xm10, xm4, xm1, 10
+    palignr        xm11, xm4, xm1, 12
+    psrldq         xm12, xm2, 2
+
+    punpcklwd       xm6, xm7
+    punpcklwd       xm8, xm9
+    punpcklwd      xm10, xm11
+    punpcklwd      xm12, xm2, xm12
+    pmaddwd         xm6, [rsp+ 4*16]
+    pmaddwd         xm8, [rsp+ 5*16]
+    pmaddwd        xm10, [rsp+ 6*16]
+    pmaddwd        xm12, [rsp+ 7*16]
+    paddd           xm6, xm8
+    paddd          xm10, xm12
+    paddd           xm6, xm10
+    paddd           xm0, xm6
+
+    psrldq          xm6, xm2, 4
+    psrldq          xm7, xm2, 6
+    psrldq          xm8, xm2, 8
+    palignr         xm9, xm5, xm2, 10
+    palignr         xm5, xm5, xm2, 12
+
+    movq            xm1, [bufyq+xq*2]
+    movq            xm2, [bufyq+xq*2+82]
+    pmaddubsw       xm1, xm13, xm1
+    pmaddubsw       xm2, xm13, xm2
+    paddw           xm1, xm2
+    vpbroadcastw    xm3, xm15
+    pmulhrsw        xm1, xm3
+
+    punpcklwd       xm6, xm7
+    punpcklwd       xm8, xm9
+    punpcklwd       xm5, xm1
+    pmaddwd         xm6, [rsp+ 8*16]
+    pmaddwd         xm8, [rsp+ 9*16]
+    pmaddwd         xm5, [rsp+10*16]
+    paddd           xm0, xm6
+    paddd           xm8, xm5
+    paddd           xm0, xm8
+
+    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+    pmovsxbw        xm1, xm1
+    pmaddwd         xm2, xm1, [rsp+16*11]
+    pshufd          xm3, xm2, q1111
+    paddd           xm2, xm3                ; left+cur
+    paddd           xm2, xm0                ; add top
+    psrldq          xm0, 4
+    psrad           xm2, 5
+    packssdw        xm2, xm2
+    pmulhrsw        xm2, xm14
+    pslldq          xm2, 6
+    vpblendw        xm1, xm2, 1000b
+    packsswb        xm1, xm1
+    pextrb    [bufq+xq], xm1, 3
+    psrldq          xm1, 1
+    inc              xq
+    jz .x_loop_ar3_end
+    test             xq, 3
+    jnz .x_loop_ar3_inner
+    jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+    add            bufq, 82
+    add           bufyq, 82*2
     dec              hd
     jg .y_loop_ar3
     RET
--- a/src/x86/film_grain_init_tmpl.c
+++ b/src/x86/film_grain_init_tmpl.c
@@ -29,6 +29,7 @@
 #include "src/film_grain.h"
 
 decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
 decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
 decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
 
@@ -39,6 +40,7 @@
 
 #if BITDEPTH == 8 && ARCH_X86_64
     c->generate_grain_y = dav1d_generate_grain_y_avx2;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_avx2;
     c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
     c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
 #endif
--- a/tests/checkasm/filmgrain.c
+++ b/tests/checkasm/filmgrain.c
@@ -34,6 +34,12 @@
 #define UNIT_TEST 1
 #include "src/fg_apply_tmpl.c"
 
+static const char ss_name[][4] = {
+    [DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
+    [DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
+    [DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
+};
+
 static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {
     entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
     entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
@@ -72,6 +78,64 @@
     report("gen_grain_y");
 }
 
+static void check_gen_grnuv(const Dav1dFilmGrainDSPContext *const dsp) {
+    entry grain_lut_y[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
+    entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
+    entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
+
+    declare_func(void, entry grain_lut[][GRAIN_WIDTH],
+                 const entry grain_lut_y[][GRAIN_WIDTH],
+                 const Dav1dFilmGrainData *data, int uv HIGHBD_DECL_SUFFIX);
+
+    for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
+        const enum Dav1dPixelLayout layout = layout_idx + 1;
+        const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
+
+        for (int i = 0; i < 4; i++) {
+            if (check_func(dsp->generate_grain_uv[layout_idx],
+                           "gen_grain_uv_ar%d_%dbpc_%s",
+                           i, BITDEPTH, ss_name[layout_idx]))
+            {
+                Dav1dFilmGrainData fg_data;
+                fg_data.seed = rnd() & 0xFFFF;
+
+#if BITDEPTH == 16
+                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#endif
+
+                fg_data.num_y_points = rnd() & 1;
+                fg_data.grain_scale_shift = rnd() & 3;
+                fg_data.ar_coeff_shift = (rnd() & 3) + 6;
+                fg_data.ar_coeff_lag = i;
+                const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
+                for (int n = 0; n < num_y_pos; n++)
+                    fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
+                dsp->generate_grain_y(grain_lut_y, &fg_data HIGHBD_TAIL_SUFFIX);
+
+                const int uv = rnd() & 1;
+                const int num_uv_pos = num_y_pos + !!fg_data.num_y_points;
+                for (int n = 0; n < num_uv_pos; n++)
+                    fg_data.ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128;
+                if (!fg_data.num_y_points)
+                    fg_data.ar_coeffs_uv[uv][num_uv_pos] = 0;
+                memset(grain_lut_c, 0xff, sizeof(grain_lut_c));
+                memset(grain_lut_a, 0xff, sizeof(grain_lut_a));
+                call_ref(grain_lut_c, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
+                call_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
+                int diff = 0, w = ss_x ? 44 : GRAIN_WIDTH;
+                for (int y = 0; y < (ss_y ? 38 : GRAIN_HEIGHT); y++)
+                    diff |= memcmp(grain_lut_a[y], grain_lut_c[y], w * sizeof(entry));
+                if (diff) fail();
+
+                bench_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
+            }
+        }
+    }
+
+    report("gen_grain_uv");
+}
+
 static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
     ALIGN_STK_32(pixel, c_dst, 128 * 32,);
     ALIGN_STK_32(pixel, a_dst, 128 * 32,);
@@ -157,11 +221,6 @@
                  int is_identity HIGHBD_DECL_SUFFIX);
 
     for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
-        const char ss_name[][4] = {
-            [DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
-            [DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
-            [DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
-        };
         const enum Dav1dPixelLayout layout = layout_idx + 1;
         const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
         const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
@@ -264,6 +323,7 @@
     bitfn(dav1d_film_grain_dsp_init)(&c);
 
     check_gen_grny(&c);
+    check_gen_grnuv(&c);
     check_fgy_sbrow(&c);
     check_fguv_sbrow(&c);
 }