shithub: dav1d

Download patch

ref: acde4240fe0fd55077cfefabba1348b632f33fbd
parent: dd797aa20b080487cd3a2975e7d028b03039502f
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Sat Oct 27 14:38:21 EDT 2018

Add AVX2 SIMD implementation for cdef_dir

cdef_dir_8bpc_c: 629.3
cdef_dir_8bpc_avx2: 82.4

First 1000 frames of Chimera 1080p:
before: 0m23.084s
after: 0m21.860s

--- a/src/cdef.h
+++ b/src/cdef.h
@@ -48,9 +48,11 @@
                         /*const*/ pixel *const top[2],
                         int pri_strength, int sec_strength,
                         int dir, int damping, enum CdefEdgeFlags edges);
-typedef int (*cdef_dir_fn)(const pixel *dst, ptrdiff_t stride,
-                           unsigned *var);
 
+#define decl_cdef_dir_fn(name) \
+int (name)(const pixel *dst, ptrdiff_t dst_stride, unsigned *var)
+typedef decl_cdef_dir_fn(*cdef_dir_fn);
+
 typedef struct Dav1dCdefDSPContext {
     cdef_dir_fn dir;
     cdef_fn fb[3 /* 444/luma, 422, 420 */];
@@ -58,5 +60,8 @@
 
 void dav1d_cdef_dsp_init_8bpc(Dav1dCdefDSPContext *c);
 void dav1d_cdef_dsp_init_10bpc(Dav1dCdefDSPContext *c);
+
+void dav1d_cdef_dsp_init_x86_8bpc(Dav1dCdefDSPContext *c);
+void dav1d_cdef_dsp_init_x86_10bpc(Dav1dCdefDSPContext *c);
 
 #endif /* __DAV1D_SRC_CDEF_H__ */
--- a/src/cdef_tmpl.c
+++ b/src/cdef_tmpl.c
@@ -273,4 +273,8 @@
     c->fb[0] = cdef_filter_block_8x8_c;
     c->fb[1] = cdef_filter_block_4x8_c;
     c->fb[2] = cdef_filter_block_4x4_c;
+
+#if HAVE_ASM && ARCH_X86 && BITDEPTH == 8
+    bitfn(dav1d_cdef_dsp_init_x86)(c);
+#endif
 }
--- a/src/meson.build
+++ b/src/meson.build
@@ -101,6 +101,7 @@
         )
 
         libdav1d_tmpl_sources += files(
+            'x86/cdef_init_tmpl.c',
             'x86/ipred_init_tmpl.c',
             'x86/itx_init_tmpl.c',
             'x86/loopfilter_init_tmpl.c',
@@ -110,6 +111,7 @@
 
         # NASM source files
         libdav1d_sources_asm = files(
+            'x86/cdef.asm',
             'x86/cpuid.asm',
             'x86/ipred.asm',
             'x86/itx.asm',
--- /dev/null
+++ b/src/x86/cdef.asm
@@ -1,0 +1,264 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+pd_04512763: dd 0, 4, 5, 1, 2, 7, 6, 3
+div_table: dd 840, 420, 280, 210, 168, 140, 120, 105
+           dd 420, 210, 140, 105
+pd_04261537: dd 0, 4, 2, 6, 1, 5, 3, 7
+shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
+shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+pw_128: times 2 dw 128
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
+    lea       stride3q, [strideq*3]
+    movq           xm0, [srcq+strideq*0]
+    movq           xm1, [srcq+strideq*1]
+    movq           xm2, [srcq+strideq*2]
+    movq           xm3, [srcq+stride3q]
+    lea           srcq, [srcq+strideq*4]
+    vpbroadcastq    m4, [srcq+strideq*0]
+    vpbroadcastq    m5, [srcq+strideq*1]
+    vpbroadcastq    m6, [srcq+strideq*2]
+    vpbroadcastq    m7, [srcq+stride3q]
+    vpbroadcastd    m8, [pw_128]
+    pxor            m9, m9
+
+    vpblendd        m0, m0, m7, 0xf0
+    vpblendd        m1, m1, m6, 0xf0
+    vpblendd        m2, m2, m5, 0xf0
+    vpblendd        m3, m3, m4, 0xf0
+
+    punpcklbw       m0, m9
+    punpcklbw       m1, m9
+    punpcklbw       m2, m9
+    punpcklbw       m3, m9
+
+    psubw           m0, m8
+    psubw           m1, m8
+    psubw           m2, m8
+    psubw           m3, m8
+
+    ; shuffle registers to generate partial_sum_diag[0-1] together
+    vpermq          m7, m0, q1032
+    vpermq          m6, m1, q1032
+    vpermq          m5, m2, q1032
+    vpermq          m4, m3, q1032
+
+    ; start with partial_sum_hv[0-1]
+    paddw           m8, m0, m1
+    paddw           m9, m2, m3
+    phaddw         m10, m0, m1
+    phaddw         m11, m2, m3
+    paddw           m8, m9
+    phaddw         m10, m11
+    vextracti128   xm9, m8, 1
+    vextracti128  xm11, m10, 1
+    paddw          xm8, xm9                 ; partial_sum_hv[1]
+    phaddw        xm10, xm11                ; partial_sum_hv[0]
+    vinserti128     m8, xm10, 1
+    vpbroadcastd    m9, [div_table+44]
+    pmaddwd         m8, m8
+    pmulld          m8, m9                  ; cost6[2a-d] | cost2[a-d]
+
+    ; create aggregates [lower half]:
+    ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
+    ;      m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
+    ; m10=             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
+    ;      m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
+    ; and [upper half]:
+    ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
+    ;      m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
+    ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
+    ;      m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
+    ; and then shuffle m10 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd
+
+    pslldq          m9, m1, 2
+    psrldq         m10, m1, 14
+    pslldq         m11, m2, 4
+    psrldq         m12, m2, 12
+    pslldq         m13, m3, 6
+    psrldq         m14, m3, 10
+    paddw           m9, m11
+    paddw          m10, m12
+    paddw           m9, m13
+    paddw          m10, m14
+    pslldq         m11, m4, 8
+    psrldq         m12, m4, 8
+    pslldq         m13, m5, 10
+    psrldq         m14, m5, 6
+    paddw           m9, m11
+    paddw          m10, m12
+    paddw           m9, m13
+    paddw          m10, m14
+    pslldq         m11, m6, 12
+    psrldq         m12, m6, 4
+    pslldq         m13, m7, 14
+    psrldq         m14, m7, 2
+    paddw           m9, m11
+    paddw          m10, m12
+    paddw           m9, m13
+    paddw          m10, m14                 ; partial_sum_diag[0/1][8-14,zero]
+    vbroadcasti128 m14, [shufw_6543210x]
+    vbroadcasti128 m13, [div_table+16]
+    vbroadcasti128 m12, [div_table+0]
+    paddw           m9, m0                  ; partial_sum_diag[0/1][0-7]
+    pshufb         m10, m14
+    punpckhwd      m11, m9, m10
+    punpcklwd       m9, m10
+    pmaddwd        m11, m11
+    pmaddwd         m9, m9
+    pmulld         m11, m13
+    pmulld          m9, m12
+    paddd           m9, m11                 ; cost0[a-d] | cost4[a-d]
+
+    ; merge horizontally and vertically for partial_sum_alt[0-3]
+    paddw          m10, m0, m1
+    paddw          m11, m2, m3
+    paddw          m12, m4, m5
+    paddw          m13, m6, m7
+    phaddw          m0, m4
+    phaddw          m1, m5
+    phaddw          m2, m6
+    phaddw          m3, m7
+
+    ; create aggregates [lower half]:
+    ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
+    ; m11=              m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
+    ; and [upper half]:
+    ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
+    ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
+    ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd
+
+    vbroadcasti128 m14, [shufw_210xxxxx]
+    pslldq          m4, m11, 2
+    psrldq         m11, 14
+    pslldq          m5, m12, 4
+    psrldq         m12, 12
+    pslldq          m6, m13, 6
+    psrldq         m13, 10
+    paddw           m4, m10
+    paddw          m11, m12
+    vpbroadcastd   m12, [div_table+44]
+    paddw           m5, m6
+    paddw          m11, m13                 ; partial_sum_alt[3/2] right
+    vbroadcasti128 m13, [div_table+32]
+    paddw           m4, m5                  ; partial_sum_alt[3/2] left
+    pshufb         m11, m14
+    punpckhwd       m6, m4, m11
+    punpcklwd       m4, m11
+    pmaddwd         m6, m6
+    pmaddwd         m4, m4
+    pmulld          m6, m12
+    pmulld          m4, m13
+    paddd           m4, m6                  ; cost7[a-d] | cost5[a-d]
+
+    ; create aggregates [lower half]:
+    ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
+    ; m1 =             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
+    ; and [upper half]:
+    ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
+    ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
+    ; and then shuffle m1 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd
+
+    pslldq          m5, m1, 2
+    psrldq          m1, 14
+    pslldq          m6, m2, 4
+    psrldq          m2, 12
+    pslldq          m7, m3, 6
+    psrldq          m3, 10
+    paddw           m5, m0
+    paddw           m1, m2
+    paddw           m6, m7
+    paddw           m1, m3                  ; partial_sum_alt[0/1] right
+    paddw           m5, m6                  ; partial_sum_alt[0/1] left
+    pshufb          m1, m14
+    punpckhwd       m6, m5, m1
+    punpcklwd       m5, m1
+    pmaddwd         m6, m6
+    pmaddwd         m5, m5
+    pmulld          m6, m12
+    pmulld          m5, m13
+    paddd           m5, m6                  ; cost1[a-d] | cost3[a-d]
+
+    mova           xm0, [pd_04512763+ 0]
+    mova           xm1, [pd_04512763+ 16]
+    phaddd          m9, m8
+    phaddd          m5, m4
+    phaddd          m9, m5
+    vpermd          m0, m9                  ; cost[0/4/2/6]
+    vpermd          m1, m9                  ; cost[1/5/3/7]
+
+    ; now find the best cost, its idx^4 complement, and its idx
+    pcmpgtd        xm2, xm1, xm0            ; [1/5/3/7] > [0/4/2/6]
+    pand           xm3, xm2, xm1
+    pandn          xm4, xm2, xm0
+    por            xm3, xm4                 ; higher 4 values
+    pshufd         xm1, xm1, q2301
+    pshufd         xm0, xm0, q2301
+    pand           xm1, xm2, xm1
+    pandn          xm0, xm2, xm0
+    por            xm0, xm1                 ; complementary 4 values at idx^4 offset
+    pand          xm13, xm2, [pd_04261537+16]
+    pandn         xm14, xm2, [pd_04261537+ 0]
+    por           xm14, xm13                ; indices
+
+    punpckhqdq     xm4, xm3, xm0
+    punpcklqdq     xm3, xm0
+    pcmpgtd        xm5, xm4, xm3            ; [2or3-6or7] > [0or1/4or5]
+    punpcklqdq     xm5, xm5
+    pand           xm6, xm5, xm4
+    pandn          xm7, xm5, xm3
+    por            xm6, xm7                 ; { highest 2 values, complements at idx^4 }
+    movhlps       xm13, xm14
+    pand          xm13, xm5, xm13
+    pandn         xm14, xm5, xm14
+    por           xm14, xm13
+
+    pshufd         xm7, xm6, q3311
+    pcmpgtd        xm8, xm7, xm6            ; [4or5or6or7] > [0or1or2or3]
+    punpcklqdq     xm8, xm8
+    pand           xm9, xm8, xm7
+    pandn         xm10, xm8, xm6
+    por            xm9, xm10                ; max
+    movhlps       xm10, xm9                 ; complement at idx^4
+    psubd          xm9, xm10
+    psrld          xm9, 10
+    movd        [varq], xm9
+    pshufd        xm13, xm14, q1111
+    pand          xm13, xm8, xm13
+    pandn         xm14, xm8, xm14
+    por           xm14, xm13
+    movd           eax, xm14
+    RET
+%endif ; ARCH_X86_64
--- /dev/null
+++ b/src/x86/cdef_init_tmpl.c
@@ -1,0 +1,41 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
+
+void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+#if BITDEPTH == 8 && ARCH_X86_64
+    c->dir = dav1d_cdef_dir_avx2;
+#endif
+}