shithub: dav1d


ref: 701f88b9e3b60a9578ab1e98803c37561e41f466
parent: 53389fcda59c9b49f5e334a9aa212d59766824ca
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Oct 8 09:03:29 EDT 2018

x86: Add dc/h/v intra prediction AVX2 asm
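
The three predictors ported here have simple scalar definitions: DC fills the
block with the rounded average of the neighbouring top and left edge pixels,
H replicates each row's left neighbour across that row, and V copies the row
above the block into every row. As a reference for what the asm below
computes, here is a minimal 8-bit C sketch of those semantics (illustrative
only, with ad-hoc function names; dav1d's actual C fallbacks in src/ipred.c
also cover the top-only, left-only and 128 variants and are laid out
differently):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* "topleft" points at the pixel above-left of the block, as in dav1d's
     * edge buffer layout: topleft[1..w] is the row above, topleft[-1..-h]
     * the column to the left. */
    static void dc_pred_ref(uint8_t *dst, const ptrdiff_t stride,
                            const uint8_t *const topleft,
                            const int w, const int h)
    {
        unsigned sum = 0;
        for (int x = 0; x < w; x++) sum += topleft[1 + x];    /* top edge  */
        for (int y = 0; y < h; y++) sum += topleft[-(1 + y)]; /* left edge */
        const unsigned dc = (sum + ((w + h) >> 1)) / (w + h); /* rounded avg */
        for (int y = 0; y < h; y++, dst += stride)
            memset(dst, dc, w);
    }

    static void h_pred_ref(uint8_t *dst, const ptrdiff_t stride,
                           const uint8_t *const topleft,
                           const int w, const int h)
    {
        for (int y = 0; y < h; y++, dst += stride)
            memset(dst, topleft[-(1 + y)], w); /* replicate left neighbour */
    }

    static void v_pred_ref(uint8_t *dst, const ptrdiff_t stride,
                           const uint8_t *const topleft,
                           const int w, const int h)
    {
        for (int y = 0; y < h; y++, dst += stride)
            memcpy(dst, topleft + 1, w);       /* replicate row above */
    }

The constants in the asm's .w4_mul/.w8/.w16/.w32/.w64 paths (0x5556, 0x3334,
etc.) are fixed-point reciprocal multipliers that, together with the shifts,
implement the division by w+h when the block is rectangular and w+h is not a
power of two.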

--- a/src/ipred.c
+++ b/src/ipred.c
@@ -758,4 +758,8 @@
     c->cfl_pred[3] = cfl_pred_32xN_c;
 
     c->pal_pred = pal_pred_c;
+
+#if HAVE_ASM && ARCH_X86
+    bitfn(dav1d_intra_pred_dsp_init_x86)(c);
+#endif
 }
--- a/src/ipred.h
+++ b/src/ipred.h
@@ -99,4 +99,7 @@
 void dav1d_intra_pred_dsp_init_8bpc(Dav1dIntraPredDSPContext *c);
 void dav1d_intra_pred_dsp_init_10bpc(Dav1dIntraPredDSPContext *c);
 
+void dav1d_intra_pred_dsp_init_x86_8bpc(Dav1dIntraPredDSPContext *c);
+void dav1d_intra_pred_dsp_init_x86_10bpc(Dav1dIntraPredDSPContext *c);
+
 #endif /* __DAV1D_SRC_IPRED_H__ */
--- a/src/meson.build
+++ b/src/meson.build
@@ -98,6 +98,7 @@
         )
 
         libdav1d_tmpl_sources += files(
+            'x86/ipred_init.c',
             'x86/itx_init.c',
             'x86/loopfilter_init.c',
             'x86/mc_init.c',
@@ -106,6 +107,7 @@
         # NASM source files
         libdav1d_sources_asm = files(
             'x86/cpuid.asm',
+            'x86/ipred.asm',
             'x86/itx.asm',
             'x86/loopfilter.asm',
             'x86/mc.asm',
--- a/src/recon.c
+++ b/src/recon.c
@@ -701,7 +701,8 @@
     const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
 
     // coefficient coding
-    pixel edge_mem[257], *const edge = &edge_mem[128];
+    ALIGN_STK_32(pixel, edge_buf, 257,);
+    pixel *const edge = edge_buf + 128;
     const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
 
     for (int init_y = 0; init_y < h4; init_y += 16) {
@@ -1106,7 +1107,8 @@
                 obmc(t, dst, f->cur.p.stride[0], b_dim, 0, bx4, by4, w4, h4);
         }
         if (b->interintra_type) {
-            pixel tl_edge_px[65], *const tl_edge = &tl_edge_px[32];
+            ALIGN_STK_32(pixel, tl_edge_buf, 65,);
+            pixel *const tl_edge = tl_edge_buf + 32;
             enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
                                    SMOOTH_PRED : b->interintra_mode;
             pixel *const tmp = t->scratch.interintra;
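
The recon.c change promotes the two on-stack edge buffers that feed the intra
predictors from plain arrays to 32-byte-aligned ones: the AVX2 code loads the
left edge with aligned mova instructions ([tlq-16], [tlq-32], [tlq-64] in the
.h16/.h32/.h64 paths below), so the top-left pointer handed to the DSP
functions must sit on a 32-byte boundary, and the +128/+32 offsets preserve
that. A rough sketch of what ALIGN_STK_32() guarantees (illustrative; the real
macro lives elsewhere in the tree and also covers compilers without C11
alignment support), using uint8_t in place of dav1d's per-bitdepth pixel type:

    #include <stdalign.h>
    #include <stdint.h>

    void example(void) {
        alignas(32) uint8_t edge_buf[257];    /* starts on a 32-byte boundary */
        uint8_t *const edge = edge_buf + 128; /* 128 % 32 == 0, so "edge" stays aligned */
        (void)edge;
    }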
--- /dev/null
+++ b/src/x86/ipred.asm
@@ -1,0 +1,399 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+pb_128: times 4 db 128
+
+%macro JMP_TABLE 3-*
+    %xdefine %1_%2_table (%%table - 2*4)
+    %xdefine %%base mangle(private_prefix %+ _%1_%2)
+    %%table:
+    %rep %0 - 2
+        dd %%base %+ .%3 - (%%table - 2*4)
+        %rotate 1
+    %endrep
+%endmacro
+
+%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
+
+JMP_TABLE ipred_dc,      avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+                               s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_h,       avx2, w4, w8, w16, w32, w64
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
+    lea                  r5, [ipred_dc_left_avx2_table]
+    tzcnt                wd, wm
+    inc                 tlq
+    movu                 m0, [tlq]
+    movifnidn            hd, hm
+    mov                 r6d, 0x8000
+    shrx                r6d, r6d, wd
+    movd                xm3, r6d
+    movsxd               r6, [r5+wq*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, r5
+    add                  r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    jmp                  r6
+
+cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
+    mov                  hd, hm ; zero upper half
+    tzcnt               r6d, hd
+    sub                 tlq, hq
+    tzcnt                wd, wm
+    movu                 m0, [tlq]
+    mov                 r5d, 0x8000
+    shrx                r5d, r5d, r6d
+    movd                xm3, r5d
+    lea                  r5, [ipred_dc_left_avx2_table]
+    movsxd               r6, [r5+r6*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, r5
+    add                  r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    jmp                  r6
+.h64:
+    movu                 m1, [tlq+32] ; unaligned when jumping here from dc_top
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+.h32:
+    vextracti128        xm1, m0, 1
+    paddw               xm0, xm1
+.h16:
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+.h8:
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+.h4:
+    pmaddwd             xm0, xm2
+    pmulhrsw            xm0, xm3
+    lea            stride3q, [strideq*3]
+    vpbroadcastb         m0, xm0
+    mova                 m1, m0
+    jmp                  wq
+
+cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3
+    movifnidn            hd, hm
+    movifnidn            wd, wm
+    tzcnt               r6d, hd
+    lea                 r5d, [wq+hq]
+    movd                xm4, r5d
+    tzcnt               r5d, r5d
+    movd                xm5, r5d
+    lea                  r5, [ipred_dc_avx2_table]
+    tzcnt                wd, wd
+    movsxd               r6, [r5+r6*4]
+    movsxd               wq, [r5+wq*4+5*4]
+    pcmpeqd              m3, m3
+    psrlw               xm4, 1
+    add                  r6, r5
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  r6
+.h4:
+    movd                xm0, [tlq-4]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w4:
+    movd                xm1, [tlq+1]
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    cmp                  hd, 4
+    jg .w4_mul
+    psrlw               xm0, 3
+    jmp .w4_end
+.w4_mul:
+    punpckhqdq          xm1, xm0, xm0
+    lea                 r2d, [hq*2]
+    mov                 r6d, 0x55563334
+    paddw               xm0, xm1
+    shrx                r6d, r6d, r2d
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    movd                xm1, r6d
+    psrlw               xm0, 2
+    pmulhuw             xm0, xm1
+.w4_end:
+    vpbroadcastb        xm0, xm0
+.s4:
+    movd   [dstq+strideq*0], xm0
+    movd   [dstq+strideq*1], xm0
+    movd   [dstq+strideq*2], xm0
+    movd   [dstq+stride3q ], xm0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s4
+    RET
+ALIGN function_align
+.h8:
+    movq                xm0, [tlq-8]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w8:
+    movq                xm1, [tlq+1]
+    vextracti128        xm2, m0, 1
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4
+    paddw               xm0, xm2
+    punpckhqdq          xm2, xm0, xm0
+    paddw               xm0, xm2
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 8
+    je .w8_end
+    mov                 r6d, 0x5556
+    mov                 r2d, 0x3334
+    cmp                  hd, 32
+    cmovz               r6d, r2d
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w8_end:
+    vpbroadcastb        xm0, xm0
+.s8:
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xm0
+    movq   [dstq+strideq*2], xm0
+    movq   [dstq+stride3q ], xm0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s8
+    RET
+ALIGN function_align
+.h16:
+    mova                xm0, [tlq-16]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w16:
+    movu                xm1, [tlq+1]
+    vextracti128        xm2, m0, 1
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4
+    paddw               xm0, xm2
+    paddw               xm0, xm1
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 16
+    je .w16_end
+    mov                 r6d, 0x5556
+    mov                 r2d, 0x3334
+    test                 hb, 8|32
+    cmovz               r6d, r2d
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w16_end:
+    vpbroadcastb        xm0, xm0
+.s16:
+    mova   [dstq+strideq*0], xm0
+    mova   [dstq+strideq*1], xm0
+    mova   [dstq+strideq*2], xm0
+    mova   [dstq+stride3q ], xm0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s16
+    RET
+ALIGN function_align
+.h32:
+    mova                 m0, [tlq-32]
+    pmaddubsw            m0, m3
+    jmp                  wq
+.w32:
+    movu                 m1, [tlq+1]
+    pmaddubsw            m1, m3
+    paddw                m0, m1
+    vextracti128        xm1, m0, 1
+    psubw               xm0, xm4
+    paddw               xm0, xm1
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 32
+    je .w32_end
+    lea                 r2d, [hq*2]
+    mov                 r6d, 0x33345556
+    shrx                r6d, r6d, r2d
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w32_end:
+    vpbroadcastb         m0, xm0
+.s32:
+    mova   [dstq+strideq*0], m0
+    mova   [dstq+strideq*1], m0
+    mova   [dstq+strideq*2], m0
+    mova   [dstq+stride3q ], m0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s32
+    RET
+ALIGN function_align
+.h64:
+    mova                 m0, [tlq-64]
+    mova                 m1, [tlq-32]
+    pmaddubsw            m0, m3
+    pmaddubsw            m1, m3
+    paddw                m0, m1
+    jmp                  wq
+.w64:
+    movu                 m1, [tlq+ 1]
+    movu                 m2, [tlq+33]
+    pmaddubsw            m1, m3
+    pmaddubsw            m2, m3
+    paddw                m0, m1
+    paddw                m0, m2
+    vextracti128        xm1, m0, 1
+    psubw               xm0, xm4
+    paddw               xm0, xm1
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 64
+    je .w64_end
+    mov                 r6d, 0x33345556
+    shrx                r6d, r6d, hd
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w64_end:
+    vpbroadcastb         m0, xm0
+    mova                 m1, m0
+.s64:
+    mova [dstq+strideq*0+32*0], m0
+    mova [dstq+strideq*0+32*1], m1
+    mova [dstq+strideq*1+32*0], m0
+    mova [dstq+strideq*1+32*1], m1
+    mova [dstq+strideq*2+32*0], m0
+    mova [dstq+strideq*2+32*1], m1
+    mova [dstq+stride3q +32*0], m0
+    mova [dstq+stride3q +32*1], m1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s64
+    RET
+
+cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
+    lea                  r5, [ipred_dc_splat_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    vpbroadcastd         m0, [r5-ipred_dc_splat_avx2_table+pb_128]
+    mova                 m1, m0
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+
+cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
+    lea                  r5, [ipred_dc_splat_avx2_table]
+    tzcnt                wd, wm
+    movu                 m0, [tlq+ 1]
+    movu                 m1, [tlq+33]
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+
+%macro IPRED_H 2 ; w, store_type
+    vpbroadcastb         m0, [tlq-1]
+    vpbroadcastb         m1, [tlq-2]
+    vpbroadcastb         m2, [tlq-3]
+    sub                 tlq, 4
+    vpbroadcastb         m3, [tlq+0]
+    mov%2  [dstq+strideq*0], m0
+    mov%2  [dstq+strideq*1], m1
+    mov%2  [dstq+strideq*2], m2
+    mov%2  [dstq+stride3q ], m3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w%1
+    RET
+ALIGN function_align
+%endmacro
+
+INIT_XMM avx2
+cglobal ipred_h, 3, 6, 4, dst, stride, tl, w, h, stride3
+    lea                  r5, [ipred_h_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    IPRED_H               4, d
+.w8:
+    IPRED_H               8, q
+.w16:
+    IPRED_H              16, a
+INIT_YMM avx2
+.w32:
+    IPRED_H              32, a
+.w64:
+    vpbroadcastb         m0, [tlq-1]
+    vpbroadcastb         m1, [tlq-2]
+    vpbroadcastb         m2, [tlq-3]
+    sub                 tlq, 4
+    vpbroadcastb         m3, [tlq+0]
+    mova [dstq+strideq*0+32*0], m0
+    mova [dstq+strideq*0+32*1], m0
+    mova [dstq+strideq*1+32*0], m1
+    mova [dstq+strideq*1+32*1], m1
+    mova [dstq+strideq*2+32*0], m2
+    mova [dstq+strideq*2+32*1], m2
+    mova [dstq+stride3q +32*0], m3
+    mova [dstq+stride3q +32*1], m3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w64
+    RET
+
+%endif
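
A note on the dispatch scheme used throughout the asm above: JMP_TABLE emits
32-bit offsets relative to (table - 2*4), and each entry point uses tzcnt to
turn the relevant block dimension (a power of two from 4 to 64) into its log2
before indexing the table, so the smallest size (tzcnt = 2) lands on the first
entry. ipred_dc_splat_avx2_table is simply the second half (offset 10*4) of
the dc table, which lets ipred_dc_128, ipred_dc_top, ipred_dc_left and ipred_v
reuse the .s4/.s8/.s16/.s32/.s64 store loops. A C analogue of the
width-indexed dispatch (sketch only, with hypothetical names):

    #include <stdint.h>
    #include <stddef.h>

    typedef void (*store_fn)(uint8_t *dst, ptrdiff_t stride, int h);

    static void dispatch_by_width(uint8_t *const dst, const ptrdiff_t stride,
                                  const int w, const int h,
                                  const store_fn table[5])
    {
        /* tzcnt(w): 4 -> 2, 8 -> 3, ..., 64 -> 6; the "- 2*4" bias baked into
         * the table base in JMP_TABLE plays the same role as the "- 2" here. */
        const int idx = __builtin_ctz(w) - 2;
        table[idx](dst, stride, h);
    }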
--- /dev/null
+++ b/src/x86/ipred_init.c
@@ -1,0 +1,51 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+decl_angular_ipred_fn(dav1d_ipred_dc_avx2);
+decl_angular_ipred_fn(dav1d_ipred_dc_128_avx2);
+decl_angular_ipred_fn(dav1d_ipred_dc_top_avx2);
+decl_angular_ipred_fn(dav1d_ipred_dc_left_avx2);
+decl_angular_ipred_fn(dav1d_ipred_h_avx2);
+decl_angular_ipred_fn(dav1d_ipred_v_avx2);
+
+void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+#if BITDEPTH == 8 && ARCH_X86_64
+    c->intra_pred[DC_PRED]       = dav1d_ipred_dc_avx2;
+    c->intra_pred[DC_128_PRED]   = dav1d_ipred_dc_128_avx2;
+    c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_avx2;
+    c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_avx2;
+    c->intra_pred[HOR_PRED]      = dav1d_ipred_h_avx2;
+    c->intra_pred[VERT_PRED]     = dav1d_ipred_v_avx2;
+#endif
+}
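
ipred_init.c is a bitdepth template: it is listed under libdav1d_tmpl_sources
in meson.build and therefore compiled once per bitdepth, with bitfn()
expanding dav1d_intra_pred_dsp_init_x86 to the _8bpc/_10bpc names declared in
ipred.h. Only the 8-bit x86-64 build installs the new AVX2 pointers for now;
the other configurations fall through the BITDEPTH == 8 && ARCH_X86_64 guard
and leave the C functions in place. Roughly how bitfn() resolves (sketch; the
real macro is defined in dav1d's bitdepth template header):

    #if BITDEPTH == 8
    #define bitfn(x) x##_8bpc   /* -> dav1d_intra_pred_dsp_init_x86_8bpc */
    #else
    #define bitfn(x) x##_10bpc  /* -> dav1d_intra_pred_dsp_init_x86_10bpc */
    #endif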