shithub: dav1d

ref: 80e47425e6ca9834e19930ce9663009e421f314c
parent: 1400b028cf4e24eaeab74f036f04ea8fa79240fa
author: Janne Grunau <janne-vlc@jannau.net>
date: Sat Sep 29 10:49:43 EDT 2018

arm64/mc: add 8-bit neon asm for avg, w_avg and mask

checkasm --bench on a Qualcomm Kryo (Snapdragon 820); a C sketch of the three operations follows the timings:
nop: 33.0
avg_w4_8bpc_c: 450.5
avg_w4_8bpc_neon: 20.1
avg_w8_8bpc_c: 438.6
avg_w8_8bpc_neon: 45.2
avg_w16_8bpc_c: 1003.7
avg_w16_8bpc_neon: 112.8
avg_w32_8bpc_c: 3249.6
avg_w32_8bpc_neon: 429.9
avg_w64_8bpc_c: 7213.3
avg_w64_8bpc_neon: 1299.4
avg_w128_8bpc_c: 16791.3
avg_w128_8bpc_neon: 2978.4
w_avg_w4_8bpc_c: 605.7
w_avg_w4_8bpc_neon: 30.9
w_avg_w8_8bpc_c: 545.8
w_avg_w8_8bpc_neon: 72.9
w_avg_w16_8bpc_c: 1430.1
w_avg_w16_8bpc_neon: 193.5
w_avg_w32_8bpc_c: 4876.3
w_avg_w32_8bpc_neon: 715.3
w_avg_w64_8bpc_c: 11338.0
w_avg_w64_8bpc_neon: 2147.0
w_avg_w128_8bpc_c: 26822.0
w_avg_w128_8bpc_neon: 4596.3
mask_w4_8bpc_c: 604.6
mask_w4_8bpc_neon: 37.2
mask_w8_8bpc_c: 654.8
mask_w8_8bpc_neon: 96.0
mask_w16_8bpc_c: 1663.0
mask_w16_8bpc_neon: 272.4
mask_w32_8bpc_c: 5707.6
mask_w32_8bpc_neon: 1028.9
mask_w64_8bpc_c: 12735.3
mask_w64_8bpc_neon: 2533.2
mask_w128_8bpc_c: 31027.6
mask_w128_8bpc_neon: 6247.2
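
For orientation: avg, w_avg and mask blend two buffers of 16-bit intermediates (the output of the compound prediction prep step) down to 8-bit pixels. Below is a minimal C sketch of the 8bpc semantics, with the rounding constants and operand roles read off the assembly in this patch; clip_u8 and mask_sketch are illustrative names, not dav1d's reference code.

#include <stddef.h>
#include <stdint.h>

/* clamp an intermediate result to the 8-bit pixel range */
static inline uint8_t clip_u8(const int v) {
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t) v;
}

/* avg:   (tmp1 + tmp2 + 16) >> 5
 * w_avg: (tmp1 * weight + tmp2 * (16 - weight) + 128) >> 8, weight in 0..16
 * mask:  (tmp1 * m + tmp2 * (64 - m) + 512) >> 10, per-pixel m in 0..64 */
static void mask_sketch(uint8_t *dst, const ptrdiff_t dst_stride,
                        const int16_t *tmp1, const int16_t *tmp2,
                        const int w, const int h, const uint8_t *msk)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            const int m = *msk++;
            dst[x] = clip_u8((*tmp1++ * m + *tmp2++ * (64 - m) + 512) >> 10);
        }
        dst += dst_stride;
    }
}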

--- /dev/null
+++ b/src/arm/64/mc.S
@@ -1,0 +1,247 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#if BITDEPTH == 8
+
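+// avg: dst = sat_u8((tmp1 + tmp2 + 16) >> 5), eight pixels per invocation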
+.macro avg dst, t0, t1
+        ld1             {\t0\().8h},   [x2],  16
+        ld1             {\t1\().8h},   [x3],  16
+        add             \t0\().8h,   \t0\().8h,   \t1\().8h
+        sqrshrun        \dst\().8b,  \t0\().8h,   #5
+.endm
+
+.macro avg16 dst, t0, t1, t2, t3
+        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
+        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
+        add             \t0\().8h,   \t0\().8h,   \t2\().8h
+        add             \t1\().8h,   \t1\().8h,   \t3\().8h
+        sqrshrun        \dst\().8b,  \t0\().8h,   #5
+        sqrshrun2       \dst\().16b, \t1\().8h,   #5
+.endm
+
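+// w_avg: v30 holds -(weight << 11), so sqdmulh(tmp2 - tmp1, v30) computes
+// ((tmp1 - tmp2) * weight) >> 4; adding tmp2 and narrowing with sqrshrun #4
+// gives (tmp1 * weight + tmp2 * (16 - weight) + 128) >> 8, saturated to u8.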
+.macro w_avg dst, t0, t1
+        ld1             {\t0\().8h},   [x2],  16
+        ld1             {\t1\().8h},   [x3],  16
+        sub             \t0\().8h,   \t1\().8h,   \t0\().8h
+        sqdmulh         \t0\().8h,   \t0\().8h,   v30.8h
+        add             \t0\().8h,   \t1\().8h,   \t0\().8h
+        sqrshrun        \dst\().8b,  \t0\().8h,   #4
+.endm
+
+.macro w_avg16 dst, t0, t1, t2, t3
+        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
+        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
+        sub             \t0\().8h,   \t2\().8h,   \t0\().8h
+        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
+        sqdmulh         \t0\().8h,   \t0\().8h,   v30.8h
+        sqdmulh         \t1\().8h,   \t1\().8h,   v30.8h
+        add             \t0\().8h,   \t2\().8h,   \t0\().8h
+        add             \t1\().8h,   \t3\().8h,   \t1\().8h
+        sqrshrun        \dst\().8b,  \t0\().8h,   #4
+        sqrshrun2       \dst\().16b, \t1\().8h,   #4
+.endm
+
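+// mask: the per-pixel mask m (0..64) is multiplied by 254 (== -2 mod 256) and
+// widened with shll #8 to -(m << 9); sqdmulh, add and sqrshrun #4 then
+// produce (tmp1 * m + tmp2 * (64 - m) + 512) >> 10, saturated to u8.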
+.macro mask dst, t0, t1
+        ld1             {v30.8b},      [x6],  8
+        ld1             {\t0\().8h},   [x2],  16
+        mul             v30.8b, v30.8b, v31.8b
+        ld1             {\t1\().8h},   [x3],  16
+        shll            v30.8h, v30.8b, #8
+        sub             \t0\().8h,   \t1\().8h,   \t0\().8h
+        sqdmulh         \t0\().8h,   \t0\().8h,   v30.8h
+        add             \t0\().8h,   \t1\().8h,   \t0\().8h
+        sqrshrun        \dst\().8b,  \t0\().8h,   #4
+.endm
+
+.macro mask16 dst, t0, t1, t2, t3
+        ld1             {v30.16b}, [x6],  16
+        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
+        mul             v30.16b, v30.16b, v31.16b
+        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
+        shll            v28.8h, v30.8b,  #8
+        shll2           v29.8h, v30.16b, #8
+        sub             \t0\().8h,   \t2\().8h,   \t0\().8h
+        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
+        sqdmulh         \t0\().8h,   \t0\().8h,   v28.8h
+        sqdmulh         \t1\().8h,   \t1\().8h,   v29.8h
+        add             \t0\().8h,   \t2\().8h,   \t0\().8h
+        add             \t1\().8h,   \t3\().8h,   \t1\().8h
+        sqrshrun        \dst\().8b,  \t0\().8h,   #4
+        sqrshrun2       \dst\().16b, \t1\().8h,   #4
+.endm
+
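+// rbit + clz turn the power-of-two width into log2(width), which indexes the
+// .hword table at the end of the function; each entry stores the distance from
+// the table back to the width-specific store loop, recovered with sub + br.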
+.macro bidir_fn type
+function \type\()_8bpc_neon, export=1
+.ifc \type, w_avg
+        dup             v30.8h, w6
+        neg             v30.8h, v30.8h
+        shl             v30.8h, v30.8h, #11
+.endif
+.ifc \type, mask
+        movi            v31.16b, #256-2
+.endif
+        rbit            w4,  w4
+        adr             x7,  \type\()_tbl
+        clz             w4,  w4
+        \type           v4,  v0,  v1
+        ldrh            w4,  [x7, x4, lsl #1]
+        \type           v5,  v2,  v3
+        sub             x7,  x7,  w4, uxth
+        br              x7
+4:
+        cmp             w5,  #4
+        st1             {v4.s}[0],  [x0], x1
+        st1             {v4.s}[1],  [x0], x1
+        st1             {v5.s}[0],  [x0], x1
+        st1             {v5.s}[1],  [x0], x1
+        b.eq            0f
+        \type           v6,  v0,  v1
+        \type           v7,  v2,  v3
+        cmp             w5,  #8
+        st1             {v6.s}[0],  [x0], x1
+        st1             {v6.s}[1],  [x0], x1
+        st1             {v7.s}[0],  [x0], x1
+        st1             {v7.s}[1],  [x0], x1
+        b.eq            0f
+        \type           v4,  v0,  v1
+        \type           v5,  v2,  v3
+        st1             {v4.s}[0],  [x0], x1
+        st1             {v4.s}[1],  [x0], x1
+        \type           v6,  v0,  v1
+        st1             {v5.s}[0],  [x0], x1
+        st1             {v5.s}[1],  [x0], x1
+        \type           v7,  v2,  v3
+        st1             {v6.s}[0],  [x0], x1
+        st1             {v6.s}[1],  [x0], x1
+        st1             {v7.s}[0],  [x0], x1
+        st1             {v7.s}[1],  [x0], x1
+        ret
+8:
+        st1             {v4.8b},  [x0], x1
+        \type           v6,  v0,  v1
+        st1             {v5.8b},  [x0], x1
+        \type           v7,  v0,  v1
+        st1             {v6.8b},  [x0], x1
+        subs            w5,  w5,  #4
+        st1             {v7.8b},  [x0], x1
+        b.le            0f
+        \type           v4,  v0,  v1
+        \type           v5,  v2,  v3
+        b               8b
+160:
+        trn1            v4.2d,  v4.2d,  v5.2d
+16:
+        \type\()16      v5, v0, v1, v2, v3
+        st1             {v4.16b}, [x0], x1
+        \type\()16      v6, v0, v1, v2, v3
+        st1             {v5.16b}, [x0], x1
+        \type\()16      v7, v0, v1, v2, v3
+        st1             {v6.16b}, [x0], x1
+        subs            w5,  w5,  #4
+        st1             {v7.16b}, [x0], x1
+        b.le            0f
+        \type\()16      v4, v0, v1, v2, v3
+        b               16b
+320:
+        trn1            v4.2d,  v4.2d,  v5.2d
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
+32:
+        \type\()16      v5, v0, v1, v2, v3
+        \type\()16      v6, v0, v1, v2, v3
+        st1             {v4.16b,v5.16b}, [x0], x1
+        \type\()16      v7, v0, v1, v2, v3
+        subs            w5,  w5,  #2
+        st1             {v6.16b,v7.16b}, [x7], x1
+        b.le            0f
+        \type\()16      v4, v0, v1, v2, v3
+        b               32b
+640:
+        trn1            v4.2d,  v4.2d,  v5.2d
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
+64:
+        \type\()16      v5,  v0, v1, v2, v3
+        \type\()16      v6,  v0, v1, v2, v3
+        \type\()16      v7,  v0, v1, v2, v3
+        \type\()16      v16, v0, v1, v2, v3
+        \type\()16      v17, v0, v1, v2, v3
+        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
+        \type\()16      v18, v0, v1, v2, v3
+        \type\()16      v19, v0, v1, v2, v3
+        subs            w5,  w5,  #2
+        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
+        b.le            0f
+        \type\()16      v4, v0, v1, v2, v3
+        b               64b
+1280:
+        trn1            v4.2d,  v4.2d,  v5.2d
+        add             x7,  x0,  #64
+128:
+        \type\()16      v5,  v0, v1, v2, v3
+        \type\()16      v6,  v0, v1, v2, v3
+        \type\()16      v7,  v0, v1, v2, v3
+        \type\()16      v16, v0, v1, v2, v3
+        \type\()16      v17, v0, v1, v2, v3
+        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
+        \type\()16      v18, v0, v1, v2, v3
+        \type\()16      v19, v0, v1, v2, v3
+        subs            w5,  w5,  #1
+        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
+        b.le            0f
+        \type\()16      v4, v0, v1, v2, v3
+        b               128b
+0:
+        ret
+\type\()_tbl:
+        .hword 0, 0
+        .hword \type\()_tbl -    4b
+        .hword \type\()_tbl -    8b
+        .hword \type\()_tbl -  160b
+        .hword \type\()_tbl -  320b
+        .hword \type\()_tbl -  640b
+        .hword \type\()_tbl - 1280b
+endfunc
+.endm
+
+bidir_fn avg
+bidir_fn w_avg
+bidir_fn mask
+
+#endif /* BITDEPTH == 8 */
--- /dev/null
+++ b/src/arm/mc_init.c
@@ -1,0 +1,47 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/mc.h"
+#include "src/cpu.h"
+
+decl_avg_fn(dav1d_avg_8bpc_neon);
+decl_w_avg_fn(dav1d_w_avg_8bpc_neon);
+decl_mask_fn(dav1d_mask_8bpc_neon);
+
+void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if BITDEPTH == 8 && ARCH_AARCH64
+    c->avg = dav1d_avg_8bpc_neon;
+    c->w_avg = dav1d_w_avg_8bpc_neon;
+    c->mask = dav1d_mask_8bpc_neon;
+#endif
+}
--- a/src/mc.c
+++ b/src/mc.c
@@ -532,7 +532,11 @@
     c->warp8x8  = warp_affine_8x8_c;
     c->warp8x8t = warp_affine_8x8t_c;
 
-#if HAVE_ASM && ARCH_X86
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_mc_dsp_init_arm)(c);
+#elif ARCH_X86
     bitfn(dav1d_mc_dsp_init_x86)(c);
+#endif
 #endif
 }
--- a/src/mc.h
+++ b/src/mc.h
@@ -101,6 +101,9 @@
 void dav1d_mc_dsp_init_8bpc(Dav1dMCDSPContext *c);
 void dav1d_mc_dsp_init_10bpc(Dav1dMCDSPContext *c);
 
+void dav1d_mc_dsp_init_arm_8bpc(Dav1dMCDSPContext *c);
+void dav1d_mc_dsp_init_arm_10bpc(Dav1dMCDSPContext *c);
+
 void dav1d_mc_dsp_init_x86_8bpc(Dav1dMCDSPContext *c);
 void dav1d_mc_dsp_init_x86_10bpc(Dav1dMCDSPContext *c);
 
--- a/src/meson.build
+++ b/src/meson.build
@@ -83,9 +83,11 @@
             'arm/cpu.c',
         )
         libdav1d_tmpl_sources += files(
+            'arm/mc_init.c',
         )
         if host_machine.cpu_family() == 'aarch64'
             libdav1d_tmpl_sources += files(
+                'arm/64/mc.S',
             )
         elif host_machine.cpu_family().startswith('arm')
             libdav1d_tmpl_sources += files(