shithub: dav1d

--- a/src/itx.c

+++ b/src/itx.c

@@ -204,6 +204,8 @@

     c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \

         inv_txfm_add_identity_adst_##w##x##h##_c; \

+    memset(c, 0, sizeof(*c)); /* Zero unused function pointer elements. */

     c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;

     assign_itx_all_fn84( 4,  4, );

     assign_itx_all_fn84( 4,  8, R);

--- a/tests/checkasm/checkasm.c

+++ b/tests/checkasm/checkasm.c

@@ -51,6 +51,8 @@

     const char *name;

     void (*func)(void);

 } tests[] = {

+    { "itx_8bpc", checkasm_check_itx_8bpc },

+    { "itx_10bpc", checkasm_check_itx_10bpc },

     { "mc_8bpc", checkasm_check_mc_8bpc },

     { "mc_10bpc", checkasm_check_mc_10bpc },

     { 0 }

@@ -253,7 +255,7 @@

 /* Get the suffix of the specified cpu flag */

 static const char *cpu_suffix(const unsigned cpu) {

-    for (int i = sizeof(cpus) / sizeof(*cpus) - 2; i >= 0; i--)

+    for (int i = (int)(sizeof(cpus) / sizeof(*cpus)) - 2; i >= 0; i--)

         if (cpu & cpus[i].flag)

             return cpus[i].suffix;

@@ -411,11 +413,6 @@

 #endif

     int ret = 0;

-    /*if (!tests[0].func || !cpus[0].flag) {

-        fprintf(stderr, "checkasm: no tests to perform\n");

-        return 0;

-    }*/

     while (argc > 1) {

         if (!strncmp(argv[1], "--bench", 7)) {

 #ifndef readtime

@@ -445,7 +442,9 @@

     for (int i = 0; cpus[i].flag; i++)

         check_cpu_flag(cpus[i].name, cpus[i].flag);

-    if (state.num_failed) {

+    if (!state.num_checked) {

+        fprintf(stderr, "checkasm: no tests to perform\n");

+    } else if (state.num_failed) {

         fprintf(stderr, "checkasm: %d of %d tests have failed\n",

                 state.num_failed, state.num_checked);

         ret = 1;

--- a/tests/checkasm/checkasm.h

+++ b/tests/checkasm/checkasm.h

@@ -36,6 +36,8 @@

 #include "include/common/attributes.h"

 #include "include/common/intops.h"

+void checkasm_check_itx_8bpc(void);

+void checkasm_check_itx_10bpc(void);

 void checkasm_check_mc_8bpc(void);

 void checkasm_check_mc_10bpc(void);

--- /dev/null

+++ b/tests/checkasm/itx.c

@@ -1,0 +1,279 @@

+/*

+ * Copyright © 2018, VideoLAN and dav1d authors

+ * Copyright © 2018, Two Orioles, LLC

+ * All rights reserved.

+ *

+ * Redistribution and use in source and binary forms, with or without

+ * modification, are permitted provided that the following conditions are met:

+ *

+ * 1. Redistributions of source code must retain the above copyright notice, this

+ *    list of conditions and the following disclaimer.

+ *

+ * 2. Redistributions in binary form must reproduce the above copyright notice,

+ *    this list of conditions and the following disclaimer in the documentation

+ *    and/or other materials provided with the distribution.

+ *

+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR

+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ */

+#include "tests/checkasm/checkasm.h"

+#include <math.h>

+#include "src/itx.h"

+#include "src/levels.h"

+#include "src/scan.h"

+#include "src/tables.h"

+#ifndef M_PI

+#define M_PI 3.14159265358979323846

+#endif

+#ifndef M_SQRT1_2

+#define M_SQRT1_2 0.707106781186547524401

+#endif

+enum Tx1D { DCT, ADST, FLIPADST, IDENTITY, WHT };

+static const uint8_t itx_1d_types[N_TX_TYPES_PLUS_LL][2] = {

+    [DCT_DCT]           = { DCT,      DCT      },

+    [ADST_DCT]          = { DCT,      ADST     },

+    [DCT_ADST]          = { ADST,     DCT      },

+    [ADST_ADST]         = { ADST,     ADST     },

+    [FLIPADST_DCT]      = { DCT,      FLIPADST },

+    [DCT_FLIPADST]      = { FLIPADST, DCT      },

+    [FLIPADST_FLIPADST] = { FLIPADST, FLIPADST },

+    [ADST_FLIPADST]     = { FLIPADST, ADST     },

+    [FLIPADST_ADST]     = { ADST,     FLIPADST },

+    [IDTX]              = { IDENTITY, IDENTITY },

+    [V_DCT]             = { IDENTITY, DCT      },

+    [H_DCT]             = { DCT,      IDENTITY },

+    [V_ADST]            = { IDENTITY, ADST     },

+    [H_ADST]            = { ADST,     IDENTITY },

+    [V_FLIPADST]        = { IDENTITY, FLIPADST },

+    [H_FLIPADST]        = { FLIPADST, IDENTITY },

+    [WHT_WHT]           = { WHT,      WHT      },

+};

+static const char *const itx_1d_names[5] = {

+    [DCT]      = "dct",

+    [ADST]     = "adst",

+    [FLIPADST] = "flipadst",

+    [IDENTITY] = "identity",

+    [WHT]      = "wht"

+};

+static const double scaling_factors[9] = {

+    4.00,             /*  4x4                          */

+    4.00 * M_SQRT1_2, /*  4x8   8x4                    */

+    2.00,             /*  4x16  8x8  16x4              */

+    2.00 * M_SQRT1_2, /*        8x16 16x8              */

+    1.00,             /*        8x32 16x16 32x8        */

+    1.00 * M_SQRT1_2, /*             16x32 32x16       */

+    0.50,             /*             16x64 32x32 64x16 */

+    0.50 * M_SQRT1_2, /*                   32x64 64x32 */

+    0.25,             /*                         64x64 */

+};

+/* FIXME: Ensure that those forward transforms are similar to the real AV1

+ * transforms. The FLIPADST currently uses the ADST forward transform for

+ * example which is obviously "incorrect", but we're just using it for now

+ * since it does produce coefficients in the correct range at least. */

+/* DCT-II */

+static void fdct_1d(double *const out, const double *const in, const int sz) {

+    for (int i = 0; i < sz; i++) {

+        out[i] = 0.0;

+        for (int j = 0; j < sz; j++)

+            out[i] += in[j] * cos(M_PI * (2 * j + 1) * i / (sz * 2.0));

+    }

+    out[0] *= M_SQRT1_2;

+}

+/* See "Towards jointly optimal spatial prediction and adaptive transform in

+ * video/image coding", by J. Han, A. Saxena, and K. Rose

+ * IEEE Proc. ICASSP, pp. 726-729, Mar. 2010.

+ * and "A Butterfly Structured Design of The Hybrid Transform Coding Scheme",

+ * by Jingning Han, Yaowu Xu, and Debargha Mukherjee

+ * http://research.google.com/pubs/archive/41418.pdf

+ */

+static void fadst_1d(double *const out, const double *const in, const int sz) {

+    for (int i = 0; i < sz; i++) {

+        out[i] = 0.0;

+        for (int j = 0; j < sz; j++)

+            out[i] += in[j] * sin(M_PI *

+            (sz == 4 ? (    j + 1) * (2 * i + 1) / (8.0 + 1.0) :

+                       (2 * j + 1) * (2 * i + 1) / (sz * 4.0)));

+    }

+}

+static void fwht4_1d(double *const out, const double *const in)

+{

+    const double t0 = in[0] + in[1];

+    const double t3 = in[3] - in[2];

+    const double t4 = (t0 - t3) * 0.5;

+    const double t1 = t4 - in[1];

+    const double t2 = t4 - in[2];

+    out[0] = t0 - t2;

+    out[1] = t2;

+    out[2] = t3 + t1;

+    out[3] = t1;

+}

+static int copy_subcoefs(coef *coeff,

+                         const enum RectTxfmSize tx, const enum TxfmType txtp,

+                         const int sw, const int sh, const int subsh)

+{

+    /* copy the topleft coefficients such that the return value (being the

+     * coefficient scantable index for the eob token) guarantees that only

+     * the topleft $sub out of $sz (where $sz >= $sub) coefficients in both

+     * dimensions are non-zero. This leads to braching to specific optimized

+     * simd versions (e.g. dc-only) so that we get full asm coverage in this

+     * test */

+    const int16_t *const scan = av1_scans[tx][av1_tx_type_class[txtp]];

+    const int sub_high = subsh > 0 ? subsh * 8 - 1 : 0;

+    const int sub_low  = subsh > 1 ? sub_high - 8 : 0;

+    int n, eob;

+    for (n = 0, eob = 0; n < sw * sh; n++) {

+        const int rc = scan[n];

+        const int rcx = rc % sh, rcy = rc / sh;

+        /* Pick a random eob within this sub-itx */

+        if (rcx > sub_high || rcy > sub_high) {

+            break; /* upper boundary */

+        } else if (!eob && (rcx > sub_low || rcy > sub_low))

+            eob = n; /* lower boundary */

+    }

+    if (eob)

+        eob += rand() % (n - eob - 1);

+    for (n = eob + 1; n < sw * sh; n++)

+        coeff[scan[n]] = 0;

+    return eob;

+}

+static int ftx(coef *const buf, const enum RectTxfmSize tx,

+               const enum TxfmType txtp, const int w, const int h,

+               const int subsh)

+{

+    double out[64 * 64], temp[64 * 64];

+    const double scale = scaling_factors[ctz(w * h) - 4];

+    const int sw = imin(w, 32), sh = imin(h, 32);

+    for (int i = 0; i < h; i++) {

+        double in[64], temp_out[64];

+        for (int i = 0; i < w; i++)

+            in[i] = (rand() & ((2 << BITDEPTH) - 1)) - ((1 << BITDEPTH) - 1);

+        switch (itx_1d_types[txtp][0]) {

+        case DCT:

+            fdct_1d(temp_out, in, w);

+            break;

+        case ADST:

+        case FLIPADST:

+            fadst_1d(temp_out, in, w);

+            break;

+        case WHT:

+            fwht4_1d(temp_out, in);

+            break;

+        case IDENTITY:

+            memcpy(temp_out, in, w * sizeof(*temp_out));

+            break;

+        }

+        for (int j = 0; j < w; j++)

+            temp[j * h + i] = temp_out[j] * scale;

+    }

+    for (int i = 0; i < w; i++) {

+        switch (itx_1d_types[txtp][0]) {

+        case DCT:

+            fdct_1d(&out[i * h], &temp[i * h], h);

+            break;

+        case ADST:

+        case FLIPADST:

+            fadst_1d(&out[i * h], &temp[i * h], h);

+            break;

+        case WHT:

+            fwht4_1d(&out[i * h], &temp[i * h]);

+            break;

+        case IDENTITY:

+            memcpy(&out[i * h], &temp[i * h], h * sizeof(*out));

+            break;

+        }

+    }

+    for (int y = 0; y < sh; y++)

+        for (int x = 0; x < sw; x++)

+            buf[y * sw + x] = out[y * w + x] + 0.5;

+    return copy_subcoefs(buf, tx, txtp, sw, sh, subsh);

+}

+void bitfn(checkasm_check_itx)(void) {

+    Dav1dInvTxfmDSPContext c;

+    bitfn(dav1d_itx_dsp_init)(&c);

+    ALIGN_STK_32(coef, coeff, 3, [32 * 32]);

+    ALIGN_STK_32(pixel, c_dst, 64 * 64,);

+    ALIGN_STK_32(pixel, a_dst, 64 * 64,);

+    static const uint8_t txfm_size_order[N_RECT_TX_SIZES] = {

+        TX_4X4,   RTX_4X8,  RTX_4X16,

+        RTX_8X4,  TX_8X8,   RTX_8X16,  RTX_8X32,

+        RTX_16X4, RTX_16X8, TX_16X16,  RTX_16X32, RTX_16X64,

+                  RTX_32X8, RTX_32X16, TX_32X32,  RTX_32X64,

+                            RTX_64X16, RTX_64X32, TX_64X64

+    };

+    static const uint8_t subsh_iters[5] = { 2, 2, 3, 5, 5 };

+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);

+    for (int i = 0; i < N_RECT_TX_SIZES; i++) {

+        const enum RectTxfmSize tx = txfm_size_order[i];

+        const int w = av1_txfm_dimensions[tx].w * 4;

+        const int h = av1_txfm_dimensions[tx].h * 4;

+        const int sw = imin(w, 32), sh = imin(h, 32);

+        const int subsh_max = subsh_iters[imax(av1_txfm_dimensions[tx].lw,

+                                               av1_txfm_dimensions[tx].lh)];

+        for (enum TxfmType txtp = 0; txtp < N_TX_TYPES_PLUS_LL; txtp++)

+            for (int subsh = 0; subsh < subsh_max; subsh++)

+                if (check_func(c.itxfm_add[tx][txtp],

+                               "inv_txfm_add_%dx%d_%s_%s_%d_%dbpc",

+                               w, h, itx_1d_names[itx_1d_types[txtp][0]],

+                               itx_1d_names[itx_1d_types[txtp][1]], subsh,

+                               BITDEPTH))

+                {

+                    const int eob = ftx(coeff[0], tx, txtp, w, h, subsh);

+                    for (int j = 0; j < w * h; j++)

+                        c_dst[j] = a_dst[j] = rand() & ((1 << BITDEPTH) - 1);

+                    memcpy(coeff[1], coeff[0], sw * sh * sizeof(**coeff));

+                    memcpy(coeff[2], coeff[0], sw * sh * sizeof(**coeff));

+                    call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob);

+                    call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob);

+                    if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)) ||

+                        memcmp(coeff[0], coeff[1], sw * sh * sizeof(**coeff)))

+                    {

+                        fail();

+                    }

+                    bench_new(a_dst, w * sizeof(*c_dst), coeff[2], eob);

+                }

+        report("add_%dx%d", w, h);

+    }

+}

--- a/tests/meson.build

+++ b/tests/meson.build

@@ -34,7 +34,10 @@

 if is_asm_enabled

     checkasm_sources = files('checkasm/checkasm.c')

-    checkasm_tmpl_sources = files('checkasm/mc.c')

+    checkasm_tmpl_sources = files(

+        'checkasm/itx.c',

+        'checkasm/mc.c',

+    )

     checkasm_bitdepth_objs = []

     foreach bitdepth : dav1d_bitdepths

@@ -58,6 +61,8 @@

         checkasm_nasm_objs = nasm_gen.process(files('checkasm/x86/checkasm.asm'))

     endif

+    m_lib = cc.find_library('m', required: false)

     checkasm = executable('checkasm',

         checkasm_sources,

         checkasm_nasm_objs,

@@ -71,7 +76,7 @@

         include_directories: dav1d_inc_dirs,

         c_args: [stackalign_flag, stackrealign_flag],

         build_by_default: false,

-        dependencies : [thread_dependency],

+        dependencies : [thread_dependency, m_lib],

     test('checkasm test', checkasm)

--

⑨