shithub: dav1d

--- a/src/itx_1d.c

+++ b/src/itx_1d.c

@@ -1,6 +1,6 @@

/*

- * Copyright © 2018, VideoLAN and dav1d authors

- * Copyright © 2018, Two Orioles, LLC

+ * Copyright © 2018-2019, VideoLAN and dav1d authors

+ * Copyright © 2018-2019, Two Orioles, LLC

  * All rights reserved.

  * Redistribution and use in source and binary forms, with or without

@@ -30,8 +30,10 @@

 #include <stddef.h>

 #include <stdint.h>

-#include "common/attributes.h"

+#include "common/intops.h"

+#include "src/itx_1d.h"

 #define CLIP(a) iclip(a, min, max)

/*

@@ -60,9 +62,9 @@

  * wrap around.

*/

-static void NOINLINE

-inv_dct4_1d(const coef *const in, const ptrdiff_t in_s,

-            coef *const out, const ptrdiff_t out_s, const int max)

+void dav1d_inv_dct4_1d_c(const int32_t *const in, const ptrdiff_t in_s,

+                         int32_t *const out, const ptrdiff_t out_s,

+                         const int max)

     const int min = -max - 1;

     const int in0 = in[0 * in_s], in1 = in[1 * in_s];

@@ -79,14 +81,14 @@

     out[3 * out_s] = CLIP(t0 - t3);

-static void NOINLINE

-inv_dct8_1d(const coef *const in, const ptrdiff_t in_s,

-            coef *const out, const ptrdiff_t out_s, const int max)

+void dav1d_inv_dct8_1d_c(const int32_t *const in, const ptrdiff_t in_s,

+                         int32_t *const out, const ptrdiff_t out_s,

+                         const int max)

     const int min = -max - 1;

-    coef tmp[4];

+    int32_t tmp[4];

-    inv_dct4_1d(in, in_s * 2, tmp, 1, max);

+    dav1d_inv_dct4_1d_c(in, in_s * 2, tmp, 1, max);

     const int in1 = in[1 * in_s], in3 = in[3 * in_s];

     const int in5 = in[5 * in_s], in7 = in[7 * in_s];

@@ -114,14 +116,14 @@

     out[7 * out_s] = CLIP(tmp[0] - t7);

-static void NOINLINE

-inv_dct16_1d(const coef *const in, const ptrdiff_t in_s,

-             coef *const out, const ptrdiff_t out_s, const int max)

+void dav1d_inv_dct16_1d_c(const int32_t *const in, const ptrdiff_t in_s,

+                          int32_t *const out, const ptrdiff_t out_s,

+                          const int max)

     const int min = -max - 1;

-    coef tmp[8];

+    int32_t tmp[8];

-    inv_dct8_1d(in, in_s * 2, tmp, 1, max);

+    dav1d_inv_dct8_1d_c(in, in_s * 2, tmp, 1, max);

     const int in1  = in[ 1 * in_s], in3  = in[ 3 * in_s];

     const int in5  = in[ 5 * in_s], in7  = in[ 7 * in_s];

@@ -183,14 +185,14 @@

     out[15 * out_s] = CLIP(tmp[0] - t15a);

-static void NOINLINE

-inv_dct32_1d(const coef *const in, const ptrdiff_t in_s,

-             coef *const out, const ptrdiff_t out_s, const int max)

+void dav1d_inv_dct32_1d_c(const int32_t *const in, const ptrdiff_t in_s,

+                          int32_t *const out, const ptrdiff_t out_s,

+                          const int max)

     const int min = -max - 1;

-    coef tmp[16];

+    int32_t tmp[16];

-    inv_dct16_1d(in, in_s * 2, tmp, 1, max);

+    dav1d_inv_dct16_1d_c(in, in_s * 2, tmp, 1, max);

     const int in1  = in[ 1 * in_s], in3  = in[ 3 * in_s];

     const int in5  = in[ 5 * in_s], in7  = in[ 7 * in_s];

@@ -330,14 +332,14 @@

     out[31 * out_s] = CLIP(tmp[ 0] - t31);

-static void NOINLINE

-inv_dct64_1d(const coef *const in, const ptrdiff_t in_s,

-             coef *const out, const ptrdiff_t out_s, const int max)

+void dav1d_inv_dct64_1d_c(const int32_t *const in, const ptrdiff_t in_s,

+                          int32_t *const out, const ptrdiff_t out_s,

+                          const int max)

     const int min = -max - 1;

-    coef tmp[32];

+    int32_t tmp[32];

-    inv_dct32_1d(in, in_s * 2, tmp, 1, max);

+    dav1d_inv_dct32_1d_c(in, in_s * 2, tmp, 1, max);

     const int in1  = in[ 1 * in_s], in3  = in[ 3 * in_s];

     const int in5  = in[ 5 * in_s], in7  = in[ 7 * in_s];

@@ -655,9 +657,9 @@

     out[63 * out_s] = CLIP(tmp[ 0] - t63a);

-static void NOINLINE

-inv_adst4_1d(const coef *const in, const ptrdiff_t in_s,

-             coef *const out, const ptrdiff_t out_s, const int range)

+void dav1d_inv_adst4_1d_c(const int32_t *const in, const ptrdiff_t in_s,

+                          int32_t *const out, const ptrdiff_t out_s,

+                          const int range)

     const int in0 = in[0 * in_s], in1 = in[1 * in_s];

     const int in2 = in[2 * in_s], in3 = in[3 * in_s];

@@ -674,9 +676,9 @@

                      in0 + in2 - in1;

-static void NOINLINE

-inv_adst8_1d(const coef *const in, const ptrdiff_t in_s,

-             coef *const out, const ptrdiff_t out_s, const int max)

+void dav1d_inv_adst8_1d_c(const int32_t *const in, const ptrdiff_t in_s,

+                          int32_t *const out, const ptrdiff_t out_s,

+                          const int max)

     const int min = -max - 1;

     const int in0 = in[0 * in_s], in1 = in[1 * in_s];

@@ -723,9 +725,9 @@

     out[5 * out_s] = -(((t6 - t7) * 181 + 128) >> 8);

-static void NOINLINE

-inv_adst16_1d(const coef *const in, const ptrdiff_t in_s,

-              coef *const out, const ptrdiff_t out_s, const int max)

+void dav1d_inv_adst16_1d_c(const int32_t *const in, const ptrdiff_t in_s,

+                           int32_t *const out, const ptrdiff_t out_s,

+                           const int max)

     const int min = -max - 1;

     const int in0  = in[ 0 * in_s], in1  = in[ 1 * in_s];

@@ -834,10 +836,11 @@

 #define flip_inv_adst(sz) \

-static void inv_flipadst##sz##_1d(const coef *const in, const ptrdiff_t in_s, \

-                                  coef *const out, const ptrdiff_t out_s, const int range) \

+void dav1d_inv_flipadst##sz##_1d_c(const int32_t *const in, const ptrdiff_t in_s, \

+                                   int32_t *const out, const ptrdiff_t out_s, \

+                                   const int range) \

{ \

-    inv_adst##sz##_1d(in, in_s, &out[(sz - 1) * out_s], -out_s, range); \

+    dav1d_inv_adst##sz##_1d_c(in, in_s, &out[(sz - 1) * out_s], -out_s, range); \

 flip_inv_adst(4)

@@ -846,42 +849,41 @@

 #undef flip_inv_adst

-static void NOINLINE

-inv_identity4_1d(const coef *const in, const ptrdiff_t in_s,

-                 coef *const out, const ptrdiff_t out_s, const int range)

+void dav1d_inv_identity4_1d_c(const int32_t *const in, const ptrdiff_t in_s,

+                              int32_t *const out, const ptrdiff_t out_s,

+                              const int range)

     for (int i = 0; i < 4; i++)

         out[out_s * i] = in[in_s * i] + ((in[in_s * i] * 1697 + 2048) >> 12);

-static void NOINLINE

-inv_identity8_1d(const coef *const in, const ptrdiff_t in_s,

-                 coef *const out, const ptrdiff_t out_s, const int range)

+void dav1d_inv_identity8_1d_c(const int32_t *const in, const ptrdiff_t in_s,

+                              int32_t *const out, const ptrdiff_t out_s,

+                              const int range)

     for (int i = 0; i < 8; i++)

         out[out_s * i] = in[in_s * i] * 2;

-static void NOINLINE

-inv_identity16_1d(const coef *const in, const ptrdiff_t in_s,

-                  coef *const out, const ptrdiff_t out_s, const int range)

+void dav1d_inv_identity16_1d_c(const int32_t *const in, const ptrdiff_t in_s,

+                               int32_t *const out, const ptrdiff_t out_s,

+                               const int range)

     for (int i = 0; i < 16; i++)

         out[out_s * i] = 2 * in[in_s * i] + ((in[in_s * i] * 1697 + 1024) >> 11);

-static void NOINLINE

-inv_identity32_1d(const coef *const in, const ptrdiff_t in_s,

-                  coef *const out, const ptrdiff_t out_s, const int range)

+void dav1d_inv_identity32_1d_c(const int32_t *const in, const ptrdiff_t in_s,

+                               int32_t *const out, const ptrdiff_t out_s,

+                               const int range)

     for (int i = 0; i < 32; i++)

         out[out_s * i] = in[in_s * i] * 4;

-static void NOINLINE

-inv_wht4_1d(const coef *const in, const ptrdiff_t in_s,

-            coef *const out, const ptrdiff_t out_s,

-            const int pass)

+void dav1d_inv_wht4_1d_c(const int32_t *const in, const ptrdiff_t in_s,

+                         int32_t *const out, const ptrdiff_t out_s,

+                         const int pass)

     const int sh = 2 * !pass;

     const int in0 = in[0 * in_s] >> sh, in1 = in[1 * in_s] >> sh;

--- /dev/null

+++ b/src/itx_1d.h

@@ -1,0 +1,60 @@

+/*

+ * Copyright © 2018-2019, VideoLAN and dav1d authors

+ * Copyright © 2018-2019, Two Orioles, LLC

+ * All rights reserved.

+ *

+ * Redistribution and use in source and binary forms, with or without

+ * modification, are permitted provided that the following conditions are met:

+ *

+ * 1. Redistributions of source code must retain the above copyright notice, this

+ *    list of conditions and the following disclaimer.

+ *

+ * 2. Redistributions in binary form must reproduce the above copyright notice,

+ *    this list of conditions and the following disclaimer in the documentation

+ *    and/or other materials provided with the distribution.

+ *

+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR

+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ */

+#include <stddef.h>

+#include <stdint.h>

+#ifndef DAV1D_SRC_ITX_1D_H

+#define DAV1D_SRC_ITX_1D_H

+#define decl_itx_1d_fn(name) \

+void (name)(const int32_t *in, ptrdiff_t in_s, \

+            int32_t *out, ptrdiff_t out_s, const int range)

+typedef decl_itx_1d_fn(*itx_1d_fn);

+decl_itx_1d_fn(dav1d_inv_dct4_1d_c);

+decl_itx_1d_fn(dav1d_inv_dct8_1d_c);

+decl_itx_1d_fn(dav1d_inv_dct16_1d_c);

+decl_itx_1d_fn(dav1d_inv_dct32_1d_c);

+decl_itx_1d_fn(dav1d_inv_dct64_1d_c);

+decl_itx_1d_fn(dav1d_inv_adst4_1d_c);

+decl_itx_1d_fn(dav1d_inv_adst8_1d_c);

+decl_itx_1d_fn(dav1d_inv_adst16_1d_c);

+decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c);

+decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c);

+decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c);

+decl_itx_1d_fn(dav1d_inv_identity4_1d_c);

+decl_itx_1d_fn(dav1d_inv_identity8_1d_c);

+decl_itx_1d_fn(dav1d_inv_identity16_1d_c);

+decl_itx_1d_fn(dav1d_inv_identity32_1d_c);

+decl_itx_1d_fn(dav1d_inv_wht4_1d_c);

+#endif /* DAV1D_SRC_ITX_1D_H */

--- a/src/itx_tmpl.c

+++ b/src/itx_tmpl.c

@@ -1,6 +1,6 @@

/*

- * Copyright © 2018, VideoLAN and dav1d authors

- * Copyright © 2018, Two Orioles, LLC

+ * Copyright © 2018-2019, VideoLAN and dav1d authors

+ * Copyright © 2018-2019, Two Orioles, LLC

  * All rights reserved.

  * Redistribution and use in source and binary forms, with or without

@@ -35,12 +35,8 @@

 #include "common/intops.h"

 #include "src/itx.h"

+#include "src/itx_1d.h"

-#include "src/itx_1d.c"

-typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,

-                          coef *out, ptrdiff_t out_s, const int range);

 static void NOINLINE

 inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,

                coef *const coeff, const int eob,

@@ -73,29 +69,21 @@

     const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);

     // Maximum value for h and w is 64

-    coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];

+    int32_t tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];

     const int row_clip_max = (1 << (bitdepth + 8 - 1)) - 1;

-    const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) -1;

+    const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) - 1;

     if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));

     for (i = 0; i < sh; i++) {

-        if (w != sw || is_rect2) {

-            for (j = 0; j < sw; j++) {

-                in_mem[j] = coeff[i + j * sh];

-                if (is_rect2)

-                    in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;

-            }

-            first_1d_fn(in_mem, 1, &tmp[i * w], 1, row_clip_max);

-        } else {

-            first_1d_fn(&coeff[i], sh, &tmp[i * w], 1, row_clip_max);

+        for (j = 0; j < sw; j++) {

+            in_mem[j] = coeff[i + j * sh];

+            if (is_rect2)

+                in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;

+        first_1d_fn(in_mem, 1, &tmp[i * w], 1, row_clip_max);

         for (j = 0; j < w; j++)

-#if BITDEPTH == 8

-            tmp[i * w + j] = (tmp[i * w + j] + rnd) >> shift;

-#else

             tmp[i * w + j] = iclip((tmp[i * w + j] + rnd) >> shift,

                                    -col_clip_max - 1, col_clip_max);

-#endif

     if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));

@@ -118,8 +106,8 @@

                                                HIGHBD_DECL_SUFFIX) \

{ \

     inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \

-                   inv_##type1##w##_1d, inv_##type2##h##_1d, has_dconly \

-                   HIGHBD_TAIL_SUFFIX); \

+                   dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \

+                   has_dconly HIGHBD_TAIL_SUFFIX); \

 #define inv_txfm_fn64(w, h, shift) \

@@ -176,15 +164,18 @@

     const int bitdepth = bitdepth_from_max(bitdepth_max);

     const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) -1;

     const int col_clip_min = -col_clip_max - 1;

-    coef tmp[4 * 4], out[4];

+    int32_t tmp[4 * 4], out[4], in_mem[4];

-    for (int i = 0; i < 4; i++)

-        inv_wht4_1d(&coeff[i], 4, &tmp[i * 4], 1, 0);

+    for (int i = 0; i < 4; i++) {

+        for (int j = 0; j < 4; j++)

+            in_mem[j] = coeff[i + j * 4];

+        dav1d_inv_wht4_1d_c(in_mem, 1, &tmp[i * 4], 1, 0);

+    }

     for (int k = 0; k < 4 * 4; k++)

         tmp[k] = iclip(tmp[k], col_clip_min, col_clip_max);

     for (int i = 0; i < 4; i++) {

-        inv_wht4_1d(&tmp[i], 4, out, 1, 1);

+        dav1d_inv_wht4_1d_c(&tmp[i], 4, out, 1, 1);

         for (int j = 0; j < 4; j++)

             dst[i + j * PXSTRIDE(stride)] =

                 iclip_pixel(dst[i + j * PXSTRIDE(stride)] + out[j]);

--- a/src/meson.build

+++ b/src/meson.build

@@ -35,6 +35,7 @@

     'dequant_tables.c',

     'getbits.c',

     'intra_edge.c',

+    'itx_1d.c',

     'lf_mask.c',

     'log.c',

     'msac.c',