shithub: dav1d

Download patch

ref: 7aea6858ecd2e8e596dedc0be5396d8edb54eefc
parent: cdf4a3bc94be2425a7d0b140091bbfaf08c48eac
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Sat Dec 28 12:00:25 EST 2019

av1: do C inverse transforms in int32_t precision

Fixes C part of #321.

--- a/src/itx_1d.c
+++ b/src/itx_1d.c
@@ -1,6 +1,6 @@
 /*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
+ * Copyright © 2018-2019, VideoLAN and dav1d authors
+ * Copyright © 2018-2019, Two Orioles, LLC
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -30,8 +30,10 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include "common/attributes.h"
+#include "common/intops.h"
 
+#include "src/itx_1d.h"
+
 #define CLIP(a) iclip(a, min, max)
 
 /*
@@ -60,9 +62,9 @@
  * wrap around.
  */
 
-static void NOINLINE
-inv_dct4_1d(const coef *const in, const ptrdiff_t in_s,
-            coef *const out, const ptrdiff_t out_s, const int max)
+void dav1d_inv_dct4_1d_c(const int32_t *const in, const ptrdiff_t in_s,
+                         int32_t *const out, const ptrdiff_t out_s,
+                         const int max)
 {
     const int min = -max - 1;
     const int in0 = in[0 * in_s], in1 = in[1 * in_s];
@@ -79,14 +81,14 @@
     out[3 * out_s] = CLIP(t0 - t3);
 }
 
-static void NOINLINE
-inv_dct8_1d(const coef *const in, const ptrdiff_t in_s,
-            coef *const out, const ptrdiff_t out_s, const int max)
+void dav1d_inv_dct8_1d_c(const int32_t *const in, const ptrdiff_t in_s,
+                         int32_t *const out, const ptrdiff_t out_s,
+                         const int max)
 {
     const int min = -max - 1;
-    coef tmp[4];
+    int32_t tmp[4];
 
-    inv_dct4_1d(in, in_s * 2, tmp, 1, max);
+    dav1d_inv_dct4_1d_c(in, in_s * 2, tmp, 1, max);
 
     const int in1 = in[1 * in_s], in3 = in[3 * in_s];
     const int in5 = in[5 * in_s], in7 = in[7 * in_s];
@@ -114,14 +116,14 @@
     out[7 * out_s] = CLIP(tmp[0] - t7);
 }
 
-static void NOINLINE
-inv_dct16_1d(const coef *const in, const ptrdiff_t in_s,
-             coef *const out, const ptrdiff_t out_s, const int max)
+void dav1d_inv_dct16_1d_c(const int32_t *const in, const ptrdiff_t in_s,
+                          int32_t *const out, const ptrdiff_t out_s,
+                          const int max)
 {
     const int min = -max - 1;
-    coef tmp[8];
+    int32_t tmp[8];
 
-    inv_dct8_1d(in, in_s * 2, tmp, 1, max);
+    dav1d_inv_dct8_1d_c(in, in_s * 2, tmp, 1, max);
 
     const int in1  = in[ 1 * in_s], in3  = in[ 3 * in_s];
     const int in5  = in[ 5 * in_s], in7  = in[ 7 * in_s];
@@ -183,14 +185,14 @@
     out[15 * out_s] = CLIP(tmp[0] - t15a);
 }
 
-static void NOINLINE
-inv_dct32_1d(const coef *const in, const ptrdiff_t in_s,
-             coef *const out, const ptrdiff_t out_s, const int max)
+void dav1d_inv_dct32_1d_c(const int32_t *const in, const ptrdiff_t in_s,
+                          int32_t *const out, const ptrdiff_t out_s,
+                          const int max)
 {
     const int min = -max - 1;
-    coef tmp[16];
+    int32_t tmp[16];
 
-    inv_dct16_1d(in, in_s * 2, tmp, 1, max);
+    dav1d_inv_dct16_1d_c(in, in_s * 2, tmp, 1, max);
 
     const int in1  = in[ 1 * in_s], in3  = in[ 3 * in_s];
     const int in5  = in[ 5 * in_s], in7  = in[ 7 * in_s];
@@ -330,14 +332,14 @@
     out[31 * out_s] = CLIP(tmp[ 0] - t31);
 }
 
-static void NOINLINE
-inv_dct64_1d(const coef *const in, const ptrdiff_t in_s,
-             coef *const out, const ptrdiff_t out_s, const int max)
+void dav1d_inv_dct64_1d_c(const int32_t *const in, const ptrdiff_t in_s,
+                          int32_t *const out, const ptrdiff_t out_s,
+                          const int max)
 {
     const int min = -max - 1;
-    coef tmp[32];
+    int32_t tmp[32];
 
-    inv_dct32_1d(in, in_s * 2, tmp, 1, max);
+    dav1d_inv_dct32_1d_c(in, in_s * 2, tmp, 1, max);
 
     const int in1  = in[ 1 * in_s], in3  = in[ 3 * in_s];
     const int in5  = in[ 5 * in_s], in7  = in[ 7 * in_s];
@@ -655,9 +657,9 @@
     out[63 * out_s] = CLIP(tmp[ 0] - t63a);
 }
 
-static void NOINLINE
-inv_adst4_1d(const coef *const in, const ptrdiff_t in_s,
-             coef *const out, const ptrdiff_t out_s, const int range)
+void dav1d_inv_adst4_1d_c(const int32_t *const in, const ptrdiff_t in_s,
+                          int32_t *const out, const ptrdiff_t out_s,
+                          const int range)
 {
     const int in0 = in[0 * in_s], in1 = in[1 * in_s];
     const int in2 = in[2 * in_s], in3 = in[3 * in_s];
@@ -674,9 +676,9 @@
                      in0 + in2 - in1;
 }
 
-static void NOINLINE
-inv_adst8_1d(const coef *const in, const ptrdiff_t in_s,
-             coef *const out, const ptrdiff_t out_s, const int max)
+void dav1d_inv_adst8_1d_c(const int32_t *const in, const ptrdiff_t in_s,
+                          int32_t *const out, const ptrdiff_t out_s,
+                          const int max)
 {
     const int min = -max - 1;
     const int in0 = in[0 * in_s], in1 = in[1 * in_s];
@@ -723,9 +725,9 @@
     out[5 * out_s] = -(((t6 - t7) * 181 + 128) >> 8);
 }
 
-static void NOINLINE
-inv_adst16_1d(const coef *const in, const ptrdiff_t in_s,
-              coef *const out, const ptrdiff_t out_s, const int max)
+void dav1d_inv_adst16_1d_c(const int32_t *const in, const ptrdiff_t in_s,
+                           int32_t *const out, const ptrdiff_t out_s,
+                           const int max)
 {
     const int min = -max - 1;
     const int in0  = in[ 0 * in_s], in1  = in[ 1 * in_s];
@@ -834,10 +836,11 @@
 }
 
 #define flip_inv_adst(sz) \
-static void inv_flipadst##sz##_1d(const coef *const in, const ptrdiff_t in_s, \
-                                  coef *const out, const ptrdiff_t out_s, const int range) \
+void dav1d_inv_flipadst##sz##_1d_c(const int32_t *const in, const ptrdiff_t in_s, \
+                                   int32_t *const out, const ptrdiff_t out_s, \
+                                   const int range) \
 { \
-    inv_adst##sz##_1d(in, in_s, &out[(sz - 1) * out_s], -out_s, range); \
+    dav1d_inv_adst##sz##_1d_c(in, in_s, &out[(sz - 1) * out_s], -out_s, range); \
 }
 
 flip_inv_adst(4)
@@ -846,42 +849,41 @@
 
 #undef flip_inv_adst
 
-static void NOINLINE
-inv_identity4_1d(const coef *const in, const ptrdiff_t in_s,
-                 coef *const out, const ptrdiff_t out_s, const int range)
+void dav1d_inv_identity4_1d_c(const int32_t *const in, const ptrdiff_t in_s,
+                              int32_t *const out, const ptrdiff_t out_s,
+                              const int range)
 {
     for (int i = 0; i < 4; i++)
         out[out_s * i] = in[in_s * i] + ((in[in_s * i] * 1697 + 2048) >> 12);
 }
 
-static void NOINLINE
-inv_identity8_1d(const coef *const in, const ptrdiff_t in_s,
-                 coef *const out, const ptrdiff_t out_s, const int range)
+void dav1d_inv_identity8_1d_c(const int32_t *const in, const ptrdiff_t in_s,
+                              int32_t *const out, const ptrdiff_t out_s,
+                              const int range)
 {
     for (int i = 0; i < 8; i++)
         out[out_s * i] = in[in_s * i] * 2;
 }
 
-static void NOINLINE
-inv_identity16_1d(const coef *const in, const ptrdiff_t in_s,
-                  coef *const out, const ptrdiff_t out_s, const int range)
+void dav1d_inv_identity16_1d_c(const int32_t *const in, const ptrdiff_t in_s,
+                               int32_t *const out, const ptrdiff_t out_s,
+                               const int range)
 {
     for (int i = 0; i < 16; i++)
         out[out_s * i] = 2 * in[in_s * i] + ((in[in_s * i] * 1697 + 1024) >> 11);
 }
 
-static void NOINLINE
-inv_identity32_1d(const coef *const in, const ptrdiff_t in_s,
-                  coef *const out, const ptrdiff_t out_s, const int range)
+void dav1d_inv_identity32_1d_c(const int32_t *const in, const ptrdiff_t in_s,
+                               int32_t *const out, const ptrdiff_t out_s,
+                               const int range)
 {
     for (int i = 0; i < 32; i++)
         out[out_s * i] = in[in_s * i] * 4;
 }
 
-static void NOINLINE
-inv_wht4_1d(const coef *const in, const ptrdiff_t in_s,
-            coef *const out, const ptrdiff_t out_s,
-            const int pass)
+void dav1d_inv_wht4_1d_c(const int32_t *const in, const ptrdiff_t in_s,
+                         int32_t *const out, const ptrdiff_t out_s,
+                         const int pass)
 {
     const int sh = 2 * !pass;
     const int in0 = in[0 * in_s] >> sh, in1 = in[1 * in_s] >> sh;
--- /dev/null
+++ b/src/itx_1d.h
@@ -1,0 +1,60 @@
+/*
+ * Copyright © 2018-2019, VideoLAN and dav1d authors
+ * Copyright © 2018-2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifndef DAV1D_SRC_ITX_1D_H
+#define DAV1D_SRC_ITX_1D_H
+
+#define decl_itx_1d_fn(name) \
+void (name)(const int32_t *in, ptrdiff_t in_s, \
+            int32_t *out, ptrdiff_t out_s, const int range)
+typedef decl_itx_1d_fn(*itx_1d_fn);
+
+decl_itx_1d_fn(dav1d_inv_dct4_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct8_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct16_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct32_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct64_1d_c);
+
+decl_itx_1d_fn(dav1d_inv_adst4_1d_c);
+decl_itx_1d_fn(dav1d_inv_adst8_1d_c);
+decl_itx_1d_fn(dav1d_inv_adst16_1d_c);
+
+decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c);
+decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c);
+decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c);
+
+decl_itx_1d_fn(dav1d_inv_identity4_1d_c);
+decl_itx_1d_fn(dav1d_inv_identity8_1d_c);
+decl_itx_1d_fn(dav1d_inv_identity16_1d_c);
+decl_itx_1d_fn(dav1d_inv_identity32_1d_c);
+
+decl_itx_1d_fn(dav1d_inv_wht4_1d_c);
+
+#endif /* DAV1D_SRC_ITX_1D_H */
--- a/src/itx_tmpl.c
+++ b/src/itx_tmpl.c
@@ -1,6 +1,6 @@
 /*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
+ * Copyright © 2018-2019, VideoLAN and dav1d authors
+ * Copyright © 2018-2019, Two Orioles, LLC
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,12 +35,8 @@
 #include "common/intops.h"
 
 #include "src/itx.h"
+#include "src/itx_1d.h"
 
-#include "src/itx_1d.c"
-
-typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
-                          coef *out, ptrdiff_t out_s, const int range);
-
 static void NOINLINE
 inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
                coef *const coeff, const int eob,
@@ -73,29 +69,21 @@
 
     const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);
     // Maximum value for h and w is 64
-    coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
+    int32_t tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
     const int row_clip_max = (1 << (bitdepth + 8 - 1)) - 1;
-    const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) -1;
+    const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) - 1;
 
     if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
     for (i = 0; i < sh; i++) {
-        if (w != sw || is_rect2) {
-            for (j = 0; j < sw; j++) {
-                in_mem[j] = coeff[i + j * sh];
-                if (is_rect2)
-                    in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
-            }
-            first_1d_fn(in_mem, 1, &tmp[i * w], 1, row_clip_max);
-        } else {
-            first_1d_fn(&coeff[i], sh, &tmp[i * w], 1, row_clip_max);
+        for (j = 0; j < sw; j++) {
+            in_mem[j] = coeff[i + j * sh];
+            if (is_rect2)
+                in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
         }
+        first_1d_fn(in_mem, 1, &tmp[i * w], 1, row_clip_max);
         for (j = 0; j < w; j++)
-#if BITDEPTH == 8
-            tmp[i * w + j] = (tmp[i * w + j] + rnd) >> shift;
-#else
             tmp[i * w + j] = iclip((tmp[i * w + j] + rnd) >> shift,
                                    -col_clip_max - 1, col_clip_max);
-#endif
     }
 
     if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
@@ -118,8 +106,8 @@
                                                HIGHBD_DECL_SUFFIX) \
 { \
     inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
-                   inv_##type1##w##_1d, inv_##type2##h##_1d, has_dconly \
-                   HIGHBD_TAIL_SUFFIX); \
+                   dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \
+                   has_dconly HIGHBD_TAIL_SUFFIX); \
 }
 
 #define inv_txfm_fn64(w, h, shift) \
@@ -176,15 +164,18 @@
     const int bitdepth = bitdepth_from_max(bitdepth_max);
     const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) -1;
     const int col_clip_min = -col_clip_max - 1;
-    coef tmp[4 * 4], out[4];
+    int32_t tmp[4 * 4], out[4], in_mem[4];
 
-    for (int i = 0; i < 4; i++)
-        inv_wht4_1d(&coeff[i], 4, &tmp[i * 4], 1, 0);
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++)
+            in_mem[j] = coeff[i + j * 4];
+        dav1d_inv_wht4_1d_c(in_mem, 1, &tmp[i * 4], 1, 0);
+    }
     for (int k = 0; k < 4 * 4; k++)
         tmp[k] = iclip(tmp[k], col_clip_min, col_clip_max);
 
     for (int i = 0; i < 4; i++) {
-        inv_wht4_1d(&tmp[i], 4, out, 1, 1);
+        dav1d_inv_wht4_1d_c(&tmp[i], 4, out, 1, 1);
         for (int j = 0; j < 4; j++)
             dst[i + j * PXSTRIDE(stride)] =
                 iclip_pixel(dst[i + j * PXSTRIDE(stride)] + out[j]);
--- a/src/meson.build
+++ b/src/meson.build
@@ -35,6 +35,7 @@
     'dequant_tables.c',
     'getbits.c',
     'intra_edge.c',
+    'itx_1d.c',
     'lf_mask.c',
     'log.c',
     'msac.c',