shithub: dav1d

--- a/src/itx_1d.c

+++ b/src/itx_1d.c

@@ -62,42 +62,63 @@

  * wrap around.

*/

-void dav1d_inv_dct4_1d_c(const int32_t *const in, const ptrdiff_t in_s,

-                         int32_t *const out, const ptrdiff_t out_s,

-                         const int max)

+static NOINLINE void

+inv_dct4_1d_internal_c(int32_t *const c, const ptrdiff_t stride,

+                       const int min, const int max, const int tx64)

-    const int min = -max - 1;

-    const int in0 = in[0 * in_s], in1 = in[1 * in_s];

-    const int in2 = in[2 * in_s], in3 = in[3 * in_s];

+    assert(stride > 0);

+    const int in0 = c[0 * stride], in1 = c[1 * stride];

-    int t0 = ((in0 + in2) * 181 + 128) >> 8;

-    int t1 = ((in0 - in2) * 181 + 128) >> 8;

-    int t2 = ((in1 *  1567         - in3 * (3784 - 4096) + 2048) >> 12) - in3;

-    int t3 = ((in1 * (3784 - 4096) + in3 *  1567         + 2048) >> 12) + in1;

+    int t0, t1, t2, t3;

+    if (tx64) {

+        t0 = t1 = (in0 * 181 + 128) >> 8;

+        t2 = (in1 * 1567 + 2048) >> 12;

+        t3 = (in1 * 3784 + 2048) >> 12;

+    } else {

+        const int in2 = c[2 * stride], in3 = c[3 * stride];

-    out[0 * out_s] = CLIP(t0 + t3);

-    out[1 * out_s] = CLIP(t1 + t2);

-    out[2 * out_s] = CLIP(t1 - t2);

-    out[3 * out_s] = CLIP(t0 - t3);

+        t0 = ((in0 + in2) * 181 + 128) >> 8;

+        t1 = ((in0 - in2) * 181 + 128) >> 8;

+        t2 = ((in1 *  1567         - in3 * (3784 - 4096) + 2048) >> 12) - in3;

+        t3 = ((in1 * (3784 - 4096) + in3 *  1567         + 2048) >> 12) + in1;

+    }

+    c[0 * stride] = CLIP(t0 + t3);

+    c[1 * stride] = CLIP(t1 + t2);

+    c[2 * stride] = CLIP(t1 - t2);

+    c[3 * stride] = CLIP(t0 - t3);

-void dav1d_inv_dct8_1d_c(const int32_t *const in, const ptrdiff_t in_s,

-                         int32_t *const out, const ptrdiff_t out_s,

-                         const int max)

+void dav1d_inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,

+                         const int min, const int max)

-    const int min = -max - 1;

-    int32_t tmp[4];

+    inv_dct4_1d_internal_c(c, stride, min, max, 0);

+}

-    dav1d_inv_dct4_1d_c(in, in_s * 2, tmp, 1, max);

+static NOINLINE void

+inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride,

+                       const int min, const int max, const int tx64)

+{

+    assert(stride > 0);

+    inv_dct4_1d_internal_c(c, stride << 1, min, max, tx64);

-    const int in1 = in[1 * in_s], in3 = in[3 * in_s];

-    const int in5 = in[5 * in_s], in7 = in[7 * in_s];

+    const int in1 = c[1 * stride], in3 = c[3 * stride];

-    int t4a = ((in1 *   799         - in7 * (4017 - 4096) + 2048) >> 12) - in7;

-    int t5a =  (in5 *  1703         - in3 *  1138         + 1024) >> 11;

-    int t6a =  (in5 *  1138         + in3 *  1703         + 1024) >> 11;

-    int t7a = ((in1 * (4017 - 4096) + in7 *  799          + 2048) >> 12) + in1;

+    int t4a, t5a, t6a, t7a;

+    if (tx64) {

+        t4a = (in1 *   799 + 2048) >> 12;

+        t5a = (in3 * -2276 + 2048) >> 12;

+        t6a = (in3 *  3406 + 2048) >> 12;

+        t7a = (in1 *  4017 + 2048) >> 12;

+    } else {

+        const int in5 = c[5 * stride], in7 = c[7 * stride];

+        t4a = ((in1 *   799         - in7 * (4017 - 4096) + 2048) >> 12) - in7;

+        t5a =  (in5 *  1703         - in3 *  1138         + 1024) >> 11;

+        t6a =  (in5 *  1138         + in3 *  1703         + 1024) >> 11;

+        t7a = ((in1 * (4017 - 4096) + in7 *  799          + 2048) >> 12) + in1;

+    }

     int t4  = CLIP(t4a + t5a);

         t5a = CLIP(t4a - t5a);

     int t7  = CLIP(t7a + t6a);

@@ -106,39 +127,61 @@

     int t5  = ((t6a - t5a) * 181 + 128) >> 8;

     int t6  = ((t6a + t5a) * 181 + 128) >> 8;

-    out[0 * out_s] = CLIP(tmp[0] + t7);

-    out[1 * out_s] = CLIP(tmp[1] + t6);

-    out[2 * out_s] = CLIP(tmp[2] + t5);

-    out[3 * out_s] = CLIP(tmp[3] + t4);

-    out[4 * out_s] = CLIP(tmp[3] - t4);

-    out[5 * out_s] = CLIP(tmp[2] - t5);

-    out[6 * out_s] = CLIP(tmp[1] - t6);

-    out[7 * out_s] = CLIP(tmp[0] - t7);

+    const int t0 = c[0 * stride];

+    const int t1 = c[2 * stride];

+    const int t2 = c[4 * stride];

+    const int t3 = c[6 * stride];

+    c[0 * stride] = CLIP(t0 + t7);

+    c[1 * stride] = CLIP(t1 + t6);

+    c[2 * stride] = CLIP(t2 + t5);

+    c[3 * stride] = CLIP(t3 + t4);

+    c[4 * stride] = CLIP(t3 - t4);

+    c[5 * stride] = CLIP(t2 - t5);

+    c[6 * stride] = CLIP(t1 - t6);

+    c[7 * stride] = CLIP(t0 - t7);

-void dav1d_inv_dct16_1d_c(const int32_t *const in, const ptrdiff_t in_s,

-                          int32_t *const out, const ptrdiff_t out_s,

-                          const int max)

+void dav1d_inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,

+                         const int min, const int max)

-    const int min = -max - 1;

-    int32_t tmp[8];

+    inv_dct8_1d_internal_c(c, stride, min, max, 0);

+}

-    dav1d_inv_dct8_1d_c(in, in_s * 2, tmp, 1, max);

+static NOINLINE void

+inv_dct16_1d_internal_c(int32_t *const c, const ptrdiff_t stride,

+                        const int min, const int max, int tx64)

+{

+    assert(stride > 0);

+    inv_dct8_1d_internal_c(c, stride << 1, min, max, tx64);

-    const int in1  = in[ 1 * in_s], in3  = in[ 3 * in_s];

-    const int in5  = in[ 5 * in_s], in7  = in[ 7 * in_s];

-    const int in9  = in[ 9 * in_s], in11 = in[11 * in_s];

-    const int in13 = in[13 * in_s], in15 = in[15 * in_s];

+    const int in1 = c[1 * stride], in3 = c[3 * stride];

+    const int in5 = c[5 * stride], in7 = c[7 * stride];

-    int t8a  = ((in1  *   401         - in15 * (4076 - 4096) + 2048) >> 12) - in15;

-    int t15a = ((in1  * (4076 - 4096) + in15 *   401         + 2048) >> 12) + in1;

-    int t9a  =  (in9  *  1583         - in7  *  1299         + 1024) >> 11;

-    int t14a =  (in9  *  1299         + in7  *  1583         + 1024) >> 11;

-    int t10a = ((in5  *  1931         - in11 * (3612 - 4096) + 2048) >> 12) - in11;

-    int t13a = ((in5  * (3612 - 4096) + in11 *  1931         + 2048) >> 12) + in5;

-    int t11a = ((in13 * (3920 - 4096) - in3  *  1189         + 2048) >> 12) + in13;

-    int t12a = ((in13 *  1189         + in3  * (3920 - 4096) + 2048) >> 12) + in3;

+    int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;

+    if (tx64) {

+        t8a  = (in1 *   401 + 2048) >> 12;

+        t9a  = (in7 * -2598 + 2048) >> 12;

+        t10a = (in5 *  1931 + 2048) >> 12;

+        t11a = (in3 * -1189 + 2048) >> 12;

+        t12a = (in3 *  3920 + 2048) >> 12;

+        t13a = (in5 *  3612 + 2048) >> 12;

+        t14a = (in7 *  3166 + 2048) >> 12;

+        t15a = (in1 *  4076 + 2048) >> 12;

+    } else {

+        const int in9  = c[ 9 * stride], in11 = c[11 * stride];

+        const int in13 = c[13 * stride], in15 = c[15 * stride];

+        t8a  = ((in1  *   401         - in15 * (4076 - 4096) + 2048) >> 12) - in15;

+        t9a  =  (in9  *  1583         - in7  *  1299         + 1024) >> 11;

+        t10a = ((in5  *  1931         - in11 * (3612 - 4096) + 2048) >> 12) - in11;

+        t11a = ((in13 * (3920 - 4096) - in3  *  1189         + 2048) >> 12) + in13;

+        t12a = ((in13 *  1189         + in3  * (3920 - 4096) + 2048) >> 12) + in3;

+        t13a = ((in5  * (3612 - 4096) + in11 *  1931         + 2048) >> 12) + in5;

+        t14a =  (in9  *  1299         + in7  *  1583         + 1024) >> 11;

+        t15a = ((in1  * (4076 - 4096) + in15 *   401         + 2048) >> 12) + in1;

+    }

     int t8  = CLIP(t8a  + t9a);

     int t9  = CLIP(t8a  - t9a);

     int t10 = CLIP(t11a - t10a);

@@ -167,59 +210,94 @@

     t11  = ((t12a - t11a) * 181 + 128) >> 8;

     t12  = ((t12a + t11a) * 181 + 128) >> 8;

-    out[ 0 * out_s] = CLIP(tmp[0] + t15a);

-    out[ 1 * out_s] = CLIP(tmp[1] + t14);

-    out[ 2 * out_s] = CLIP(tmp[2] + t13a);

-    out[ 3 * out_s] = CLIP(tmp[3] + t12);

-    out[ 4 * out_s] = CLIP(tmp[4] + t11);

-    out[ 5 * out_s] = CLIP(tmp[5] + t10a);

-    out[ 6 * out_s] = CLIP(tmp[6] + t9);

-    out[ 7 * out_s] = CLIP(tmp[7] + t8a);

-    out[ 8 * out_s] = CLIP(tmp[7] - t8a);

-    out[ 9 * out_s] = CLIP(tmp[6] - t9);

-    out[10 * out_s] = CLIP(tmp[5] - t10a);

-    out[11 * out_s] = CLIP(tmp[4] - t11);

-    out[12 * out_s] = CLIP(tmp[3] - t12);

-    out[13 * out_s] = CLIP(tmp[2] - t13a);

-    out[14 * out_s] = CLIP(tmp[1] - t14);

-    out[15 * out_s] = CLIP(tmp[0] - t15a);

+    const int t0 = c[ 0 * stride];

+    const int t1 = c[ 2 * stride];

+    const int t2 = c[ 4 * stride];

+    const int t3 = c[ 6 * stride];

+    const int t4 = c[ 8 * stride];

+    const int t5 = c[10 * stride];

+    const int t6 = c[12 * stride];

+    const int t7 = c[14 * stride];

+    c[ 0 * stride] = CLIP(t0 + t15a);

+    c[ 1 * stride] = CLIP(t1 + t14);

+    c[ 2 * stride] = CLIP(t2 + t13a);

+    c[ 3 * stride] = CLIP(t3 + t12);

+    c[ 4 * stride] = CLIP(t4 + t11);

+    c[ 5 * stride] = CLIP(t5 + t10a);

+    c[ 6 * stride] = CLIP(t6 + t9);

+    c[ 7 * stride] = CLIP(t7 + t8a);

+    c[ 8 * stride] = CLIP(t7 - t8a);

+    c[ 9 * stride] = CLIP(t6 - t9);

+    c[10 * stride] = CLIP(t5 - t10a);

+    c[11 * stride] = CLIP(t4 - t11);

+    c[12 * stride] = CLIP(t3 - t12);

+    c[13 * stride] = CLIP(t2 - t13a);

+    c[14 * stride] = CLIP(t1 - t14);

+    c[15 * stride] = CLIP(t0 - t15a);

-void dav1d_inv_dct32_1d_c(const int32_t *const in, const ptrdiff_t in_s,

-                          int32_t *const out, const ptrdiff_t out_s,

-                          const int max)

+void dav1d_inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,

+                          const int min, const int max)

-    const int min = -max - 1;

-    int32_t tmp[16];

+    inv_dct16_1d_internal_c(c, stride, min, max, 0);

+}

-    dav1d_inv_dct16_1d_c(in, in_s * 2, tmp, 1, max);

+static NOINLINE void

+inv_dct32_1d_internal_c(int32_t *const c, const ptrdiff_t stride,

+                        const int min, const int max, const int tx64)

+{

+    assert(stride > 0);

+    inv_dct16_1d_internal_c(c, stride << 1, min, max, tx64);

-    const int in1  = in[ 1 * in_s], in3  = in[ 3 * in_s];

-    const int in5  = in[ 5 * in_s], in7  = in[ 7 * in_s];

-    const int in9  = in[ 9 * in_s], in11 = in[11 * in_s];

-    const int in13 = in[13 * in_s], in15 = in[15 * in_s];

-    const int in17 = in[17 * in_s], in19 = in[19 * in_s];

-    const int in21 = in[21 * in_s], in23 = in[23 * in_s];

-    const int in25 = in[25 * in_s], in27 = in[27 * in_s];

-    const int in29 = in[29 * in_s], in31 = in[31 * in_s];

+    const int in1  = c[ 1 * stride], in3  = c[ 3 * stride];

+    const int in5  = c[ 5 * stride], in7  = c[ 7 * stride];

+    const int in9  = c[ 9 * stride], in11 = c[11 * stride];

+    const int in13 = c[13 * stride], in15 = c[15 * stride];

-    int t16a = ((in1  *   201         - in31 * (4091 - 4096) + 2048) >> 12) - in31;

-    int t31a = ((in1  * (4091 - 4096) + in31 *   201         + 2048) >> 12) + in1;

-    int t17a = ((in17 * (3035 - 4096) - in15 *  2751         + 2048) >> 12) + in17;

-    int t30a = ((in17 *  2751         + in15 * (3035 - 4096) + 2048) >> 12) + in15;

-    int t18a = ((in9  *  1751         - in23 * (3703 - 4096) + 2048) >> 12) - in23;

-    int t29a = ((in9  * (3703 - 4096) + in23 *  1751         + 2048) >> 12) + in9;

-    int t19a = ((in25 * (3857 - 4096) - in7  *  1380         + 2048) >> 12) + in25;

-    int t28a = ((in25 *  1380         + in7  * (3857 - 4096) + 2048) >> 12) + in7;

-    int t20a = ((in5  *   995         - in27 * (3973 - 4096) + 2048) >> 12) - in27;

-    int t27a = ((in5  * (3973 - 4096) + in27 *   995         + 2048) >> 12) + in5;

-    int t21a = ((in21 * (3513 - 4096) - in11 *  2106         + 2048) >> 12) + in21;

-    int t26a = ((in21 *  2106         + in11 * (3513 - 4096) + 2048) >> 12) + in11;

-    int t22a =  (in13 *  1220         - in19 *  1645         + 1024) >> 11;

-    int t25a =  (in13 *  1645         + in19 *  1220         + 1024) >> 11;

-    int t23a = ((in29 * (4052 - 4096) - in3  *   601         + 2048) >> 12) + in29;

-    int t24a = ((in29 *   601         + in3  * (4052 - 4096) + 2048) >> 12) + in3;

+    int t16a, t17a, t18a, t19a, t20a, t21a, t22a, t23a;

+    int t24a, t25a, t26a, t27a, t28a, t29a, t30a, t31a;

+    if (tx64) {

+        t16a = (in1  *   201 + 2048) >> 12;

+        t17a = (in15 * -2751 + 2048) >> 12;

+        t18a = (in9  *  1751 + 2048) >> 12;

+        t19a = (in7  * -1380 + 2048) >> 12;

+        t20a = (in5  *   995 + 2048) >> 12;

+        t21a = (in11 * -2106 + 2048) >> 12;

+        t22a = (in13 *  2440 + 2048) >> 12;

+        t23a = (in3  *  -601 + 2048) >> 12;

+        t24a = (in3  *  4052 + 2048) >> 12;

+        t25a = (in13 *  3290 + 2048) >> 12;

+        t26a = (in11 *  3513 + 2048) >> 12;

+        t27a = (in5  *  3973 + 2048) >> 12;

+        t28a = (in7  *  3857 + 2048) >> 12;

+        t29a = (in9  *  3703 + 2048) >> 12;

+        t30a = (in15 *  3035 + 2048) >> 12;

+        t31a = (in1  *  4091 + 2048) >> 12;

+    } else {

+        const int in17 = c[17 * stride], in19 = c[19 * stride];

+        const int in21 = c[21 * stride], in23 = c[23 * stride];

+        const int in25 = c[25 * stride], in27 = c[27 * stride];

+        const int in29 = c[29 * stride], in31 = c[31 * stride];

+        t16a = ((in1  *   201         - in31 * (4091 - 4096) + 2048) >> 12) - in31;

+        t17a = ((in17 * (3035 - 4096) - in15 *  2751         + 2048) >> 12) + in17;

+        t18a = ((in9  *  1751         - in23 * (3703 - 4096) + 2048) >> 12) - in23;

+        t19a = ((in25 * (3857 - 4096) - in7  *  1380         + 2048) >> 12) + in25;

+        t20a = ((in5  *   995         - in27 * (3973 - 4096) + 2048) >> 12) - in27;

+        t21a = ((in21 * (3513 - 4096) - in11 *  2106         + 2048) >> 12) + in21;

+        t22a =  (in13 *  1220         - in19 *  1645         + 1024) >> 11;

+        t23a = ((in29 * (4052 - 4096) - in3  *   601         + 2048) >> 12) + in29;

+        t24a = ((in29 *   601         + in3  * (4052 - 4096) + 2048) >> 12) + in3;

+        t25a =  (in13 *  1645         + in19 *  1220         + 1024) >> 11;

+        t26a = ((in21 *  2106         + in11 * (3513 - 4096) + 2048) >> 12) + in11;

+        t27a = ((in5  * (3973 - 4096) + in27 *   995         + 2048) >> 12) + in5;

+        t28a = ((in25 *  1380         + in7  * (3857 - 4096) + 2048) >> 12) + in7;

+        t29a = ((in9  * (3703 - 4096) + in23 *  1751         + 2048) >> 12) + in9;

+        t30a = ((in17 *  2751         + in15 * (3035 - 4096) + 2048) >> 12) + in15;

+        t31a = ((in1  * (4091 - 4096) + in31 *   201         + 2048) >> 12) + in1;

+    }

     int t16 = CLIP(t16a + t17a);

     int t17 = CLIP(t16a - t17a);

     int t18 = CLIP(t19a - t18a);

@@ -298,98 +376,110 @@

     t23a = ((t24  - t23 ) * 181 + 128) >> 8;

     t24a = ((t24  + t23 ) * 181 + 128) >> 8;

-    out[ 0 * out_s] = CLIP(tmp[ 0] + t31);

-    out[ 1 * out_s] = CLIP(tmp[ 1] + t30a);

-    out[ 2 * out_s] = CLIP(tmp[ 2] + t29);

-    out[ 3 * out_s] = CLIP(tmp[ 3] + t28a);

-    out[ 4 * out_s] = CLIP(tmp[ 4] + t27);

-    out[ 5 * out_s] = CLIP(tmp[ 5] + t26a);

-    out[ 6 * out_s] = CLIP(tmp[ 6] + t25);

-    out[ 7 * out_s] = CLIP(tmp[ 7] + t24a);

-    out[ 8 * out_s] = CLIP(tmp[ 8] + t23a);

-    out[ 9 * out_s] = CLIP(tmp[ 9] + t22);

-    out[10 * out_s] = CLIP(tmp[10] + t21a);

-    out[11 * out_s] = CLIP(tmp[11] + t20);

-    out[12 * out_s] = CLIP(tmp[12] + t19a);

-    out[13 * out_s] = CLIP(tmp[13] + t18);

-    out[14 * out_s] = CLIP(tmp[14] + t17a);

-    out[15 * out_s] = CLIP(tmp[15] + t16);

-    out[16 * out_s] = CLIP(tmp[15] - t16);

-    out[17 * out_s] = CLIP(tmp[14] - t17a);

-    out[18 * out_s] = CLIP(tmp[13] - t18);

-    out[19 * out_s] = CLIP(tmp[12] - t19a);

-    out[20 * out_s] = CLIP(tmp[11] - t20);

-    out[21 * out_s] = CLIP(tmp[10] - t21a);

-    out[22 * out_s] = CLIP(tmp[ 9] - t22);

-    out[23 * out_s] = CLIP(tmp[ 8] - t23a);

-    out[24 * out_s] = CLIP(tmp[ 7] - t24a);

-    out[25 * out_s] = CLIP(tmp[ 6] - t25);

-    out[26 * out_s] = CLIP(tmp[ 5] - t26a);

-    out[27 * out_s] = CLIP(tmp[ 4] - t27);

-    out[28 * out_s] = CLIP(tmp[ 3] - t28a);

-    out[29 * out_s] = CLIP(tmp[ 2] - t29);

-    out[30 * out_s] = CLIP(tmp[ 1] - t30a);

-    out[31 * out_s] = CLIP(tmp[ 0] - t31);

+    const int t0  = c[ 0 * stride];

+    const int t1  = c[ 2 * stride];

+    const int t2  = c[ 4 * stride];

+    const int t3  = c[ 6 * stride];

+    const int t4  = c[ 8 * stride];

+    const int t5  = c[10 * stride];

+    const int t6  = c[12 * stride];

+    const int t7  = c[14 * stride];

+    const int t8  = c[16 * stride];

+    const int t9  = c[18 * stride];

+    const int t10 = c[20 * stride];

+    const int t11 = c[22 * stride];

+    const int t12 = c[24 * stride];

+    const int t13 = c[26 * stride];

+    const int t14 = c[28 * stride];

+    const int t15 = c[30 * stride];

+    c[ 0 * stride] = CLIP(t0  + t31);

+    c[ 1 * stride] = CLIP(t1  + t30a);

+    c[ 2 * stride] = CLIP(t2  + t29);

+    c[ 3 * stride] = CLIP(t3  + t28a);

+    c[ 4 * stride] = CLIP(t4  + t27);

+    c[ 5 * stride] = CLIP(t5  + t26a);

+    c[ 6 * stride] = CLIP(t6  + t25);

+    c[ 7 * stride] = CLIP(t7  + t24a);

+    c[ 8 * stride] = CLIP(t8  + t23a);

+    c[ 9 * stride] = CLIP(t9  + t22);

+    c[10 * stride] = CLIP(t10 + t21a);

+    c[11 * stride] = CLIP(t11 + t20);

+    c[12 * stride] = CLIP(t12 + t19a);

+    c[13 * stride] = CLIP(t13 + t18);

+    c[14 * stride] = CLIP(t14 + t17a);

+    c[15 * stride] = CLIP(t15 + t16);

+    c[16 * stride] = CLIP(t15 - t16);

+    c[17 * stride] = CLIP(t14 - t17a);

+    c[18 * stride] = CLIP(t13 - t18);

+    c[19 * stride] = CLIP(t12 - t19a);

+    c[20 * stride] = CLIP(t11 - t20);

+    c[21 * stride] = CLIP(t10 - t21a);

+    c[22 * stride] = CLIP(t9  - t22);

+    c[23 * stride] = CLIP(t8  - t23a);

+    c[24 * stride] = CLIP(t7  - t24a);

+    c[25 * stride] = CLIP(t6  - t25);

+    c[26 * stride] = CLIP(t5  - t26a);

+    c[27 * stride] = CLIP(t4  - t27);

+    c[28 * stride] = CLIP(t3  - t28a);

+    c[29 * stride] = CLIP(t2  - t29);

+    c[30 * stride] = CLIP(t1  - t30a);

+    c[31 * stride] = CLIP(t0  - t31);

-void dav1d_inv_dct64_1d_c(const int32_t *const in, const ptrdiff_t in_s,

-                          int32_t *const out, const ptrdiff_t out_s,

-                          const int max)

+void dav1d_inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,

+                          const int min, const int max)

-    const int min = -max - 1;

-    int32_t tmp[32];

+    inv_dct32_1d_internal_c(c, stride, min, max, 0);

+}

-    dav1d_inv_dct32_1d_c(in, in_s * 2, tmp, 1, max);

+void dav1d_inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,

+                          const int min, const int max)

+{

+    assert(stride > 0);

+    inv_dct32_1d_internal_c(c, stride << 1, min, max, 1);

-    const int in1  = in[ 1 * in_s], in3  = in[ 3 * in_s];

-    const int in5  = in[ 5 * in_s], in7  = in[ 7 * in_s];

-    const int in9  = in[ 9 * in_s], in11 = in[11 * in_s];

-    const int in13 = in[13 * in_s], in15 = in[15 * in_s];

-    const int in17 = in[17 * in_s], in19 = in[19 * in_s];

-    const int in21 = in[21 * in_s], in23 = in[23 * in_s];

-    const int in25 = in[25 * in_s], in27 = in[27 * in_s];

-    const int in29 = in[29 * in_s], in31 = in[31 * in_s];

-    const int in33 = in[33 * in_s], in35 = in[35 * in_s];

-    const int in37 = in[37 * in_s], in39 = in[39 * in_s];

-    const int in41 = in[41 * in_s], in43 = in[43 * in_s];

-    const int in45 = in[45 * in_s], in47 = in[47 * in_s];

-    const int in49 = in[49 * in_s], in51 = in[51 * in_s];

-    const int in53 = in[53 * in_s], in55 = in[55 * in_s];

-    const int in57 = in[57 * in_s], in59 = in[59 * in_s];

-    const int in61 = in[61 * in_s], in63 = in[63 * in_s];

+    const int in1  = c[ 1 * stride], in3  = c[ 3 * stride];

+    const int in5  = c[ 5 * stride], in7  = c[ 7 * stride];

+    const int in9  = c[ 9 * stride], in11 = c[11 * stride];

+    const int in13 = c[13 * stride], in15 = c[15 * stride];

+    const int in17 = c[17 * stride], in19 = c[19 * stride];

+    const int in21 = c[21 * stride], in23 = c[23 * stride];

+    const int in25 = c[25 * stride], in27 = c[27 * stride];

+    const int in29 = c[29 * stride], in31 = c[31 * stride];

-    int t32a = ((in1  *   101         - in63 * (4095 - 4096) + 2048) >> 12) - in63;

-    int t33a = ((in33 * (2967 - 4096) - in31 *  2824         + 2048) >> 12) + in33;

-    int t34a = ((in17 *  1660         - in47 * (3745 - 4096) + 2048) >> 12) - in47;

-    int t35a =  (in49 *  1911         - in15 *   737         + 1024) >> 11;

-    int t36a = ((in9  *   897         - in55 * (3996 - 4096) + 2048) >> 12) - in55;

-    int t37a = ((in41 * (3461 - 4096) - in23 *  2191         + 2048) >> 12) + in41;

-    int t38a = ((in25 *  2359         - in39 * (3349 - 4096) + 2048) >> 12) - in39;

-    int t39a =  (in57 *  2018         - in7  *   350         + 1024) >> 11;

-    int t40a = ((in5  *   501         - in59 * (4065 - 4096) + 2048) >> 12) - in59;

-    int t41a = ((in37 * (3229 - 4096) - in27 *  2520         + 2048) >> 12) + in37;

-    int t42a = ((in21 *  2019         - in43 * (3564 - 4096) + 2048) >> 12) - in43;

-    int t43a =  (in53 *  1974         - in11 *   546         + 1024) >> 11;

-    int t44a = ((in13 *  1285         - in51 * (3889 - 4096) + 2048) >> 12) - in51;

-    int t45a = ((in45 * (3659 - 4096) - in19 *  1842         + 2048) >> 12) + in45;

-    int t46a = ((in29 *  2675         - in35 * (3102 - 4096) + 2048) >> 12) - in35;

-    int t47a = ((in61 * (4085 - 4096) - in3  *   301         + 2048) >> 12) + in61;

-    int t48a = ((in61 *   301         + in3  * (4085 - 4096) + 2048) >> 12) + in3;

-    int t49a = ((in29 * (3102 - 4096) + in35 *  2675         + 2048) >> 12) + in29;

-    int t50a = ((in45 *  1842         + in19 * (3659 - 4096) + 2048) >> 12) + in19;

-    int t51a = ((in13 * (3889 - 4096) + in51 *  1285         + 2048) >> 12) + in13;

-    int t52a =  (in53 *   546         + in11 *  1974         + 1024) >> 11;

-    int t53a = ((in21 * (3564 - 4096) + in43 *  2019         + 2048) >> 12) + in21;

-    int t54a = ((in37 *  2520         + in27 * (3229 - 4096) + 2048) >> 12) + in27;

-    int t55a = ((in5  * (4065 - 4096) + in59 *   501         + 2048) >> 12) + in5;

-    int t56a =  (in57 *   350         + in7  *  2018         + 1024) >> 11;

-    int t57a = ((in25 * (3349 - 4096) + in39 *  2359         + 2048) >> 12) + in25;

-    int t58a = ((in41 *  2191         + in23 * (3461 - 4096) + 2048) >> 12) + in23;

-    int t59a = ((in9  * (3996 - 4096) + in55 *   897         + 2048) >> 12) + in9;

-    int t60a =  (in49 *   737         + in15 *  1911         + 1024) >> 11;

-    int t61a = ((in17 * (3745 - 4096) + in47 *  1660         + 2048) >> 12) + in17;

-    int t62a = ((in33 *  2824         + in31 * (2967 - 4096) + 2048) >> 12) + in31;

-    int t63a = ((in1  * (4095 - 4096) + in63 *   101         + 2048) >> 12) + in1;

+    int t32a = (in1  *   101 + 2048) >> 12;

+    int t33a = (in31 * -2824 + 2048) >> 12;

+    int t34a = (in17 *  1660 + 2048) >> 12;

+    int t35a = (in15 * -1474 + 2048) >> 12;

+    int t36a = (in9  *   897 + 2048) >> 12;

+    int t37a = (in23 * -2191 + 2048) >> 12;

+    int t38a = (in25 *  2359 + 2048) >> 12;

+    int t39a = (in7  *  -700 + 2048) >> 12;

+    int t40a = (in5  *   501 + 2048) >> 12;

+    int t41a = (in27 * -2520 + 2048) >> 12;

+    int t42a = (in21 *  2019 + 2048) >> 12;

+    int t43a = (in11 * -1092 + 2048) >> 12;

+    int t44a = (in13 *  1285 + 2048) >> 12;

+    int t45a = (in19 * -1842 + 2048) >> 12;

+    int t46a = (in29 *  2675 + 2048) >> 12;

+    int t47a = (in3  *  -301 + 2048) >> 12;

+    int t48a = (in3  *  4085 + 2048) >> 12;

+    int t49a = (in29 *  3102 + 2048) >> 12;

+    int t50a = (in19 *  3659 + 2048) >> 12;

+    int t51a = (in13 *  3889 + 2048) >> 12;

+    int t52a = (in11 *  3948 + 2048) >> 12;

+    int t53a = (in21 *  3564 + 2048) >> 12;

+    int t54a = (in27 *  3229 + 2048) >> 12;

+    int t55a = (in5  *  4065 + 2048) >> 12;

+    int t56a = (in7  *  4036 + 2048) >> 12;

+    int t57a = (in25 *  3349 + 2048) >> 12;

+    int t58a = (in23 *  3461 + 2048) >> 12;

+    int t59a = (in9  *  3996 + 2048) >> 12;

+    int t60a = (in15 *  3822 + 2048) >> 12;

+    int t61a = (in17 *  3745 + 2048) >> 12;

+    int t62a = (in31 *  2967 + 2048) >> 12;

+    int t63a = (in1  *  4095 + 2048) >> 12;

     int t32 = CLIP(t32a + t33a);

     int t33 = CLIP(t32a - t33a);

@@ -591,76 +681,111 @@

     t54  = ((t41a + t54a) * 181 + 128) >> 8;

     t55a = ((t40  + t55 ) * 181 + 128) >> 8;

-    out[ 0 * out_s] = CLIP(tmp[ 0] + t63a);

-    out[ 1 * out_s] = CLIP(tmp[ 1] + t62);

-    out[ 2 * out_s] = CLIP(tmp[ 2] + t61a);

-    out[ 3 * out_s] = CLIP(tmp[ 3] + t60);

-    out[ 4 * out_s] = CLIP(tmp[ 4] + t59a);

-    out[ 5 * out_s] = CLIP(tmp[ 5] + t58);

-    out[ 6 * out_s] = CLIP(tmp[ 6] + t57a);

-    out[ 7 * out_s] = CLIP(tmp[ 7] + t56);

-    out[ 8 * out_s] = CLIP(tmp[ 8] + t55a);

-    out[ 9 * out_s] = CLIP(tmp[ 9] + t54);

-    out[10 * out_s] = CLIP(tmp[10] + t53a);

-    out[11 * out_s] = CLIP(tmp[11] + t52);

-    out[12 * out_s] = CLIP(tmp[12] + t51a);

-    out[13 * out_s] = CLIP(tmp[13] + t50);

-    out[14 * out_s] = CLIP(tmp[14] + t49a);

-    out[15 * out_s] = CLIP(tmp[15] + t48);

-    out[16 * out_s] = CLIP(tmp[16] + t47);

-    out[17 * out_s] = CLIP(tmp[17] + t46a);

-    out[18 * out_s] = CLIP(tmp[18] + t45);

-    out[19 * out_s] = CLIP(tmp[19] + t44a);

-    out[20 * out_s] = CLIP(tmp[20] + t43);

-    out[21 * out_s] = CLIP(tmp[21] + t42a);

-    out[22 * out_s] = CLIP(tmp[22] + t41);

-    out[23 * out_s] = CLIP(tmp[23] + t40a);

-    out[24 * out_s] = CLIP(tmp[24] + t39);

-    out[25 * out_s] = CLIP(tmp[25] + t38a);

-    out[26 * out_s] = CLIP(tmp[26] + t37);

-    out[27 * out_s] = CLIP(tmp[27] + t36a);

-    out[28 * out_s] = CLIP(tmp[28] + t35);

-    out[29 * out_s] = CLIP(tmp[29] + t34a);

-    out[30 * out_s] = CLIP(tmp[30] + t33);

-    out[31 * out_s] = CLIP(tmp[31] + t32a);

-    out[32 * out_s] = CLIP(tmp[31] - t32a);

-    out[33 * out_s] = CLIP(tmp[30] - t33);

-    out[34 * out_s] = CLIP(tmp[29] - t34a);

-    out[35 * out_s] = CLIP(tmp[28] - t35);

-    out[36 * out_s] = CLIP(tmp[27] - t36a);

-    out[37 * out_s] = CLIP(tmp[26] - t37);

-    out[38 * out_s] = CLIP(tmp[25] - t38a);

-    out[39 * out_s] = CLIP(tmp[24] - t39);

-    out[40 * out_s] = CLIP(tmp[23] - t40a);

-    out[41 * out_s] = CLIP(tmp[22] - t41);

-    out[42 * out_s] = CLIP(tmp[21] - t42a);

-    out[43 * out_s] = CLIP(tmp[20] - t43);

-    out[44 * out_s] = CLIP(tmp[19] - t44a);

-    out[45 * out_s] = CLIP(tmp[18] - t45);

-    out[46 * out_s] = CLIP(tmp[17] - t46a);

-    out[47 * out_s] = CLIP(tmp[16] - t47);

-    out[48 * out_s] = CLIP(tmp[15] - t48);

-    out[49 * out_s] = CLIP(tmp[14] - t49a);

-    out[50 * out_s] = CLIP(tmp[13] - t50);

-    out[51 * out_s] = CLIP(tmp[12] - t51a);

-    out[52 * out_s] = CLIP(tmp[11] - t52);

-    out[53 * out_s] = CLIP(tmp[10] - t53a);

-    out[54 * out_s] = CLIP(tmp[ 9] - t54);

-    out[55 * out_s] = CLIP(tmp[ 8] - t55a);

-    out[56 * out_s] = CLIP(tmp[ 7] - t56);

-    out[57 * out_s] = CLIP(tmp[ 6] - t57a);

-    out[58 * out_s] = CLIP(tmp[ 5] - t58);

-    out[59 * out_s] = CLIP(tmp[ 4] - t59a);

-    out[60 * out_s] = CLIP(tmp[ 3] - t60);

-    out[61 * out_s] = CLIP(tmp[ 2] - t61a);

-    out[62 * out_s] = CLIP(tmp[ 1] - t62);

-    out[63 * out_s] = CLIP(tmp[ 0] - t63a);

+    const int t0  = c[ 0 * stride];

+    const int t1  = c[ 2 * stride];

+    const int t2  = c[ 4 * stride];

+    const int t3  = c[ 6 * stride];

+    const int t4  = c[ 8 * stride];

+    const int t5  = c[10 * stride];

+    const int t6  = c[12 * stride];

+    const int t7  = c[14 * stride];

+    const int t8  = c[16 * stride];

+    const int t9  = c[18 * stride];

+    const int t10 = c[20 * stride];

+    const int t11 = c[22 * stride];

+    const int t12 = c[24 * stride];

+    const int t13 = c[26 * stride];

+    const int t14 = c[28 * stride];

+    const int t15 = c[30 * stride];

+    const int t16 = c[32 * stride];

+    const int t17 = c[34 * stride];

+    const int t18 = c[36 * stride];

+    const int t19 = c[38 * stride];

+    const int t20 = c[40 * stride];

+    const int t21 = c[42 * stride];

+    const int t22 = c[44 * stride];

+    const int t23 = c[46 * stride];

+    const int t24 = c[48 * stride];

+    const int t25 = c[50 * stride];

+    const int t26 = c[52 * stride];

+    const int t27 = c[54 * stride];

+    const int t28 = c[56 * stride];

+    const int t29 = c[58 * stride];

+    const int t30 = c[60 * stride];

+    const int t31 = c[62 * stride];

+    c[ 0 * stride] = CLIP(t0  + t63a);

+    c[ 1 * stride] = CLIP(t1  + t62);

+    c[ 2 * stride] = CLIP(t2  + t61a);

+    c[ 3 * stride] = CLIP(t3  + t60);

+    c[ 4 * stride] = CLIP(t4  + t59a);

+    c[ 5 * stride] = CLIP(t5  + t58);

+    c[ 6 * stride] = CLIP(t6  + t57a);

+    c[ 7 * stride] = CLIP(t7  + t56);

+    c[ 8 * stride] = CLIP(t8  + t55a);

+    c[ 9 * stride] = CLIP(t9  + t54);

+    c[10 * stride] = CLIP(t10 + t53a);

+    c[11 * stride] = CLIP(t11 + t52);

+    c[12 * stride] = CLIP(t12 + t51a);

+    c[13 * stride] = CLIP(t13 + t50);

+    c[14 * stride] = CLIP(t14 + t49a);

+    c[15 * stride] = CLIP(t15 + t48);

+    c[16 * stride] = CLIP(t16 + t47);

+    c[17 * stride] = CLIP(t17 + t46a);

+    c[18 * stride] = CLIP(t18 + t45);

+    c[19 * stride] = CLIP(t19 + t44a);

+    c[20 * stride] = CLIP(t20 + t43);

+    c[21 * stride] = CLIP(t21 + t42a);

+    c[22 * stride] = CLIP(t22 + t41);

+    c[23 * stride] = CLIP(t23 + t40a);

+    c[24 * stride] = CLIP(t24 + t39);

+    c[25 * stride] = CLIP(t25 + t38a);

+    c[26 * stride] = CLIP(t26 + t37);

+    c[27 * stride] = CLIP(t27 + t36a);

+    c[28 * stride] = CLIP(t28 + t35);

+    c[29 * stride] = CLIP(t29 + t34a);

+    c[30 * stride] = CLIP(t30 + t33);

+    c[31 * stride] = CLIP(t31 + t32a);

+    c[32 * stride] = CLIP(t31 - t32a);

+    c[33 * stride] = CLIP(t30 - t33);

+    c[34 * stride] = CLIP(t29 - t34a);

+    c[35 * stride] = CLIP(t28 - t35);

+    c[36 * stride] = CLIP(t27 - t36a);

+    c[37 * stride] = CLIP(t26 - t37);

+    c[38 * stride] = CLIP(t25 - t38a);

+    c[39 * stride] = CLIP(t24 - t39);

+    c[40 * stride] = CLIP(t23 - t40a);

+    c[41 * stride] = CLIP(t22 - t41);

+    c[42 * stride] = CLIP(t21 - t42a);

+    c[43 * stride] = CLIP(t20 - t43);

+    c[44 * stride] = CLIP(t19 - t44a);

+    c[45 * stride] = CLIP(t18 - t45);

+    c[46 * stride] = CLIP(t17 - t46a);

+    c[47 * stride] = CLIP(t16 - t47);

+    c[48 * stride] = CLIP(t15 - t48);

+    c[49 * stride] = CLIP(t14 - t49a);

+    c[50 * stride] = CLIP(t13 - t50);

+    c[51 * stride] = CLIP(t12 - t51a);

+    c[52 * stride] = CLIP(t11 - t52);

+    c[53 * stride] = CLIP(t10 - t53a);

+    c[54 * stride] = CLIP(t9  - t54);

+    c[55 * stride] = CLIP(t8  - t55a);

+    c[56 * stride] = CLIP(t7  - t56);

+    c[57 * stride] = CLIP(t6  - t57a);

+    c[58 * stride] = CLIP(t5  - t58);

+    c[59 * stride] = CLIP(t4  - t59a);

+    c[60 * stride] = CLIP(t3  - t60);

+    c[61 * stride] = CLIP(t2  - t61a);

+    c[62 * stride] = CLIP(t1  - t62);

+    c[63 * stride] = CLIP(t0  - t63a);

-void dav1d_inv_adst4_1d_c(const int32_t *const in, const ptrdiff_t in_s,

-                          int32_t *const out, const ptrdiff_t out_s,

-                          const int range)

+static NOINLINE void

+inv_adst4_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,

+                        const int min, const int max,

+                        int32_t *const out, const ptrdiff_t out_s)

+    assert(in_s > 0 && out_s != 0);

     const int in0 = in[0 * in_s], in1 = in[1 * in_s];

     const int in2 = in[2 * in_s], in3 = in[3 * in_s];

@@ -676,11 +801,12 @@

                      in0 + in2 - in1;

-void dav1d_inv_adst8_1d_c(const int32_t *const in, const ptrdiff_t in_s,

-                          int32_t *const out, const ptrdiff_t out_s,

-                          const int max)

+static NOINLINE void

+inv_adst8_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,

+                        const int min, const int max,

+                        int32_t *const out, const ptrdiff_t out_s)

-    const int min = -max - 1;

+    assert(in_s > 0 && out_s != 0);

     const int in0 = in[0 * in_s], in1 = in[1 * in_s];

     const int in2 = in[2 * in_s], in3 = in[3 * in_s];

     const int in4 = in[4 * in_s], in5 = in[5 * in_s];

@@ -724,11 +850,12 @@

     out[5 * out_s] = -(((t6 - t7) * 181 + 128) >> 8);

-void dav1d_inv_adst16_1d_c(const int32_t *const in, const ptrdiff_t in_s,

-                           int32_t *const out, const ptrdiff_t out_s,

-                           const int max)

+static NOINLINE void

+inv_adst16_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,

+                         const int min, const int max,

+                         int32_t *const out, const ptrdiff_t out_s)

-    const int min = -max - 1;

+    assert(in_s > 0 && out_s != 0);

     const int in0  = in[ 0 * in_s], in1  = in[ 1 * in_s];

     const int in2  = in[ 2 * in_s], in3  = in[ 3 * in_s];

     const int in4  = in[ 4 * in_s], in5  = in[ 5 * in_s];

@@ -834,59 +961,66 @@

     out[10 * out_s] =   ((t14a - t15a) * 181 + 128) >> 8;

-#define flip_inv_adst(sz) \

-void dav1d_inv_flipadst##sz##_1d_c(const int32_t *const in, const ptrdiff_t in_s, \

-                                   int32_t *const out, const ptrdiff_t out_s, \

-                                   const int range) \

+#define inv_adst_1d(sz) \

+void dav1d_inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \

+                               const int min, const int max) \

{ \

-    dav1d_inv_adst##sz##_1d_c(in, in_s, &out[(sz - 1) * out_s], -out_s, range); \

+    inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \

+} \

+void dav1d_inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \

+                                   const int min, const int max) \

+{ \

+    inv_adst##sz##_1d_internal_c(c, stride, min, max, \

+                                 &c[(sz - 1) * stride], -stride); \

-flip_inv_adst(4)

-flip_inv_adst(8)

-flip_inv_adst(16)

+inv_adst_1d( 4)

+inv_adst_1d( 8)

+inv_adst_1d(16)

-#undef flip_inv_adst

+#undef inv_adst_1d

-void dav1d_inv_identity4_1d_c(const int32_t *const in, const ptrdiff_t in_s,

-                              int32_t *const out, const ptrdiff_t out_s,

-                              const int range)

+void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,

+                              const int min, const int max)

-    for (int i = 0; i < 4; i++)

-        out[out_s * i] = in[in_s * i] + ((in[in_s * i] * 1697 + 2048) >> 12);

+    assert(stride > 0);

+    for (int i = 0; i < 4; i++) {

+        const int in = c[stride * i];

+        c[stride * i] = in + ((in * 1697 + 2048) >> 12);

+    }

-void dav1d_inv_identity8_1d_c(const int32_t *const in, const ptrdiff_t in_s,

-                              int32_t *const out, const ptrdiff_t out_s,

-                              const int range)

+void dav1d_inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,

+                              const int min, const int max)

+    assert(stride > 0);

     for (int i = 0; i < 8; i++)

-        out[out_s * i] = in[in_s * i] * 2;

+        c[stride * i] *= 2;

-void dav1d_inv_identity16_1d_c(const int32_t *const in, const ptrdiff_t in_s,

-                               int32_t *const out, const ptrdiff_t out_s,

-                               const int range)

+void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,

+                               const int min, const int max)

-    for (int i = 0; i < 16; i++)

-        out[out_s * i] = 2 * in[in_s * i] + ((in[in_s * i] * 1697 + 1024) >> 11);

+    assert(stride > 0);

+    for (int i = 0; i < 16; i++) {

+        const int in = c[stride * i];

+        c[stride * i] = 2 * in + ((in * 1697 + 1024) >> 11);

+    }

-void dav1d_inv_identity32_1d_c(const int32_t *const in, const ptrdiff_t in_s,

-                               int32_t *const out, const ptrdiff_t out_s,

-                               const int range)

+void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,

+                               const int min, const int max)

+    assert(stride > 0);

     for (int i = 0; i < 32; i++)

-        out[out_s * i] = in[in_s * i] * 4;

+        c[stride * i] *= 4;

-void dav1d_inv_wht4_1d_c(const int32_t *const in, const ptrdiff_t in_s,

-                         int32_t *const out, const ptrdiff_t out_s,

-                         const int pass)

-{

-    const int sh = 2 * !pass;

-    const int in0 = in[0 * in_s] >> sh, in1 = in[1 * in_s] >> sh;

-    const int in2 = in[2 * in_s] >> sh, in3 = in[3 * in_s] >> sh;

+void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) {

+    assert(stride > 0);

+    const int in0 = c[0 * stride], in1 = c[1 * stride];

+    const int in2 = c[2 * stride], in3 = c[3 * stride];

     const int t0 = in0 + in1;

     const int t2 = in2 - in3;

     const int t4 = (t0 - t2) >> 1;

@@ -893,8 +1027,8 @@

     const int t3 = t4 - in3;

     const int t1 = t4 - in1;

-    out[0 * out_s] = t0 - t3;

-    out[1 * out_s] = t3;

-    out[2 * out_s] = t1;

-    out[3 * out_s] = t2 + t1;

+    c[0 * stride] = t0 - t3;

+    c[1 * stride] = t3;

+    c[2 * stride] = t1;

+    c[3 * stride] = t2 + t1;

--- a/src/itx_1d.h

+++ b/src/itx_1d.h

@@ -32,8 +32,7 @@

 #define DAV1D_SRC_ITX_1D_H

 #define decl_itx_1d_fn(name) \

-void (name)(const int32_t *in, ptrdiff_t in_s, \

-            int32_t *out, ptrdiff_t out_s, const int range)

+void (name)(int32_t *c, ptrdiff_t stride, int min, int max)

 typedef decl_itx_1d_fn(*itx_1d_fn);

 decl_itx_1d_fn(dav1d_inv_dct4_1d_c);

@@ -55,6 +54,6 @@

 decl_itx_1d_fn(dav1d_inv_identity16_1d_c);

 decl_itx_1d_fn(dav1d_inv_identity32_1d_c);

-decl_itx_1d_fn(dav1d_inv_wht4_1d_c);

+void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride);

 #endif /* DAV1D_SRC_ITX_1D_H */

--- a/src/itx_tmpl.c

+++ b/src/itx_tmpl.c

@@ -37,64 +37,66 @@

 #include "src/itx.h"

 #include "src/itx_1d.h"

-static void NOINLINE

-inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,

-               coef *const coeff, const int eob,

-               const int w, const int h, const int shift,

+static NOINLINE void

+inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,

+               const int eob, const int w, const int h, const int shift,

                const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,

                const int has_dconly HIGHBD_DECL_SUFFIX)

-    int i, j;

-    assert((h >= 4 && h <= 64) && (w >= 4 && w <= 64));

+    assert(w >= 4 && w <= 64);

+    assert(h >= 4 && h <= 64);

+    assert(eob >= 0);

     const int is_rect2 = w * 2 == h || h * 2 == w;

-    const int bitdepth = bitdepth_from_max(bitdepth_max);

     const int rnd = (1 << shift) >> 1;

-    if (has_dconly && eob == 0) {

+    if (eob < has_dconly) {

         int dc = coeff[0];

         coeff[0] = 0;

         if (is_rect2)

-            dc = (dc * 2896 + 2048) >> 12;

-        dc = (dc * 2896 + 2048) >> 12;

+            dc = (dc * 181 + 128) >> 8;

+        dc = (dc * 181 + 128) >> 8;

         dc = (dc + rnd) >> shift;

-        dc = (dc * 2896 + 2048) >> 12;

-        dc = (dc + 8) >> 4;

-        for (j = 0; j < h; j++)

-            for (i = 0; i < w; i++)

-                dst[i + j * PXSTRIDE(stride)] =

-                    iclip_pixel(dst[i + j * PXSTRIDE(stride)] + dc);

+        dc = (dc * 181 + 128 + 2048) >> 12;

+        for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))

+            for (int x = 0; x < w; x++)

+                dst[x] = iclip_pixel(dst[x] + dc);

         return;

-    assert(eob > 0 || (eob == 0 && !has_dconly));

-    const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);

-    // Maximum value for h and w is 64

-    int32_t tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];

-    const int row_clip_max = (1 << (bitdepth + 8 - 1)) - 1;

-    const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) - 1;

+    const int sh = imin(h, 32), sw = imin(w, 32);

+#if BITDEPTH == 8

+    const int row_clip_min = INT16_MIN;

+    const int col_clip_min = INT16_MIN;

+#else

+    const int row_clip_min = (int) ((unsigned) ~bitdepth_max << 7);

+    const int col_clip_min = (int) ((unsigned) ~bitdepth_max << 5);

+#endif

+    const int row_clip_max = ~row_clip_min;

+    const int col_clip_max = ~col_clip_min;

-    if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));

-    for (i = 0; i < sh; i++) {

-        for (j = 0; j < sw; j++) {

-            in_mem[j] = coeff[i + j * sh];

-            if (is_rect2)

-                in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;

-        }

-        first_1d_fn(in_mem, 1, &tmp[i * w], 1, row_clip_max);

-        for (j = 0; j < w; j++)

-            tmp[i * w + j] = iclip((tmp[i * w + j] + rnd) >> shift,

-                                   -col_clip_max - 1, col_clip_max);

+    int32_t tmp[64 * 64], *c = tmp;

+    for (int y = 0; y < sh; y++, c += w) {

+        if (is_rect2)

+            for (int x = 0; x < sw; x++)

+                c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;

+        else

+            for (int x = 0; x < sw; x++)

+                c[x] = coeff[y + x * sh];

+        first_1d_fn(c, 1, row_clip_min, row_clip_max);

-    if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));

-    for (i = 0; i < w; i++) {

-        second_1d_fn(&tmp[i], w, out, 1, col_clip_max);

-        for (j = 0; j < h; j++)

-            dst[i + j * PXSTRIDE(stride)] =

-                iclip_pixel(dst[i + j * PXSTRIDE(stride)] +

-                            ((out[j] + 8) >> 4));

-    }

-    memset(coeff, 0, sizeof(*coeff) * sh * sw);

+    memset(coeff, 0, sizeof(*coeff) * sw * sh);

+    for (int i = 0; i < w * sh; i++)

+        tmp[i] = iclip((tmp[i] + rnd) >> shift, col_clip_min, col_clip_max);

+    for (int x = 0; x < w; x++)

+        second_1d_fn(&tmp[x], w, col_clip_min, col_clip_max);

+    c = tmp;

+    for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))

+        for (int x = 0; x < w; x++)

+            dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4));

 #define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \

@@ -161,21 +163,21 @@

                                        coef *const coeff, const int eob

                                        HIGHBD_DECL_SUFFIX)

-    int32_t tmp[4 * 4], out[4], in_mem[4];

-    for (int i = 0; i < 4; i++) {

-        for (int j = 0; j < 4; j++)

-            in_mem[j] = coeff[i + j * 4];

-        dav1d_inv_wht4_1d_c(in_mem, 1, &tmp[i * 4], 1, 0);

+    int32_t tmp[4 * 4], *c = tmp;

+    for (int y = 0; y < 4; y++, c += 4) {

+        for (int x = 0; x < 4; x++)

+            c[x] = coeff[y + x * 4] >> 2;

+        dav1d_inv_wht4_1d_c(c, 1);

-    for (int i = 0; i < 4; i++) {

-        dav1d_inv_wht4_1d_c(&tmp[i], 4, out, 1, 1);

-        for (int j = 0; j < 4; j++)

-            dst[i + j * PXSTRIDE(stride)] =

-                iclip_pixel(dst[i + j * PXSTRIDE(stride)] + out[j]);

-    }

     memset(coeff, 0, sizeof(*coeff) * 4 * 4);

+    for (int x = 0; x < 4; x++)

+        dav1d_inv_wht4_1d_c(&tmp[x], 4);

+    c = tmp;

+    for (int y = 0; y < 4; y++, dst += PXSTRIDE(stride))

+        for (int x = 0; x < 4; x++)

+            dst[x] = iclip_pixel(dst[x] + *c++);

 COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {