ref: 5098b23ab3a98dc2202449dd3bd0c06902c4ff64
parent: c627f16f5ae671c4583b6c85006ce648224814e3
author: Janne Grunau <janne-vlc@jannau.net>
date: Mon Nov 12 18:59:02 EST 2018
itx: clip according to spec, fixes #103, #158. This does not adjust the AVX2 asm. The asm clips in many places to the required range (16-bit signed) for performance reasons. No mismatch was observed with coefficients generated by the forward transform in checkasm in 10 thousand runs.
--- a/src/itx_1d.c
+++ b/src/itx_1d.c
@@ -32,10 +32,13 @@
#include "common/attributes.h"
+#define CLIP(a) iclip(a, min, max)
+
static void NOINLINE
inv_dct4_1d(const coef *const in, const ptrdiff_t in_s,
- coef *const out, const ptrdiff_t out_s)
+ coef *const out, const ptrdiff_t out_s, const int max)
{
+ const int min = -max - 1;
const int in0 = in[0 * in_s], in1 = in[1 * in_s];
const int in2 = in[2 * in_s], in3 = in[3 * in_s];
@@ -44,19 +47,20 @@
int t2 = (in1 * 1567 - in3 * 3784 + 2048) >> 12;
int t3 = (in1 * 3784 + in3 * 1567 + 2048) >> 12;
- out[0 * out_s] = t0 + t3;
- out[1 * out_s] = t1 + t2;
- out[2 * out_s] = t1 - t2;
- out[3 * out_s] = t0 - t3;
+ out[0 * out_s] = CLIP(t0 + t3);
+ out[1 * out_s] = CLIP(t1 + t2);
+ out[2 * out_s] = CLIP(t1 - t2);
+ out[3 * out_s] = CLIP(t0 - t3);
}
static void NOINLINE
inv_dct8_1d(const coef *const in, const ptrdiff_t in_s,
- coef *const out, const ptrdiff_t out_s)
+ coef *const out, const ptrdiff_t out_s, const int max)
{
+ const int min = -max - 1;
coef tmp[4];
- inv_dct4_1d(in, in_s * 2, tmp, 1);
+ inv_dct4_1d(in, in_s * 2, tmp, 1, max);
const int in1 = in[1 * in_s], in3 = in[3 * in_s];
const int in5 = in[5 * in_s], in7 = in[7 * in_s];
@@ -66,31 +70,32 @@
int t6a = (in5 * 2276 + in3 * 3406 + 2048) >> 12;
int t7a = (in1 * 4017 + in7 * 799 + 2048) >> 12;
- int t4 = t4a + t5a;
- t5a = t4a - t5a;
- int t7 = t7a + t6a;
- t6a = t7a - t6a;
+ int t4 = CLIP(t4a + t5a);
+ t5a = CLIP(t4a - t5a);
+ int t7 = CLIP(t7a + t6a);
+ t6a = CLIP(t7a - t6a);
int t5 = ((t6a - t5a) * 2896 + 2048) >> 12;
int t6 = ((t6a + t5a) * 2896 + 2048) >> 12;
- out[0 * out_s] = tmp[0] + t7;
- out[1 * out_s] = tmp[1] + t6;
- out[2 * out_s] = tmp[2] + t5;
- out[3 * out_s] = tmp[3] + t4;
- out[4 * out_s] = tmp[3] - t4;
- out[5 * out_s] = tmp[2] - t5;
- out[6 * out_s] = tmp[1] - t6;
- out[7 * out_s] = tmp[0] - t7;
+ out[0 * out_s] = CLIP(tmp[0] + t7);
+ out[1 * out_s] = CLIP(tmp[1] + t6);
+ out[2 * out_s] = CLIP(tmp[2] + t5);
+ out[3 * out_s] = CLIP(tmp[3] + t4);
+ out[4 * out_s] = CLIP(tmp[3] - t4);
+ out[5 * out_s] = CLIP(tmp[2] - t5);
+ out[6 * out_s] = CLIP(tmp[1] - t6);
+ out[7 * out_s] = CLIP(tmp[0] - t7);
}
static void NOINLINE
inv_dct16_1d(const coef *const in, const ptrdiff_t in_s,
- coef *const out, const ptrdiff_t out_s)
+ coef *const out, const ptrdiff_t out_s, const int max)
{
+ const int min = -max - 1;
coef tmp[8];
- inv_dct8_1d(in, in_s * 2, tmp, 1);
+ inv_dct8_1d(in, in_s * 2, tmp, 1, max);
const int in1 = in[ 1 * in_s], in3 = in[ 3 * in_s];
const int in5 = in[ 5 * in_s], in7 = in[ 7 * in_s];
@@ -106,14 +111,14 @@
int t11a = (in13 * 3920 - in3 * 1189 + 2048) >> 12;
int t12a = (in13 * 1189 + in3 * 3920 + 2048) >> 12;
- int t8 = t8a + t9a;
- int t9 = t8a - t9a;
- int t10 = t11a - t10a;
- int t11 = t11a + t10a;
- int t12 = t12a + t13a;
- int t13 = t12a - t13a;
- int t14 = t15a - t14a;
- int t15 = t15a + t14a;
+ int t8 = CLIP(t8a + t9a);
+ int t9 = CLIP(t8a - t9a);
+ int t10 = CLIP(t11a - t10a);
+ int t11 = CLIP(t11a + t10a);
+ int t12 = CLIP(t12a + t13a);
+ int t13 = CLIP(t12a - t13a);
+ int t14 = CLIP(t15a - t14a);
+ int t15 = CLIP(t15a + t14a);
t9a = ( t14 * 1567 - t9 * 3784 + 2048) >> 12;
t14a = ( t14 * 3784 + t9 * 1567 + 2048) >> 12;
@@ -120,14 +125,14 @@
t10a = (-(t13 * 3784 + t10 * 1567) + 2048) >> 12;
t13a = ( t13 * 1567 - t10 * 3784 + 2048) >> 12;
- t8a = t8 + t11;
- t9 = t9a + t10a;
- t10 = t9a - t10a;
- t11a = t8 - t11;
- t12a = t15 - t12;
- t13 = t14a - t13a;
- t14 = t14a + t13a;
- t15a = t15 + t12;
+ t8a = CLIP(t8 + t11);
+ t9 = CLIP(t9a + t10a);
+ t10 = CLIP(t9a - t10a);
+ t11a = CLIP(t8 - t11);
+ t12a = CLIP(t15 - t12);
+ t13 = CLIP(t14a - t13a);
+ t14 = CLIP(t14a + t13a);
+ t15a = CLIP(t15 + t12);
t10a = ((t13 - t10) * 2896 + 2048) >> 12;
t13a = ((t13 + t10) * 2896 + 2048) >> 12;
@@ -134,31 +139,32 @@
t11 = ((t12a - t11a) * 2896 + 2048) >> 12;
t12 = ((t12a + t11a) * 2896 + 2048) >> 12;
- out[ 0 * out_s] = tmp[0] + t15a;
- out[ 1 * out_s] = tmp[1] + t14;
- out[ 2 * out_s] = tmp[2] + t13a;
- out[ 3 * out_s] = tmp[3] + t12;
- out[ 4 * out_s] = tmp[4] + t11;
- out[ 5 * out_s] = tmp[5] + t10a;
- out[ 6 * out_s] = tmp[6] + t9;
- out[ 7 * out_s] = tmp[7] + t8a;
- out[ 8 * out_s] = tmp[7] - t8a;
- out[ 9 * out_s] = tmp[6] - t9;
- out[10 * out_s] = tmp[5] - t10a;
- out[11 * out_s] = tmp[4] - t11;
- out[12 * out_s] = tmp[3] - t12;
- out[13 * out_s] = tmp[2] - t13a;
- out[14 * out_s] = tmp[1] - t14;
- out[15 * out_s] = tmp[0] - t15a;
+ out[ 0 * out_s] = CLIP(tmp[0] + t15a);
+ out[ 1 * out_s] = CLIP(tmp[1] + t14);
+ out[ 2 * out_s] = CLIP(tmp[2] + t13a);
+ out[ 3 * out_s] = CLIP(tmp[3] + t12);
+ out[ 4 * out_s] = CLIP(tmp[4] + t11);
+ out[ 5 * out_s] = CLIP(tmp[5] + t10a);
+ out[ 6 * out_s] = CLIP(tmp[6] + t9);
+ out[ 7 * out_s] = CLIP(tmp[7] + t8a);
+ out[ 8 * out_s] = CLIP(tmp[7] - t8a);
+ out[ 9 * out_s] = CLIP(tmp[6] - t9);
+ out[10 * out_s] = CLIP(tmp[5] - t10a);
+ out[11 * out_s] = CLIP(tmp[4] - t11);
+ out[12 * out_s] = CLIP(tmp[3] - t12);
+ out[13 * out_s] = CLIP(tmp[2] - t13a);
+ out[14 * out_s] = CLIP(tmp[1] - t14);
+ out[15 * out_s] = CLIP(tmp[0] - t15a);
}
static void NOINLINE
inv_dct32_1d(const coef *const in, const ptrdiff_t in_s,
- coef *const out, const ptrdiff_t out_s)
+ coef *const out, const ptrdiff_t out_s, const int max)
{
+ const int min = -max - 1;
coef tmp[16];
- inv_dct16_1d(in, in_s * 2, tmp, 1);
+ inv_dct16_1d(in, in_s * 2, tmp, 1, max);
const int in1 = in[ 1 * in_s], in3 = in[ 3 * in_s];
const int in5 = in[ 5 * in_s], in7 = in[ 7 * in_s];
@@ -186,22 +192,22 @@
int t23a = (in29 * 4052 - in3 * 601 + 2048) >> 12;
int t24a = (in29 * 601 + in3 * 4052 + 2048) >> 12;
- int t16 = t16a + t17a;
- int t17 = t16a - t17a;
- int t18 = t19a - t18a;
- int t19 = t19a + t18a;
- int t20 = t20a + t21a;
- int t21 = t20a - t21a;
- int t22 = t23a - t22a;
- int t23 = t23a + t22a;
- int t24 = t24a + t25a;
- int t25 = t24a - t25a;
- int t26 = t27a - t26a;
- int t27 = t27a + t26a;
- int t28 = t28a + t29a;
- int t29 = t28a - t29a;
- int t30 = t31a - t30a;
- int t31 = t31a + t30a;
+ int t16 = CLIP(t16a + t17a);
+ int t17 = CLIP(t16a - t17a);
+ int t18 = CLIP(t19a - t18a);
+ int t19 = CLIP(t19a + t18a);
+ int t20 = CLIP(t20a + t21a);
+ int t21 = CLIP(t20a - t21a);
+ int t22 = CLIP(t23a - t22a);
+ int t23 = CLIP(t23a + t22a);
+ int t24 = CLIP(t24a + t25a);
+ int t25 = CLIP(t24a - t25a);
+ int t26 = CLIP(t27a - t26a);
+ int t27 = CLIP(t27a + t26a);
+ int t28 = CLIP(t28a + t29a);
+ int t29 = CLIP(t28a - t29a);
+ int t30 = CLIP(t31a - t30a);
+ int t31 = CLIP(t31a + t30a);
t17a = ( t30 * 799 - t17 * 4017 + 2048) >> 12;
t30a = ( t30 * 4017 + t17 * 799 + 2048) >> 12;
@@ -212,22 +218,22 @@
t22a = (-(t25 * 2276 + t22 * 3406) + 2048) >> 12;
t25a = ( t25 * 3406 - t22 * 2276 + 2048) >> 12;
- t16a = t16 + t19;
- t17 = t17a + t18a;
- t18 = t17a - t18a;
- t19a = t16 - t19;
- t20a = t23 - t20;
- t21 = t22a - t21a;
- t22 = t22a + t21a;
- t23a = t23 + t20;
- t24a = t24 + t27;
- t25 = t25a + t26a;
- t26 = t25a - t26a;
- t27a = t24 - t27;
- t28a = t31 - t28;
- t29 = t30a - t29a;
- t30 = t30a + t29a;
- t31a = t31 + t28;
+ t16a = CLIP(t16 + t19);
+ t17 = CLIP(t17a + t18a);
+ t18 = CLIP(t17a - t18a);
+ t19a = CLIP(t16 - t19);
+ t20a = CLIP(t23 - t20);
+ t21 = CLIP(t22a - t21a);
+ t22 = CLIP(t22a + t21a);
+ t23a = CLIP(t23 + t20);
+ t24a = CLIP(t24 + t27);
+ t25 = CLIP(t25a + t26a);
+ t26 = CLIP(t25a - t26a);
+ t27a = CLIP(t24 - t27);
+ t28a = CLIP(t31 - t28);
+ t29 = CLIP(t30a - t29a);
+ t30 = CLIP(t30a + t29a);
+ t31a = CLIP(t31 + t28);
t18a = ( t29 * 1567 - t18 * 3784 + 2048) >> 12;
t29a = ( t29 * 3784 + t18 * 1567 + 2048) >> 12;
@@ -238,22 +244,22 @@
t21a = (-(t26 * 3784 + t21 * 1567) + 2048) >> 12;
t26a = ( t26 * 1567 - t21 * 3784 + 2048) >> 12;
- t16 = t16a + t23a;
- t17a = t17 + t22;
- t18 = t18a + t21a;
- t19a = t19 + t20;
- t20a = t19 - t20;
- t21 = t18a - t21a;
- t22a = t17 - t22;
- t23 = t16a - t23a;
- t24 = t31a - t24a;
- t25a = t30 - t25;
- t26 = t29a - t26a;
- t27a = t28 - t27;
- t28a = t28 + t27;
- t29 = t29a + t26a;
- t30a = t30 + t25;
- t31 = t31a + t24a;
+ t16 = CLIP(t16a + t23a);
+ t17a = CLIP(t17 + t22);
+ t18 = CLIP(t18a + t21a);
+ t19a = CLIP(t19 + t20);
+ t20a = CLIP(t19 - t20);
+ t21 = CLIP(t18a - t21a);
+ t22a = CLIP(t17 - t22);
+ t23 = CLIP(t16a - t23a);
+ t24 = CLIP(t31a - t24a);
+ t25a = CLIP(t30 - t25);
+ t26 = CLIP(t29a - t26a);
+ t27a = CLIP(t28 - t27);
+ t28a = CLIP(t28 + t27);
+ t29 = CLIP(t29a + t26a);
+ t30a = CLIP(t30 + t25);
+ t31 = CLIP(t31a + t24a);
t20 = ((t27a - t20a) * 2896 + 2048) >> 12;
t27 = ((t27a + t20a) * 2896 + 2048) >> 12;
@@ -264,47 +270,48 @@
t23a = ((t24 - t23 ) * 2896 + 2048) >> 12;
t24a = ((t24 + t23 ) * 2896 + 2048) >> 12;
- out[ 0 * out_s] = tmp[ 0] + t31;
- out[ 1 * out_s] = tmp[ 1] + t30a;
- out[ 2 * out_s] = tmp[ 2] + t29;
- out[ 3 * out_s] = tmp[ 3] + t28a;
- out[ 4 * out_s] = tmp[ 4] + t27;
- out[ 5 * out_s] = tmp[ 5] + t26a;
- out[ 6 * out_s] = tmp[ 6] + t25;
- out[ 7 * out_s] = tmp[ 7] + t24a;
- out[ 8 * out_s] = tmp[ 8] + t23a;
- out[ 9 * out_s] = tmp[ 9] + t22;
- out[10 * out_s] = tmp[10] + t21a;
- out[11 * out_s] = tmp[11] + t20;
- out[12 * out_s] = tmp[12] + t19a;
- out[13 * out_s] = tmp[13] + t18;
- out[14 * out_s] = tmp[14] + t17a;
- out[15 * out_s] = tmp[15] + t16;
- out[16 * out_s] = tmp[15] - t16;
- out[17 * out_s] = tmp[14] - t17a;
- out[18 * out_s] = tmp[13] - t18;
- out[19 * out_s] = tmp[12] - t19a;
- out[20 * out_s] = tmp[11] - t20;
- out[21 * out_s] = tmp[10] - t21a;
- out[22 * out_s] = tmp[ 9] - t22;
- out[23 * out_s] = tmp[ 8] - t23a;
- out[24 * out_s] = tmp[ 7] - t24a;
- out[25 * out_s] = tmp[ 6] - t25;
- out[26 * out_s] = tmp[ 5] - t26a;
- out[27 * out_s] = tmp[ 4] - t27;
- out[28 * out_s] = tmp[ 3] - t28a;
- out[29 * out_s] = tmp[ 2] - t29;
- out[30 * out_s] = tmp[ 1] - t30a;
- out[31 * out_s] = tmp[ 0] - t31;
+ out[ 0 * out_s] = CLIP(tmp[ 0] + t31);
+ out[ 1 * out_s] = CLIP(tmp[ 1] + t30a);
+ out[ 2 * out_s] = CLIP(tmp[ 2] + t29);
+ out[ 3 * out_s] = CLIP(tmp[ 3] + t28a);
+ out[ 4 * out_s] = CLIP(tmp[ 4] + t27);
+ out[ 5 * out_s] = CLIP(tmp[ 5] + t26a);
+ out[ 6 * out_s] = CLIP(tmp[ 6] + t25);
+ out[ 7 * out_s] = CLIP(tmp[ 7] + t24a);
+ out[ 8 * out_s] = CLIP(tmp[ 8] + t23a);
+ out[ 9 * out_s] = CLIP(tmp[ 9] + t22);
+ out[10 * out_s] = CLIP(tmp[10] + t21a);
+ out[11 * out_s] = CLIP(tmp[11] + t20);
+ out[12 * out_s] = CLIP(tmp[12] + t19a);
+ out[13 * out_s] = CLIP(tmp[13] + t18);
+ out[14 * out_s] = CLIP(tmp[14] + t17a);
+ out[15 * out_s] = CLIP(tmp[15] + t16);
+ out[16 * out_s] = CLIP(tmp[15] - t16);
+ out[17 * out_s] = CLIP(tmp[14] - t17a);
+ out[18 * out_s] = CLIP(tmp[13] - t18);
+ out[19 * out_s] = CLIP(tmp[12] - t19a);
+ out[20 * out_s] = CLIP(tmp[11] - t20);
+ out[21 * out_s] = CLIP(tmp[10] - t21a);
+ out[22 * out_s] = CLIP(tmp[ 9] - t22);
+ out[23 * out_s] = CLIP(tmp[ 8] - t23a);
+ out[24 * out_s] = CLIP(tmp[ 7] - t24a);
+ out[25 * out_s] = CLIP(tmp[ 6] - t25);
+ out[26 * out_s] = CLIP(tmp[ 5] - t26a);
+ out[27 * out_s] = CLIP(tmp[ 4] - t27);
+ out[28 * out_s] = CLIP(tmp[ 3] - t28a);
+ out[29 * out_s] = CLIP(tmp[ 2] - t29);
+ out[30 * out_s] = CLIP(tmp[ 1] - t30a);
+ out[31 * out_s] = CLIP(tmp[ 0] - t31);
}
static void NOINLINE
inv_dct64_1d(const coef *const in, const ptrdiff_t in_s,
- coef *const out, const ptrdiff_t out_s)
+ coef *const out, const ptrdiff_t out_s, const int max)
{
+ const int min = -max - 1;
coef tmp[32];
- inv_dct32_1d(in, in_s * 2, tmp, 1);
+ inv_dct32_1d(in, in_s * 2, tmp, 1, max);
const int in1 = in[ 1 * in_s], in3 = in[ 3 * in_s];
const int in5 = in[ 5 * in_s], in7 = in[ 7 * in_s];
@@ -356,38 +363,38 @@
int t62a = (in33 * 2824 + in31 * 2967 + 2048) >> 12;
int t63a = (in1 * 4095 + in63 * 101 + 2048) >> 12;
- int t32 = t32a + t33a;
- int t33 = t32a - t33a;
- int t34 = t35a - t34a;
- int t35 = t35a + t34a;
- int t36 = t36a + t37a;
- int t37 = t36a - t37a;
- int t38 = t39a - t38a;
- int t39 = t39a + t38a;
- int t40 = t40a + t41a;
- int t41 = t40a - t41a;
- int t42 = t43a - t42a;
- int t43 = t43a + t42a;
- int t44 = t44a + t45a;
- int t45 = t44a - t45a;
- int t46 = t47a - t46a;
- int t47 = t47a + t46a;
- int t48 = t48a + t49a;
- int t49 = t48a - t49a;
- int t50 = t51a - t50a;
- int t51 = t51a + t50a;
- int t52 = t52a + t53a;
- int t53 = t52a - t53a;
- int t54 = t55a - t54a;
- int t55 = t55a + t54a;
- int t56 = t56a + t57a;
- int t57 = t56a - t57a;
- int t58 = t59a - t58a;
- int t59 = t59a + t58a;
- int t60 = t60a + t61a;
- int t61 = t60a - t61a;
- int t62 = t63a - t62a;
- int t63 = t63a + t62a;
+ int t32 = CLIP(t32a + t33a);
+ int t33 = CLIP(t32a - t33a);
+ int t34 = CLIP(t35a - t34a);
+ int t35 = CLIP(t35a + t34a);
+ int t36 = CLIP(t36a + t37a);
+ int t37 = CLIP(t36a - t37a);
+ int t38 = CLIP(t39a - t38a);
+ int t39 = CLIP(t39a + t38a);
+ int t40 = CLIP(t40a + t41a);
+ int t41 = CLIP(t40a - t41a);
+ int t42 = CLIP(t43a - t42a);
+ int t43 = CLIP(t43a + t42a);
+ int t44 = CLIP(t44a + t45a);
+ int t45 = CLIP(t44a - t45a);
+ int t46 = CLIP(t47a - t46a);
+ int t47 = CLIP(t47a + t46a);
+ int t48 = CLIP(t48a + t49a);
+ int t49 = CLIP(t48a - t49a);
+ int t50 = CLIP(t51a - t50a);
+ int t51 = CLIP(t51a + t50a);
+ int t52 = CLIP(t52a + t53a);
+ int t53 = CLIP(t52a - t53a);
+ int t54 = CLIP(t55a - t54a);
+ int t55 = CLIP(t55a + t54a);
+ int t56 = CLIP(t56a + t57a);
+ int t57 = CLIP(t56a - t57a);
+ int t58 = CLIP(t59a - t58a);
+ int t59 = CLIP(t59a + t58a);
+ int t60 = CLIP(t60a + t61a);
+ int t61 = CLIP(t60a - t61a);
+ int t62 = CLIP(t63a - t62a);
+ int t63 = CLIP(t63a + t62a);
t33a = (t33 * -4076 + t62 * 401 + 2048) >> 12;
t34a = (t34 * - 401 + t61 * -4076 + 2048) >> 12;
@@ -406,38 +413,38 @@
t61a = (t34 * -4076 + t61 * 401 + 2048) >> 12;
t62a = (t33 * 401 + t62 * 4076 + 2048) >> 12;
- t32a = t32 + t35;
- t33 = t33a + t34a;
- t34 = t33a - t34a;
- t35a = t32 - t35;
- t36a = t39 - t36;
- t37 = t38a - t37a;
- t38 = t38a + t37a;
- t39a = t39 + t36;
- t40a = t40 + t43;
- t41 = t41a + t42a;
- t42 = t41a - t42a;
- t43a = t40 - t43;
- t44a = t47 - t44;
- t45 = t46a - t45a;
- t46 = t46a + t45a;
- t47a = t47 + t44;
- t48a = t48 + t51;
- t49 = t49a + t50a;
- t50 = t49a - t50a;
- t51a = t48 - t51;
- t52a = t55 - t52;
- t53 = t54a - t53a;
- t54 = t54a + t53a;
- t55a = t55 + t52;
- t56a = t56 + t59;
- t57 = t57a + t58a;
- t58 = t57a - t58a;
- t59a = t56 - t59;
- t60a = t63 - t60;
- t61 = t62a - t61a;
- t62 = t62a + t61a;
- t63a = t63 + t60;
+ t32a = CLIP(t32 + t35);
+ t33 = CLIP(t33a + t34a);
+ t34 = CLIP(t33a - t34a);
+ t35a = CLIP(t32 - t35);
+ t36a = CLIP(t39 - t36);
+ t37 = CLIP(t38a - t37a);
+ t38 = CLIP(t38a + t37a);
+ t39a = CLIP(t39 + t36);
+ t40a = CLIP(t40 + t43);
+ t41 = CLIP(t41a + t42a);
+ t42 = CLIP(t41a - t42a);
+ t43a = CLIP(t40 - t43);
+ t44a = CLIP(t47 - t44);
+ t45 = CLIP(t46a - t45a);
+ t46 = CLIP(t46a + t45a);
+ t47a = CLIP(t47 + t44);
+ t48a = CLIP(t48 + t51);
+ t49 = CLIP(t49a + t50a);
+ t50 = CLIP(t49a - t50a);
+ t51a = CLIP(t48 - t51);
+ t52a = CLIP(t55 - t52);
+ t53 = CLIP(t54a - t53a);
+ t54 = CLIP(t54a + t53a);
+ t55a = CLIP(t55 + t52);
+ t56a = CLIP(t56 + t59);
+ t57 = CLIP(t57a + t58a);
+ t58 = CLIP(t57a - t58a);
+ t59a = CLIP(t56 - t59);
+ t60a = CLIP(t63 - t60);
+ t61 = CLIP(t62a - t61a);
+ t62 = CLIP(t62a + t61a);
+ t63a = CLIP(t63 + t60);
t34a = (t34 * -4017 + t61 * 799 + 2048) >> 12;
t35 = (t35a * -4017 + t60a * 799 + 2048) >> 12;
@@ -456,38 +463,38 @@
t60 = (t35a * 799 + t60a * 4017 + 2048) >> 12;
t61a = (t34 * 799 + t61 * 4017 + 2048) >> 12;
- t32 = t32a + t39a;
- t33a = t33 + t38;
- t34 = t34a + t37a;
- t35a = t35 + t36;
- t36a = t35 - t36;
- t37 = t34a - t37a;
- t38a = t33 - t38;
- t39 = t32a - t39a;
- t40 = t47a - t40a;
- t41a = t46 - t41;
- t42 = t45a - t42a;
- t43a = t44 - t43;
- t44a = t44 + t43;
- t45 = t45a + t42a;
- t46a = t46 + t41;
- t47 = t47a + t40a;
- t48 = t48a + t55a;
- t49a = t49 + t54;
- t50 = t50a + t53a;
- t51a = t51 + t52;
- t52a = t51 - t52;
- t53 = t50a - t53a;
- t54a = t49 - t54;
- t55 = t48a - t55a;
- t56 = t63a - t56a;
- t57a = t62 - t57;
- t58 = t61a - t58a;
- t59a = t60 - t59;
- t60a = t60 + t59;
- t61 = t61a + t58a;
- t62a = t62 + t57;
- t63 = t63a + t56a;
+ t32 = CLIP(t32a + t39a);
+ t33a = CLIP(t33 + t38);
+ t34 = CLIP(t34a + t37a);
+ t35a = CLIP(t35 + t36);
+ t36a = CLIP(t35 - t36);
+ t37 = CLIP(t34a - t37a);
+ t38a = CLIP(t33 - t38);
+ t39 = CLIP(t32a - t39a);
+ t40 = CLIP(t47a - t40a);
+ t41a = CLIP(t46 - t41);
+ t42 = CLIP(t45a - t42a);
+ t43a = CLIP(t44 - t43);
+ t44a = CLIP(t44 + t43);
+ t45 = CLIP(t45a + t42a);
+ t46a = CLIP(t46 + t41);
+ t47 = CLIP(t47a + t40a);
+ t48 = CLIP(t48a + t55a);
+ t49a = CLIP(t49 + t54);
+ t50 = CLIP(t50a + t53a);
+ t51a = CLIP(t51 + t52);
+ t52a = CLIP(t51 - t52);
+ t53 = CLIP(t50a - t53a);
+ t54a = CLIP(t49 - t54);
+ t55 = CLIP(t48a - t55a);
+ t56 = CLIP(t63a - t56a);
+ t57a = CLIP(t62 - t57);
+ t58 = CLIP(t61a - t58a);
+ t59a = CLIP(t60 - t59);
+ t60a = CLIP(t60 + t59);
+ t61 = CLIP(t61a + t58a);
+ t62a = CLIP(t62 + t57);
+ t63 = CLIP(t63a + t56a);
t36 = (t36a * -3784 + t59a * 1567 + 2048) >> 12;
t37a = (t37 * -3784 + t58 * 1567 + 2048) >> 12;
@@ -506,38 +513,38 @@
t58a = (t37 * 1567 + t58 * 3784 + 2048) >> 12;
t59 = (t36a * 1567 + t59a * 3784 + 2048) >> 12;
- t32a = t32 + t47;
- t33 = t33a + t46a;
- t34a = t34 + t45;
- t35 = t35a + t44a;
- t36a = t36 + t43;
- t37 = t37a + t42a;
- t38a = t38 + t41;
- t39 = t39a + t40a;
- t40 = t39a - t40a;
- t41a = t38 - t41;
- t42 = t37a - t42a;
- t43a = t36 - t43;
- t44 = t35a - t44a;
- t45a = t34 - t45;
- t46 = t33a - t46a;
- t47a = t32 - t47;
- t48a = t63 - t48;
- t49 = t62a - t49a;
- t50a = t61 - t50;
- t51 = t60a - t51a;
- t52a = t59 - t52;
- t53 = t58a - t53a;
- t54a = t57 - t54;
- t55 = t56a - t55a;
- t56 = t56a + t55a;
- t57a = t57 + t54;
- t58 = t58a + t53a;
- t59a = t59 + t52;
- t60 = t60a + t51a;
- t61a = t61 + t50;
- t62 = t62a + t49a;
- t63a = t63 + t48;
+ t32a = CLIP(t32 + t47);
+ t33 = CLIP(t33a + t46a);
+ t34a = CLIP(t34 + t45);
+ t35 = CLIP(t35a + t44a);
+ t36a = CLIP(t36 + t43);
+ t37 = CLIP(t37a + t42a);
+ t38a = CLIP(t38 + t41);
+ t39 = CLIP(t39a + t40a);
+ t40 = CLIP(t39a - t40a);
+ t41a = CLIP(t38 - t41);
+ t42 = CLIP(t37a - t42a);
+ t43a = CLIP(t36 - t43);
+ t44 = CLIP(t35a - t44a);
+ t45a = CLIP(t34 - t45);
+ t46 = CLIP(t33a - t46a);
+ t47a = CLIP(t32 - t47);
+ t48a = CLIP(t63 - t48);
+ t49 = CLIP(t62a - t49a);
+ t50a = CLIP(t61 - t50);
+ t51 = CLIP(t60a - t51a);
+ t52a = CLIP(t59 - t52);
+ t53 = CLIP(t58a - t53a);
+ t54a = CLIP(t57 - t54);
+ t55 = CLIP(t56a - t55a);
+ t56 = CLIP(t56a + t55a);
+ t57a = CLIP(t57 + t54);
+ t58 = CLIP(t58a + t53a);
+ t59a = CLIP(t59 + t52);
+ t60 = CLIP(t60a + t51a);
+ t61a = CLIP(t61 + t50);
+ t62 = CLIP(t62a + t49a);
+ t63a = CLIP(t63 + t48);
t40a = (t40 * -2896 + t55 * 2896 + 2048) >> 12;
t41 = (t41a * -2896 + t54a * 2896 + 2048) >> 12;
@@ -556,75 +563,75 @@
t54 = (t41a * 2896 + t54a * 2896 + 2048) >> 12;
t55a = (t40 * 2896 + t55 * 2896 + 2048) >> 12;
- out[ 0 * out_s] = tmp[ 0] + t63a;
- out[ 1 * out_s] = tmp[ 1] + t62;
- out[ 2 * out_s] = tmp[ 2] + t61a;
- out[ 3 * out_s] = tmp[ 3] + t60;
- out[ 4 * out_s] = tmp[ 4] + t59a;
- out[ 5 * out_s] = tmp[ 5] + t58;
- out[ 6 * out_s] = tmp[ 6] + t57a;
- out[ 7 * out_s] = tmp[ 7] + t56;
- out[ 8 * out_s] = tmp[ 8] + t55a;
- out[ 9 * out_s] = tmp[ 9] + t54;
- out[10 * out_s] = tmp[10] + t53a;
- out[11 * out_s] = tmp[11] + t52;
- out[12 * out_s] = tmp[12] + t51a;
- out[13 * out_s] = tmp[13] + t50;
- out[14 * out_s] = tmp[14] + t49a;
- out[15 * out_s] = tmp[15] + t48;
- out[16 * out_s] = tmp[16] + t47;
- out[17 * out_s] = tmp[17] + t46a;
- out[18 * out_s] = tmp[18] + t45;
- out[19 * out_s] = tmp[19] + t44a;
- out[20 * out_s] = tmp[20] + t43;
- out[21 * out_s] = tmp[21] + t42a;
- out[22 * out_s] = tmp[22] + t41;
- out[23 * out_s] = tmp[23] + t40a;
- out[24 * out_s] = tmp[24] + t39;
- out[25 * out_s] = tmp[25] + t38a;
- out[26 * out_s] = tmp[26] + t37;
- out[27 * out_s] = tmp[27] + t36a;
- out[28 * out_s] = tmp[28] + t35;
- out[29 * out_s] = tmp[29] + t34a;
- out[30 * out_s] = tmp[30] + t33;
- out[31 * out_s] = tmp[31] + t32a;
- out[32 * out_s] = tmp[31] - t32a;
- out[33 * out_s] = tmp[30] - t33;
- out[34 * out_s] = tmp[29] - t34a;
- out[35 * out_s] = tmp[28] - t35;
- out[36 * out_s] = tmp[27] - t36a;
- out[37 * out_s] = tmp[26] - t37;
- out[38 * out_s] = tmp[25] - t38a;
- out[39 * out_s] = tmp[24] - t39;
- out[40 * out_s] = tmp[23] - t40a;
- out[41 * out_s] = tmp[22] - t41;
- out[42 * out_s] = tmp[21] - t42a;
- out[43 * out_s] = tmp[20] - t43;
- out[44 * out_s] = tmp[19] - t44a;
- out[45 * out_s] = tmp[18] - t45;
- out[46 * out_s] = tmp[17] - t46a;
- out[47 * out_s] = tmp[16] - t47;
- out[48 * out_s] = tmp[15] - t48;
- out[49 * out_s] = tmp[14] - t49a;
- out[50 * out_s] = tmp[13] - t50;
- out[51 * out_s] = tmp[12] - t51a;
- out[52 * out_s] = tmp[11] - t52;
- out[53 * out_s] = tmp[10] - t53a;
- out[54 * out_s] = tmp[ 9] - t54;
- out[55 * out_s] = tmp[ 8] - t55a;
- out[56 * out_s] = tmp[ 7] - t56;
- out[57 * out_s] = tmp[ 6] - t57a;
- out[58 * out_s] = tmp[ 5] - t58;
- out[59 * out_s] = tmp[ 4] - t59a;
- out[60 * out_s] = tmp[ 3] - t60;
- out[61 * out_s] = tmp[ 2] - t61a;
- out[62 * out_s] = tmp[ 1] - t62;
- out[63 * out_s] = tmp[ 0] - t63a;
+ out[ 0 * out_s] = CLIP(tmp[ 0] + t63a);
+ out[ 1 * out_s] = CLIP(tmp[ 1] + t62);
+ out[ 2 * out_s] = CLIP(tmp[ 2] + t61a);
+ out[ 3 * out_s] = CLIP(tmp[ 3] + t60);
+ out[ 4 * out_s] = CLIP(tmp[ 4] + t59a);
+ out[ 5 * out_s] = CLIP(tmp[ 5] + t58);
+ out[ 6 * out_s] = CLIP(tmp[ 6] + t57a);
+ out[ 7 * out_s] = CLIP(tmp[ 7] + t56);
+ out[ 8 * out_s] = CLIP(tmp[ 8] + t55a);
+ out[ 9 * out_s] = CLIP(tmp[ 9] + t54);
+ out[10 * out_s] = CLIP(tmp[10] + t53a);
+ out[11 * out_s] = CLIP(tmp[11] + t52);
+ out[12 * out_s] = CLIP(tmp[12] + t51a);
+ out[13 * out_s] = CLIP(tmp[13] + t50);
+ out[14 * out_s] = CLIP(tmp[14] + t49a);
+ out[15 * out_s] = CLIP(tmp[15] + t48);
+ out[16 * out_s] = CLIP(tmp[16] + t47);
+ out[17 * out_s] = CLIP(tmp[17] + t46a);
+ out[18 * out_s] = CLIP(tmp[18] + t45);
+ out[19 * out_s] = CLIP(tmp[19] + t44a);
+ out[20 * out_s] = CLIP(tmp[20] + t43);
+ out[21 * out_s] = CLIP(tmp[21] + t42a);
+ out[22 * out_s] = CLIP(tmp[22] + t41);
+ out[23 * out_s] = CLIP(tmp[23] + t40a);
+ out[24 * out_s] = CLIP(tmp[24] + t39);
+ out[25 * out_s] = CLIP(tmp[25] + t38a);
+ out[26 * out_s] = CLIP(tmp[26] + t37);
+ out[27 * out_s] = CLIP(tmp[27] + t36a);
+ out[28 * out_s] = CLIP(tmp[28] + t35);
+ out[29 * out_s] = CLIP(tmp[29] + t34a);
+ out[30 * out_s] = CLIP(tmp[30] + t33);
+ out[31 * out_s] = CLIP(tmp[31] + t32a);
+ out[32 * out_s] = CLIP(tmp[31] - t32a);
+ out[33 * out_s] = CLIP(tmp[30] - t33);
+ out[34 * out_s] = CLIP(tmp[29] - t34a);
+ out[35 * out_s] = CLIP(tmp[28] - t35);
+ out[36 * out_s] = CLIP(tmp[27] - t36a);
+ out[37 * out_s] = CLIP(tmp[26] - t37);
+ out[38 * out_s] = CLIP(tmp[25] - t38a);
+ out[39 * out_s] = CLIP(tmp[24] - t39);
+ out[40 * out_s] = CLIP(tmp[23] - t40a);
+ out[41 * out_s] = CLIP(tmp[22] - t41);
+ out[42 * out_s] = CLIP(tmp[21] - t42a);
+ out[43 * out_s] = CLIP(tmp[20] - t43);
+ out[44 * out_s] = CLIP(tmp[19] - t44a);
+ out[45 * out_s] = CLIP(tmp[18] - t45);
+ out[46 * out_s] = CLIP(tmp[17] - t46a);
+ out[47 * out_s] = CLIP(tmp[16] - t47);
+ out[48 * out_s] = CLIP(tmp[15] - t48);
+ out[49 * out_s] = CLIP(tmp[14] - t49a);
+ out[50 * out_s] = CLIP(tmp[13] - t50);
+ out[51 * out_s] = CLIP(tmp[12] - t51a);
+ out[52 * out_s] = CLIP(tmp[11] - t52);
+ out[53 * out_s] = CLIP(tmp[10] - t53a);
+ out[54 * out_s] = CLIP(tmp[ 9] - t54);
+ out[55 * out_s] = CLIP(tmp[ 8] - t55a);
+ out[56 * out_s] = CLIP(tmp[ 7] - t56);
+ out[57 * out_s] = CLIP(tmp[ 6] - t57a);
+ out[58 * out_s] = CLIP(tmp[ 5] - t58);
+ out[59 * out_s] = CLIP(tmp[ 4] - t59a);
+ out[60 * out_s] = CLIP(tmp[ 3] - t60);
+ out[61 * out_s] = CLIP(tmp[ 2] - t61a);
+ out[62 * out_s] = CLIP(tmp[ 1] - t62);
+ out[63 * out_s] = CLIP(tmp[ 0] - t63a);
}
static void NOINLINE
inv_adst4_1d(const coef *const in, const ptrdiff_t in_s,
- coef *const out, const ptrdiff_t out_s)
+ coef *const out, const ptrdiff_t out_s, const int range)
{
const int in0 = in[0 * in_s], in1 = in[1 * in_s];
const int in2 = in[2 * in_s], in3 = in[3 * in_s];
@@ -642,8 +649,9 @@
static void NOINLINE
inv_adst8_1d(const coef *const in, const ptrdiff_t in_s,
- coef *const out, const ptrdiff_t out_s)
+ coef *const out, const ptrdiff_t out_s, const int max)
{
+ const int min = -max - 1;
const int in0 = in[0 * in_s], in1 = in[1 * in_s];
const int in2 = in[2 * in_s], in3 = in[3 * in_s];
const int in4 = in[4 * in_s], in5 = in[5 * in_s];
@@ -658,14 +666,14 @@
int t6a = (1189 * in1 + 3920 * in6 + 2048) >> 12;
int t7a = (3920 * in1 - 1189 * in6 + 2048) >> 12;
- int t0 = t0a + t4a;
- int t1 = t1a + t5a;
- int t2 = t2a + t6a;
- int t3 = t3a + t7a;
- int t4 = t0a - t4a;
- int t5 = t1a - t5a;
- int t6 = t2a - t6a;
- int t7 = t3a - t7a;
+ int t0 = CLIP(t0a + t4a);
+ int t1 = CLIP(t1a + t5a);
+ int t2 = CLIP(t2a + t6a);
+ int t3 = CLIP(t3a + t7a);
+ int t4 = CLIP(t0a - t4a);
+ int t5 = CLIP(t1a - t5a);
+ int t6 = CLIP(t2a - t6a);
+ int t7 = CLIP(t3a - t7a);
t4a = (3784 * t4 + 1567 * t5 + 2048) >> 12;
t5a = (1567 * t4 - 3784 * t5 + 2048) >> 12;
@@ -672,15 +680,15 @@
t6a = (3784 * t7 - 1567 * t6 + 2048) >> 12;
t7a = (1567 * t7 + 3784 * t6 + 2048) >> 12;
- out[0 * out_s] = t0 + t2;
- out[7 * out_s] = -(t1 + t3);
- t2 = t0 - t2;
- t3 = t1 - t3;
+ out[0 * out_s] = CLIP( t0 + t2);
+ out[7 * out_s] = CLIP(-(t1 + t3));
+ t2 = CLIP( t0 - t2);
+ t3 = CLIP( t1 - t3);
- out[1 * out_s] = -(t4a + t6a);
- out[6 * out_s] = t5a + t7a;
- t6 = t4a - t6a;
- t7 = t5a - t7a;
+ out[1 * out_s] = CLIP(-(t4a + t6a));
+ out[6 * out_s] = CLIP( t5a + t7a );
+ t6 = CLIP( t4a - t6a );
+ t7 = CLIP( t5a - t7a );
out[3 * out_s] = -(((t2 + t3) * 2896 + 2048) >> 12);
out[4 * out_s] = ((t2 - t3) * 2896 + 2048) >> 12;
@@ -690,8 +698,9 @@
static void NOINLINE
inv_adst16_1d(const coef *const in, const ptrdiff_t in_s,
- coef *const out, const ptrdiff_t out_s)
+ coef *const out, const ptrdiff_t out_s, const int max)
{
+ const int min = -max - 1;
const int in0 = in[ 0 * in_s], in1 = in[ 1 * in_s];
const int in2 = in[ 2 * in_s], in3 = in[ 3 * in_s];
const int in4 = in[ 4 * in_s], in5 = in[ 5 * in_s];
@@ -718,22 +727,22 @@
int t14 = (in1 * 601 + in14 * 4052 + 2048) >> 12;
int t15 = (in1 * 4052 - in14 * 601 + 2048) >> 12;
- int t0a = t0 + t8;
- int t1a = t1 + t9;
- int t2a = t2 + t10;
- int t3a = t3 + t11;
- int t4a = t4 + t12;
- int t5a = t5 + t13;
- int t6a = t6 + t14;
- int t7a = t7 + t15;
- int t8a = t0 - t8;
- int t9a = t1 - t9;
- int t10a = t2 - t10;
- int t11a = t3 - t11;
- int t12a = t4 - t12;
- int t13a = t5 - t13;
- int t14a = t6 - t14;
- int t15a = t7 - t15;
+ int t0a = CLIP(t0 + t8 );
+ int t1a = CLIP(t1 + t9 );
+ int t2a = CLIP(t2 + t10);
+ int t3a = CLIP(t3 + t11);
+ int t4a = CLIP(t4 + t12);
+ int t5a = CLIP(t5 + t13);
+ int t6a = CLIP(t6 + t14);
+ int t7a = CLIP(t7 + t15);
+ int t8a = CLIP(t0 - t8 );
+ int t9a = CLIP(t1 - t9 );
+ int t10a = CLIP(t2 - t10);
+ int t11a = CLIP(t3 - t11);
+ int t12a = CLIP(t4 - t12);
+ int t13a = CLIP(t5 - t13);
+ int t14a = CLIP(t6 - t14);
+ int t15a = CLIP(t7 - t15);
t8 = (t8a * 4017 + t9a * 799 + 2048) >> 12;
t9 = (t8a * 799 - t9a * 4017 + 2048) >> 12;
@@ -744,22 +753,22 @@
t14 = (t15a * 2276 - t14a * 3406 + 2048) >> 12;
t15 = (t15a * 3406 + t14a * 2276 + 2048) >> 12;
- t0 = t0a + t4a;
- t1 = t1a + t5a;
- t2 = t2a + t6a;
- t3 = t3a + t7a;
- t4 = t0a - t4a;
- t5 = t1a - t5a;
- t6 = t2a - t6a;
- t7 = t3a - t7a;
- t8a = t8 + t12;
- t9a = t9 + t13;
- t10a = t10 + t14;
- t11a = t11 + t15;
- t12a = t8 - t12;
- t13a = t9 - t13;
- t14a = t10 - t14;
- t15a = t11 - t15;
+ t0 = CLIP(t0a + t4a);
+ t1 = CLIP(t1a + t5a);
+ t2 = CLIP(t2a + t6a);
+ t3 = CLIP(t3a + t7a);
+ t4 = CLIP(t0a - t4a);
+ t5 = CLIP(t1a - t5a);
+ t6 = CLIP(t2a - t6a);
+ t7 = CLIP(t3a - t7a);
+ t8a = CLIP(t8 + t12);
+ t9a = CLIP(t9 + t13);
+ t10a = CLIP(t10 + t14);
+ t11a = CLIP(t11 + t15);
+ t12a = CLIP(t8 - t12);
+ t13a = CLIP(t9 - t13);
+ t14a = CLIP(t10 - t14);
+ t15a = CLIP(t11 - t15);
t4a = (t4 * 3784 + t5 * 1567 + 2048) >> 12;
t5a = (t4 * 1567 - t5 * 3784 + 2048) >> 12;
@@ -770,22 +779,22 @@
t14 = (t15a * 3784 - t14a * 1567 + 2048) >> 12;
t15 = (t15a * 1567 + t14a * 3784 + 2048) >> 12;
- out[ 0 * out_s] = t0 + t2;
- out[15 * out_s] = -(t1 + t3);
- t2a = t0 - t2;
- t3a = t1 - t3;
- out[ 3 * out_s] = -(t4a + t6a);
- out[12 * out_s] = t5a + t7a;
- t6 = t4a - t6a;
- t7 = t5a - t7a;
- out[ 1 * out_s] = -(t8a + t10a);
- out[14 * out_s] = t9a + t11a;
- t10 = t8a - t10a;
- t11 = t9a - t11a;
- out[ 2 * out_s] = t12 + t14;
- out[13 * out_s] = -(t13 + t15);
- t14a = t12 - t14;
- t15a = t13 - t15;
+ out[ 0 * out_s] = CLIP( t0 + t2 );
+ out[15 * out_s] = CLIP(-(t1 + t3) );
+ t2a = CLIP( t0 - t2 );
+ t3a = CLIP( t1 - t3 );
+ out[ 3 * out_s] = CLIP(-(t4a + t6a) );
+ out[12 * out_s] = CLIP( t5a + t7a );
+ t6 = CLIP( t4a - t6a );
+ t7 = CLIP( t5a - t7a );
+ out[ 1 * out_s] = CLIP(-(t8a + t10a));
+ out[14 * out_s] = CLIP( t9a + t11a );
+ t10 = CLIP( t8a - t10a );
+ t11 = CLIP( t9a - t11a );
+ out[ 2 * out_s] = CLIP( t12 + t14 );
+ out[13 * out_s] = CLIP(-(t13 + t15) );
+ t14a = CLIP( t12 - t14 );
+ t15a = CLIP( t13 - t15 );
out[ 7 * out_s] = -(((t2a + t3a) * 2896 + 2048) >> 12);
out[ 8 * out_s] = ((t2a - t3a) * 2896 + 2048) >> 12;
@@ -799,9 +808,9 @@
#define flip_inv_adst(sz) \
static void inv_flipadst##sz##_1d(const coef *const in, const ptrdiff_t in_s, \
- coef *const out, const ptrdiff_t out_s) \
+ coef *const out, const ptrdiff_t out_s, const int range) \
{ \
- inv_adst##sz##_1d(in, in_s, &out[(sz - 1) * out_s], -out_s); \
+ inv_adst##sz##_1d(in, in_s, &out[(sz - 1) * out_s], -out_s, range); \
}
flip_inv_adst(4)
@@ -812,7 +821,7 @@
static void NOINLINE
inv_identity4_1d(const coef *const in, const ptrdiff_t in_s,
- coef *const out, const ptrdiff_t out_s)
+ coef *const out, const ptrdiff_t out_s, const int range)
{
for (int i = 0; i < 4; i++)
out[out_s * i] = (in[in_s * i] * 5793 + 2048) >> 12;
@@ -820,7 +829,7 @@
static void NOINLINE
inv_identity8_1d(const coef *const in, const ptrdiff_t in_s,
- coef *const out, const ptrdiff_t out_s)
+ coef *const out, const ptrdiff_t out_s, const int range)
{
for (int i = 0; i < 8; i++)
out[out_s * i] = in[in_s * i] * 2;
@@ -828,7 +837,7 @@
static void NOINLINE
inv_identity16_1d(const coef *const in, const ptrdiff_t in_s,
- coef *const out, const ptrdiff_t out_s)
+ coef *const out, const ptrdiff_t out_s, const int range)
{
for (int i = 0; i < 16; i++)
out[out_s * i] = (in[in_s * i] * 2 * 5793 + 2048) >> 12;
@@ -836,7 +845,7 @@
static void NOINLINE
inv_identity32_1d(const coef *const in, const ptrdiff_t in_s,
- coef *const out, const ptrdiff_t out_s)
+ coef *const out, const ptrdiff_t out_s, const int range)
{
for (int i = 0; i < 32; i++)
out[out_s * i] = in[in_s * i] * 4;
--- a/src/itx_tmpl.c
+++ b/src/itx_tmpl.c
@@ -40,7 +40,7 @@
#include "src/itx_1d.c"
typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
- coef *out, ptrdiff_t out_s);
+ coef *out, ptrdiff_t out_s, const int range);
static void NOINLINE
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
@@ -54,6 +54,9 @@
// Maximum value for h and w is 64
coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
const int is_rect2 = w * 2 == h || h * 2 == w;
+ const int row_clip_max = (1 << (BITDEPTH + 8 - 1)) - 1;
+ const int col_clip_max = (1 << (imax(BITDEPTH + 6, 16) - 1)) -1;
+ const int col_clip_min = -col_clip_max - 1;
if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
const int rnd1 = (1 << shift1) >> 1;
@@ -64,18 +67,19 @@
if (is_rect2)
in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
}
- first_1d_fn(in_mem, 1, &tmp[i * w], 1);
+ first_1d_fn(in_mem, 1, &tmp[i * w], 1, row_clip_max);
} else {
- first_1d_fn(&coeff[i], sh, &tmp[i * w], 1);
+ first_1d_fn(&coeff[i], sh, &tmp[i * w], 1, row_clip_max);
}
for (j = 0; j < w; j++)
- tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
+ tmp[i * w + j] = iclip((tmp[i * w + j] + (rnd1)) >> shift1,
+ col_clip_min, col_clip_max);
}
if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
const int rnd2 = (1 << shift2) >> 1;
for (i = 0; i < w; i++) {
- second_1d_fn(&tmp[i], w, out, 1);
+ second_1d_fn(&tmp[i], w, out, 1, col_clip_max);
for (j = 0; j < h; j++)
dst[i + j * PXSTRIDE(stride)] =
iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
@@ -145,15 +149,18 @@
static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
coef *const coeff, const int eob)
{
- int i, j;
+ const int col_clip_max = (1 << (imax(BITDEPTH + 6, 16) - 1)) -1;
+ const int col_clip_min = -col_clip_max - 1;
coef tmp[4 * 4], out[4];
- for (i = 0; i < 4; i++)
+ for (int i = 0; i < 4; i++)
inv_wht4_1d(&coeff[i], 4, &tmp[i * 4], 1, 0);
+ for (int k = 0; k < 4 * 4; k++)
+ tmp[k] = iclip(tmp[k], col_clip_min, col_clip_max);
- for (i = 0; i < 4; i++) {
+ for (int i = 0; i < 4; i++) {
inv_wht4_1d(&tmp[i], 4, out, 1, 1);
- for (j = 0; j < 4; j++)
+ for (int j = 0; j < 4; j++)
dst[i + j * PXSTRIDE(stride)] =
iclip_pixel(dst[i + j * PXSTRIDE(stride)] + out[j]);
}