ref: a755b6e3d3ea865e43f4d033b8c66e42cd559d15
parent: eb01bdb9763f3c1990d748682cc5b853fd05ca69
author: Henrik Gramner <gramner@twoorioles.com>
date: Sat Dec 15 14:02:07 EST 2018
Clip coefficients in SSSE3/AVX2 inverse transform asm
--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -231,10 +231,10 @@
psubw m%1, m%3
pmulhrsw m%1, m%6 ; t1
pmulhrsw m%5, m%6 ; t0
- psubw m%3, m%1, m%2
- paddw m%2, m%1
- paddw m%1, m%5, m%4
- psubw m%4, m%5, m%4
+ psubsw m%3, m%1, m%2
+ paddsw m%2, m%1
+ paddsw m%1, m%5, m%4
+ psubsw m%4, m%5, m%4
%endmacro
%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
@@ -241,10 +241,10 @@
ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a
ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
- paddw m%9, m%2, m%6 ; t4
- psubw m%2, m%6 ; t5a
- paddw m%10, m%8, m%4 ; t7
- psubw m%8, m%4 ; t6a
+ paddsw m%9, m%2, m%6 ; t4
+ psubsw m%2, m%6 ; t5a
+ paddsw m%10, m%8, m%4 ; t7
+ psubsw m%8, m%4 ; t6a
vpbroadcastd m%4, [o(pw_2896x8)]
psubw m%6, m%1, m%5
paddw m%1, m%5
@@ -254,18 +254,18 @@
pmulhrsw m%6, m%4 ; t1
pmulhrsw m%8, m%4 ; t6
pmulhrsw m%5, m%4 ; t5
- psubw m%4, m%1, m%7 ; dct4 out3
- paddw m%1, m%7 ; dct4 out0
- paddw m%7, m%6, m%3 ; dct4 out1
- psubw m%6, m%3 ; dct4 out2
- paddw m%2, m%7, m%8 ; out1
- psubw m%7, m%8 ; out6
- psubw m%8, m%1, m%10 ; out7
- paddw m%1, m%10 ; out0
- paddw m%3, m%6, m%5 ; out2
- psubw m%6, m%5 ; out5
- psubw m%5, m%4, m%9 ; out4
- paddw m%4, m%9 ; out3
+ psubsw m%4, m%1, m%7 ; dct4 out3
+ paddsw m%1, m%7 ; dct4 out0
+ paddsw m%7, m%6, m%3 ; dct4 out1
+ psubsw m%6, m%3 ; dct4 out2
+ paddsw m%2, m%7, m%8 ; out1
+ psubsw m%7, m%8 ; out6
+ psubsw m%8, m%1, m%10 ; out7
+ paddsw m%1, m%10 ; out0
+ paddsw m%3, m%6, m%5 ; out2
+ psubsw m%6, m%5 ; out5
+ psubsw m%5, m%4, m%9 ; out4
+ paddsw m%4, m%9 ; out3
%endmacro
; in1 = %1, in3 = %2, in5 = %3, in7 = %4
@@ -275,25 +275,25 @@
ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a
ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
- psubw m%9, m%2, m%6 ; t13
- paddw m%6, m%2 ; t12
- psubw m%2, m%8, m%4 ; t14
- paddw m%8, m%4 ; t15
- psubw m%4, m%7, m%3 ; t10
- paddw m%3, m%7 ; t11
- psubw m%7, m%1, m%5 ; t9
- paddw m%1, m%5 ; t8
+ psubsw m%9, m%2, m%6 ; t13
+ paddsw m%6, m%2 ; t12
+ psubsw m%2, m%8, m%4 ; t14
+ paddsw m%8, m%4 ; t15
+ psubsw m%4, m%7, m%3 ; t10
+ paddsw m%3, m%7 ; t11
+ psubsw m%7, m%1, m%5 ; t9
+ paddsw m%1, m%5 ; t8
ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a
ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
vpbroadcastd m%10, [o(pw_2896x8)]
- psubw m%5, m%2, m%9 ; t10
- paddw m%2, m%9 ; t9
- psubw m%9, m%1, m%3 ; t11a
- paddw m%1, m%3 ; t8a
- psubw m%3, m%7, m%4 ; t13
- paddw m%7, m%4 ; t14
- psubw m%4, m%8, m%6 ; t12a
- paddw m%8, m%6 ; t15a
+ psubsw m%5, m%2, m%9 ; t10
+ paddsw m%2, m%9 ; t9
+ psubsw m%9, m%1, m%3 ; t11a
+ paddsw m%1, m%3 ; t8a
+ psubsw m%3, m%7, m%4 ; t13
+ paddsw m%7, m%4 ; t14
+ psubsw m%4, m%8, m%6 ; t12a
+ paddsw m%8, m%6 ; t15a
paddw m%6, m%3, m%5 ; t13a
psubw m%3, m%5 ; t10a
paddw m%5, m%4, m%9 ; t12
@@ -458,8 +458,8 @@
vpbroadcastd m4, [o(pw_2896x8)]
pmulhrsw m0, m4 ; t0 t1
%endif
- psubw m1, m0, m2 ; out3 out2
- paddw m0, m2 ; out0 out1
+ psubsw m1, m0, m2 ; out3 out2
+ paddsw m0, m2 ; out0 out1
%endmacro
%macro IADST4_1D_PACKED 0
@@ -693,8 +693,8 @@
ITX_MUL2X_PACK 4, 1, 2, 6, 3406, 2276, 1 ; t5a t6a
ITX_MUL2X_PACK 3, 1, 2, 6, 1567, 3784 ; t3 t2
vpbroadcastd m6, [o(pw_2896x8)]
- psubw m2, m5, m4 ; t4 t7
- paddw m5, m4 ; t5a t6a
+ psubsw m2, m5, m4 ; t5a t6a
+ paddsw m5, m4 ; t4 t7
pshufd m4, m2, q1032
psubw m1, m2, m4
paddw m4, m2
@@ -701,14 +701,14 @@
vpblendd m4, m4, m1, 0xcc
pmulhrsw m0, m6 ; t0 t1
pmulhrsw m4, m6 ; t6 t5
- psubw m1, m0, m3 ; tmp3 tmp2
- paddw m0, m3 ; tmp0 tmp1
+ psubsw m1, m0, m3 ; tmp3 tmp2
+ paddsw m0, m3 ; tmp0 tmp1
shufps m2, m5, m4, q1032 ; t7 t6
vpblendd m5, m5, m4, 0xcc ; t4 t5
- psubw m3, m0, m2 ; out7 out6
- paddw m0, m2 ; out0 out1
- psubw m2, m1, m5 ; out4 out5
- paddw m1, m5 ; out3 out2
+ psubsw m3, m0, m2 ; out7 out6
+ paddsw m0, m2 ; out0 out1
+ psubsw m2, m1, m5 ; out4 out5
+ paddsw m1, m5 ; out3 out2
%endmacro
%macro IADST8_1D_PACKED 0
@@ -721,19 +721,19 @@
ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a
- psubw m4, m0, m2 ; t4 t5
- paddw m0, m2 ; t0 t1
- psubw m5, m1, m3 ; t6 t7
- paddw m1, m3 ; t2 t3
+ psubsw m4, m0, m2 ; t4 t5
+ paddsw m0, m2 ; t0 t1
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
shufps m2, m5, m4, q1032
punpckhwd m4, m2
punpcklwd m5, m2
ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a
- psubw m2, m0, m1 ; t2 t3
- paddw m0, m1 ; out0 -out7
- psubw m1, m4, m5 ; t7 t6
- paddw m4, m5 ; out6 -out1
+ psubsw m2, m0, m1 ; t2 t3
+ paddsw m0, m1 ; out0 -out7
+ psubsw m1, m4, m5 ; t7 t6
+ paddsw m4, m5 ; out6 -out1
vpbroadcastd m5, [o(pw_2896x8)]
vpblendd m3, m0, m4, 0x33 ; out6 -out7
vpblendd m0, m0, m4, 0xcc ; out0 -out1
@@ -981,10 +981,10 @@
ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 1 ; t4a t7a
ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 1 ; t5a t6a
ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2
- psubw m2, m8, m0 ; t9 t14
- paddw m8, m0 ; t8 t15
- psubw m0, m1, m5 ; t10 t13
- paddw m1, m5 ; t11 t12
+ psubsw m2, m8, m0 ; t9 t14
+ paddsw m8, m0 ; t8 t15
+ psubsw m0, m1, m5 ; t10 t13
+ paddsw m1, m5 ; t11 t12
%if mmsize > 16
vbroadcasti128 m5, [o(deint_shuf)]
%else
@@ -996,12 +996,12 @@
ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 4 ; t9a t14a
vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 4 ; t10a t13a
- psubw m5, m7, m3 ; t5a t6a
- paddw m7, m3 ; t4 t7
- psubw m4, m8, m1 ; t11a t12a
- paddw m8, m1 ; t8a t15a
- paddw m1, m2, m0 ; t9 t14
- psubw m2, m0 ; t10 t13
+ psubsw m5, m7, m3 ; t5a t6a
+ paddsw m7, m3 ; t4 t7
+ psubsw m4, m8, m1 ; t11a t12a
+ paddsw m8, m1 ; t8a t15a
+ paddsw m1, m2, m0 ; t9 t14
+ psubsw m2, m0 ; t10 t13
punpckhqdq m0, m8, m1 ; t15a t14
punpcklqdq m8, m1 ; t8a t9
pshufd m3, m5, q1032
@@ -1019,20 +1019,20 @@
pmulhrsw m5, m1 ; t12 t13a
shufps m2, m7, m3, q1032 ; t7 t6
vpblendd m7, m7, m3, 0xcc ; t4 t5
- psubw m1, m9, m6 ; dct4 out3 out2
- paddw m9, m6 ; dct4 out0 out1
- psubw m3, m9, m2 ; dct8 out7 out6
- paddw m9, m2 ; dct8 out0 out1
- psubw m2, m1, m7 ; dct8 out4 out5
- paddw m1, m7 ; dct8 out3 out2
- psubw m7, m9, m0 ; out15 out14
- paddw m0, m9 ; out0 out1
- psubw m6, m1, m5 ; out12 out13
- paddw m1, m5 ; out3 out2
- psubw m5, m2, m4 ; out11 out10
- paddw m2, m4 ; out4 out5
- psubw m4, m3, m8 ; out8 out9
- paddw m3, m8 ; out7 out6
+ psubsw m1, m9, m6 ; dct4 out3 out2
+ paddsw m9, m6 ; dct4 out0 out1
+ psubsw m3, m9, m2 ; dct8 out7 out6
+ paddsw m9, m2 ; dct8 out0 out1
+ psubsw m2, m1, m7 ; dct8 out4 out5
+ paddsw m1, m7 ; dct8 out3 out2
+ psubsw m7, m9, m0 ; out15 out14
+ paddsw m0, m9 ; out0 out1
+ psubsw m6, m1, m5 ; out12 out13
+ paddsw m1, m5 ; out3 out2
+ psubsw m5, m2, m4 ; out11 out10
+ paddsw m2, m4 ; out4 out5
+ psubsw m4, m3, m8 ; out8 out9
+ paddsw m3, m8 ; out7 out6
%endmacro
INV_TXFM_4X16_FN dct, dct, 0
@@ -1153,20 +1153,20 @@
ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3
- psubw m2, m0, m3 ; t9a t8a t11a t10a
- paddw m0, m3 ; t1a t0a t3a t2a
- psubw m3, m1, m4 ; t13a t12a t15a t14a
- paddw m1, m4 ; t5a t4a t7a t6a
+ psubsw m2, m0, m3 ; t9a t8a t11a t10a
+ paddsw m0, m3 ; t1a t0a t3a t2a
+ psubsw m3, m1, m4 ; t13a t12a t15a t14a
+ paddsw m1, m4 ; t5a t4a t7a t6a
ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3
psubw m6, m7, m5
ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6
vpbroadcastd m6, [o(pw_m3784_1567)]
vpbroadcastd m5, [o(pw_1567_3784)]
- psubw m4, m0, m1 ; t5 t4 t7 t6
- paddw m0, m1 ; t1 t0 t3 t2
- psubw m1, m2, m3 ; t13a t12a t15a t14a
- paddw m2, m3 ; t9a t8a t11a t10a
- psubw m3, m7, m6
+ psubsw m4, m0, m1 ; t5 t4 t7 t6
+ paddsw m0, m1 ; t1 t0 t3 t2
+ psubsw m1, m2, m3 ; t13a t12a t15a t14a
+ paddsw m2, m3 ; t9a t8a t11a t10a
+ psubw m3, m7, m6 ; pw_3784_m1567
vpblendd m6, m6, m3, 0xf0
ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
@@ -1179,10 +1179,10 @@
vinserti128 m4, m4, xm1, 1 ; t4a t5a t12 t13
vpbroadcastd m5, [o(pw_2896x8)]
pshufd m2, m2, q1032 ; t6a t7a t14 t15
- psubw m1, m0, m3 ; t3a t2a t11 t10
- paddw m0, m3 ; -out15 out0 out14 -out1
- paddw m3, m4, m2 ; -out3 out12 out2 -out13
- psubw m4, m2 ; t6 t7 t14a t15a
+ psubsw m1, m0, m3 ; t3a t2a t11 t10
+ paddsw m0, m3 ; -out15 out0 out14 -out1
+ paddsw m3, m4, m2 ; -out3 out12 out2 -out13
+ psubsw m4, m2 ; t6 t7 t14a t15a
shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a
vpblendd m4, m4, m1, 0x33 ; t3a t7 t11 t15a
paddw m1, m2, m4
@@ -1902,53 +1902,53 @@
ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15
- psubw m4, m0, m5 ; t9a t8a
- paddw m0, m5 ; t1a t0a
- psubw m5, m1, m6 ; t11a t10a
- paddw m1, m6 ; t3a t2a
- psubw m6, m2, m7 ; t13a t12a
- paddw m2, m7 ; t5a t4a
- psubw m7, m3, m8 ; t15a t14a
- paddw m3, m8 ; t7a t6a
+ psubsw m4, m0, m5 ; t9a t8a
+ paddsw m0, m5 ; t1a t0a
+ psubsw m5, m1, m6 ; t11a t10a
+ paddsw m1, m6 ; t3a t2a
+ psubsw m6, m2, m7 ; t13a t12a
+ paddsw m2, m7 ; t5a t4a
+ psubsw m7, m3, m8 ; t15a t14a
+ paddsw m3, m8 ; t7a t6a
vpbroadcastd m11, [o(pw_m4017_799)]
vpbroadcastd m12, [o(pw_799_4017)]
pxor m9, m9
ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9
- psubw m8, m9, m11
+ psubw m8, m9, m11 ; pw_4017_m799
ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13
vpbroadcastd m11, [o(pw_m2276_3406)]
vpbroadcastd m12, [o(pw_3406_2276)]
ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11
- psubw m8, m9, m11
+ psubw m8, m9, m11 ; pw_2276_m3406
ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15
- psubw m8, m1, m3 ; t7 t6
- paddw m1, m3 ; t3 t2
- psubw m3, m0, m2 ; t5 t4
- paddw m0, m2 ; t1 t0
- psubw m2, m5, m7 ; t14a t15a
- paddw m7, m5 ; t10a t11a
- psubw m5, m4, m6 ; t12a t13a
- paddw m4, m6 ; t8a t9a
+ psubsw m8, m1, m3 ; t7 t6
+ paddsw m1, m3 ; t3 t2
+ psubsw m3, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m2, m5, m7 ; t14a t15a
+ paddsw m7, m5 ; t10a t11a
+ psubsw m5, m4, m6 ; t12a t13a
+ paddsw m4, m6 ; t8a t9a
vpbroadcastd m11, [o(pw_m3784_1567)]
vpbroadcastd m12, [o(pw_1567_3784)]
ITX_MUL2X_PACK 3, 6, _, 10, 11, 12, 4 ; t4a t5a
- psubw m6, m9, m11
+ psubw m6, m9, m11 ; pw_3784_m1567
ITX_MUL2X_PACK 8, 12, _, 10, 12, 6, 4 ; t6a t7a
vpbroadcastd m11, [o(pw_m1567_3784)]
vpbroadcastd m12, [o(pw_3784_1567)]
ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 4 ; t15 t14
- psubw m6, m9, m11
+ psubw m6, m9, m11 ; pw_1567_m3784
ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 4 ; t13 t12
vbroadcasti128 m11, [o(deint_shuf)]
vpbroadcastd m12, [o(pw_2896x8)]
- psubw m6, m0, m1 ; t3a t2a
- paddw m0, m1 ; -out15 out0
- paddw m1, m2, m5 ; -out13 out2
- psubw m5, m2 ; t15a t14a
- paddw m2, m4, m7 ; -out1 out14
- psubw m4, m7 ; t10 t11
- psubw m7, m3, m8 ; t6 t7
- paddw m8, m3 ; -out3 out12
+ psubsw m6, m0, m1 ; t3a t2a
+ paddsw m0, m1 ; -out15 out0
+ paddsw m1, m2, m5 ; -out13 out2
+ psubsw m5, m2 ; t15a t14a
+ paddsw m2, m4, m7 ; -out1 out14
+ psubsw m4, m7 ; t10 t11
+ psubsw m7, m3, m8 ; t6 t7
+ paddsw m8, m3 ; -out3 out12
REPX {pshufb x, m11}, m6, m4, m0, m2
vpblendd m3, m6, m4, 0xcc ; t3a t11
shufps m6, m6, m4, q1032 ; t2a t10
@@ -2580,25 +2580,25 @@
ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
- psubw m8, m2, m6 ; t6
- paddw m2, m6 ; t2
- psubw m6, m0, m4 ; t4
- paddw m0, m4 ; t0
- psubw m4, m5, m1 ; t7
- paddw m5, m1 ; t3
- psubw m1, m7, m3 ; t5
- paddw m7, m3 ; t1
+ psubsw m8, m2, m6 ; t6
+ paddsw m2, m6 ; t2
+ psubsw m6, m0, m4 ; t4
+ paddsw m0, m4 ; t0
+ psubsw m4, m5, m1 ; t7
+ paddsw m5, m1 ; t3
+ psubsw m1, m7, m3 ; t5
+ paddsw m7, m3 ; t1
ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
- psubw m9, m6, m8 ; t7
- paddw m6, m8 ; out6
+ psubsw m9, m6, m8 ; t7
+ paddsw m6, m8 ; out6
vpbroadcastd m8, [o(pw_2896x8)]
- psubw m3, m7, m5 ; t3
- paddw m7, m5 ; -out7
- psubw m5, m0, m2 ; t2
- paddw m0, m2 ; out0
- psubw m2, m1, m4 ; t6
- paddw m1, m4 ; -out1
+ psubsw m3, m7, m5 ; t3
+ paddsw m7, m5 ; -out7
+ psubsw m5, m0, m2 ; t2
+ paddsw m0, m2 ; out0
+ psubsw m2, m1, m4 ; t6
+ paddsw m1, m4 ; -out1
psubw m4, m5, m3
paddw m3, m5
psubw m5, m2, m9
@@ -2959,25 +2959,25 @@
mova [rsp+gprsize+32*0], m6 ; tmp3
IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15
mova m6, [rsp+gprsize+32*1] ; tmp5
- psubw m15, m0, m14 ; out15
- paddw m0, m14 ; out0
- psubw m14, m2, m13 ; out14
- paddw m2, m13 ; out1
+ psubsw m15, m0, m14 ; out15
+ paddsw m0, m14 ; out0
+ psubsw m14, m2, m13 ; out14
+ paddsw m2, m13 ; out1
mova [rsp+gprsize+32*1], m2
- psubw m13, m4, m11 ; out13
- paddw m2, m4, m11 ; out2
- psubw m11, m8, m7 ; out11
- paddw m4, m8, m7 ; out4
+ psubsw m13, m4, m11 ; out13
+ paddsw m2, m4, m11 ; out2
+ psubsw m11, m8, m7 ; out11
+ paddsw m4, m8, m7 ; out4
mova m7, [rsp+gprsize+32*2] ; tmp7
- psubw m10, m6, m5 ; out10
- paddw m5, m6 ; out5
- psubw m8, m7, m9 ; out8
- paddw m7, m9 ; out7
- psubw m9, m12, m3 ; out9
- paddw m6, m12, m3 ; out6
+ psubsw m10, m6, m5 ; out10
+ paddsw m5, m6 ; out5
+ psubsw m8, m7, m9 ; out8
+ paddsw m7, m9 ; out7
+ psubsw m9, m12, m3 ; out9
+ paddsw m6, m12, m3 ; out6
mova m3, [rsp+gprsize+32*0] ; tmp3
- psubw m12, m3, m1 ; out12
- paddw m3, m1 ; out3
+ psubsw m12, m3, m1 ; out12
+ paddsw m3, m1 ; out3
ret
INV_TXFM_16X16_FN adst, dct
@@ -3012,24 +3012,24 @@
ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6
ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10
ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14
- psubw m0, m2, m10 ; t10a
- paddw m2, m10 ; t2a
- psubw m10, m13, m5 ; t11a
- paddw m13, m5 ; t3a
- psubw m5, m6, m14 ; t14a
- paddw m6, m14 ; t6a
- psubw m14, m9, m1 ; t15a
- paddw m9, m1 ; t7a
+ psubsw m0, m2, m10 ; t10a
+ paddsw m2, m10 ; t2a
+ psubsw m10, m13, m5 ; t11a
+ paddsw m13, m5 ; t3a
+ psubsw m5, m6, m14 ; t14a
+ paddsw m6, m14 ; t6a
+ psubsw m14, m9, m1 ; t15a
+ paddsw m9, m1 ; t7a
ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10
ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15
- psubw m1, m10, m14 ; t14a
- paddw m10, m14 ; t10a
- psubw m14, m0, m5 ; t15a
- paddw m0, m5 ; t11a
- psubw m5, m2, m6 ; t6
- paddw m2, m6 ; t2
- psubw m6, m13, m9 ; t7
- paddw m13, m9 ; t3
+ psubsw m1, m10, m14 ; t14a
+ paddsw m10, m14 ; t10a
+ psubsw m14, m0, m5 ; t15a
+ paddsw m0, m5 ; t11a
+ psubsw m5, m2, m6 ; t6
+ paddsw m2, m6 ; t2
+ psubsw m6, m13, m9 ; t7
+ paddsw m13, m9 ; t3
ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a
ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15
mova m9, [rsp+gprsize+32*0] ; in15
@@ -3042,46 +3042,46 @@
ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703 ; t5, t4
ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751 ; t9, t8
ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380 ; t13, t12
- psubw m10, m4, m8 ; t8a
- paddw m8, m4 ; t0a
- psubw m4, m9, m7 ; t9a
- paddw m9, m7 ; t1a
- psubw m7, m6, m12 ; t12a
- paddw m6, m12 ; t4a
- psubw m12, m11, m3 ; t13a
- paddw m11, m3 ; t5a
+ psubsw m10, m4, m8 ; t8a
+ paddsw m8, m4 ; t0a
+ psubsw m4, m9, m7 ; t9a
+ paddsw m9, m7 ; t1a
+ psubsw m7, m6, m12 ; t12a
+ paddsw m6, m12 ; t4a
+ psubsw m12, m11, m3 ; t13a
+ paddsw m11, m3 ; t5a
ITX_MULSUB_2W 10, 4, 2, 3, 15, 799, 4017 ; t9, t8
ITX_MULSUB_2W 12, 7, 2, 3, 15, 4017, 799 ; t12, t13
- psubw m3, m9, m11 ; t5
- paddw m9, m11 ; t1
- psubw m11, m4, m12 ; t12a
- paddw m4, m12 ; t8a
- paddw m12, m8, m6 ; t0
- psubw m8, m6 ; t4
- paddw m6, m10, m7 ; t9a
- psubw m10, m7 ; t13a
+ psubsw m3, m9, m11 ; t5
+ paddsw m9, m11 ; t1
+ psubsw m11, m4, m12 ; t12a
+ paddsw m4, m12 ; t8a
+ paddsw m12, m8, m6 ; t0
+ psubsw m8, m6 ; t4
+ paddsw m6, m10, m7 ; t9a
+ psubsw m10, m7 ; t13a
ITX_MULSUB_2W 8, 3, 2, 7, 15, 1567, 3784 ; t5a, t4a
ITX_MULSUB_2W 11, 10, 2, 7, 15, 1567, 3784 ; t13, t12
mova m7, [rsp+gprsize+32*0] ; t10a
mova m2, [rsp+gprsize+32*1] ; t6a
- paddw m15, m9, m13 ; -out15
- psubw m9, m13 ; t3a
- paddw m13, m11, m1 ; -out13
- psubw m11, m1 ; t15a
- psubw m1, m4, m7 ; t10
- paddw m7, m4 ; -out1
- psubw m4, m3, m2 ; t6
- paddw m3, m2 ; -out3
- paddw m2, m10, m14 ; out2
- psubw m10, m14 ; t14a
- paddw m14, m6, m0 ; out14
- psubw m6, m0 ; t11
+ paddsw m15, m9, m13 ; -out15
+ psubsw m9, m13 ; t3a
+ paddsw m13, m11, m1 ; -out13
+ psubsw m11, m1 ; t15a
+ psubsw m1, m4, m7 ; t10
+ paddsw m7, m4 ; -out1
+ psubsw m4, m3, m2 ; t6
+ paddsw m3, m2 ; -out3
+ paddsw m2, m10, m14 ; out2
+ psubsw m10, m14 ; t14a
+ paddsw m14, m6, m0 ; out14
+ psubsw m6, m0 ; t11
mova m0, [rsp+gprsize+32*2] ; t2
mova [rsp+gprsize+32*1], m7
- psubw m7, m12, m0 ; t2a
- paddw m0, m12 ; out0
- paddw m12, m8, m5 ; out12
- psubw m8, m5 ; t7
+ psubsw m7, m12, m0 ; t2a
+ paddsw m0, m12 ; out0
+ paddsw m12, m8, m5 ; out12
+ psubsw m8, m5 ; t7
paddw m5, m10, m11 ; -out5
psubw m10, m11 ; out10
psubw m11, m4, m8 ; -out11
@@ -3475,26 +3475,26 @@
ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a
ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a
.main2:
- psubw m6, m1, m11 ; t17 t30
- paddw m1, m11 ; t16 t31
- psubw m11, m9, m14 ; t18 t29
- paddw m9, m14 ; t19 t28
- psubw m14, m15, m0 ; t21 t26
- paddw m15, m0 ; t20 t27
- psubw m0, m8, m13 ; t22 t25
- paddw m8, m13 ; t23 t24
+ psubsw m6, m1, m11 ; t17 t30
+ paddsw m1, m11 ; t16 t31
+ psubsw m11, m9, m14 ; t18 t29
+ paddsw m9, m14 ; t19 t28
+ psubsw m14, m15, m0 ; t21 t26
+ paddsw m15, m0 ; t20 t27
+ psubsw m0, m8, m13 ; t22 t25
+ paddsw m8, m13 ; t23 t24
ITX_MUL2X_PACK 6, 12, 13, 10, 799, 4017, 3 ; t17a t30a
ITX_MUL2X_PACK 11, 12, 13, 10, m4017, 799, 3 ; t18a t29a
ITX_MUL2X_PACK 14, 12, 13, 10, 3406, 2276, 3 ; t21a t26a
ITX_MUL2X_PACK 0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
- psubw m13, m1, m9 ; t19a t28a
- paddw m1, m9 ; t16a t31a
- psubw m9, m8, m15 ; t20a t27a
- paddw m8, m15 ; t23a t24a
- psubw m15, m6, m11 ; t18 t29
- paddw m6, m11 ; t17 t30
- psubw m11, m0, m14 ; t21 t26
- paddw m0, m14 ; t22 t25
+ psubsw m13, m1, m9 ; t19a t28a
+ paddsw m1, m9 ; t16a t31a
+ psubsw m9, m8, m15 ; t20a t27a
+ paddsw m8, m15 ; t23a t24a
+ psubsw m15, m6, m11 ; t18 t29
+ paddsw m6, m11 ; t17 t30
+ psubsw m11, m0, m14 ; t21 t26
+ paddsw m0, m14 ; t22 t25
ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 1 ; t18a t29a
ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 1 ; t19 t28
ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 1 ; t20 t27
@@ -3501,48 +3501,48 @@
ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 1 ; t21a t26a
vbroadcasti128 m12, [o(deint_shuf)]
REPX {pshufb x, m12}, m0, m1, m6, m8
- psubw m14, m1, m8 ; t23 t24
- paddw m1, m8 ; t16 t31
- psubw m8, m6, m0 ; t22a t25a
- paddw m6, m0 ; t17a t30a
- psubw m0, m15, m11 ; t21 t26
- paddw m15, m11 ; t18 t29
- psubw m11, m13, m9 ; t20a t27a
- paddw m13, m9 ; t19a t28a
+ psubsw m14, m1, m8 ; t23 t24
+ paddsw m1, m8 ; t16 t31
+ psubsw m8, m6, m0 ; t22a t25a
+ paddsw m6, m0 ; t17a t30a
+ psubsw m0, m15, m11 ; t21 t26
+ paddsw m15, m11 ; t18 t29
+ psubsw m11, m13, m9 ; t20a t27a
+ paddsw m13, m9 ; t19a t28a
vpbroadcastd m12, [o(pw_2896x8)]
- punpcklqdq m9, m11, m0 ; t20a t21
- punpckhqdq m11, m0 ; t27a t26
- punpcklqdq m0, m14, m8 ; t23 t22a
- punpckhqdq m14, m8 ; t24 t25a
- psubw m8, m11, m9 ; t20 t21a
- paddw m11, m9 ; t27 t26a
- psubw m9, m14, m0 ; t23a t22
- paddw m14, m0 ; t24a t25
- REPX {pmulhrsw x, m12}, m8, m9, m14, m11
+ punpcklqdq m9, m11, m0 ; t20a t21
+ punpckhqdq m11, m0 ; t27a t26
+ punpcklqdq m0, m14, m8 ; t23 t22a
+ punpckhqdq m14, m8 ; t24 t25a
+ psubw m8, m11, m9 ; t20 t21a
+ paddw m11, m9 ; t27 t26a
+ psubw m9, m14, m0 ; t23a t22
+ paddw m14, m0 ; t24a t25
+ REPX {pmulhrsw x, m12}, m8, m9, m14, m11
punpcklqdq m0, m1, m6 ; t16 t17a
punpckhqdq m1, m6 ; t31 t30a
- psubw m10, m5, m8 ; out20 out21
- paddw m5, m8 ; out11 out10
- psubw m6, m3, m14 ; out24 out25
- paddw m3, m14 ; out7 out6
- psubw m8, m7, m0 ; out16 out17
- paddw m7, m0 ; out15 out14
+ psubsw m10, m5, m8 ; out20 out21
+ paddsw m5, m8 ; out11 out10
+ psubsw m6, m3, m14 ; out24 out25
+ paddsw m3, m14 ; out7 out6
+ psubsw m8, m7, m0 ; out16 out17
+ paddsw m7, m0 ; out15 out14
mova m0, [rsp+gprsize+0*32]
punpcklqdq m12, m13, m15 ; t19a t18
punpckhqdq m13, m15 ; t28a t29
- psubw m15, m0, m1 ; out31 out30
- paddw m0, m1 ; out0 out1
+ psubsw m15, m0, m1 ; out31 out30
+ paddsw m0, m1 ; out0 out1
mova m1, [rsp+gprsize+1*32]
mova [rsp+gprsize+0*32], m6
mova m6, [rsp+gprsize+2*32]
- psubw m14, m1, m13 ; out28 out29
- paddw m1, m13 ; out3 out2
- psubw m13, m2, m11 ; out27 out26
- paddw m2, m11 ; out4 out5
- psubw m11, m4, m9 ; out23 out22
- paddw m4, m9 ; out8 out9
- psubw m9, m6, m12 ; out19 out18
- paddw m6, m12 ; out12 out13
+ psubsw m14, m1, m13 ; out28 out29
+ paddsw m1, m13 ; out3 out2
+ psubsw m13, m2, m11 ; out27 out26
+ paddsw m2, m11 ; out4 out5
+ psubsw m11, m4, m9 ; out23 out22
+ paddsw m4, m9 ; out8 out9
+ psubsw m9, m6, m12 ; out19 out18
+ paddsw m6, m12 ; out12 out13
ret
%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
@@ -3873,8 +3873,8 @@
%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
mova m%4, [%2]
- paddw m%3, m%1, m%4
- psubw m%1, m%4
+ paddsw m%3, m%1, m%4
+ psubsw m%1, m%4
pmovzxbw m%4, [dstq+%6]
pmulhrsw m%3, m%5
pmulhrsw m%1, m%5
@@ -4057,29 +4057,29 @@
ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290 ; t22a, t25a
ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601 ; t23a, t24a
.main2:
- psubw m7, m12, m4 ; t18
- paddw m12, m4 ; t19
- psubw m4, m2, m10 ; t21
- paddw m2, m10 ; t20
- psubw m10, m14, m6 ; t22
- paddw m14, m6 ; t23
- psubw m6, m1, m9 ; t25
- paddw m1, m9 ; t24
- psubw m9, m13, m5 ; t26
- paddw m13, m5 ; t27
- psubw m5, m3, m11 ; t29
- paddw m3, m11 ; t28
+ psubsw m7, m12, m4 ; t18
+ paddsw m12, m4 ; t19
+ psubsw m4, m2, m10 ; t21
+ paddsw m2, m10 ; t20
+ psubsw m10, m14, m6 ; t22
+ paddsw m14, m6 ; t23
+ psubsw m6, m1, m9 ; t25
+ paddsw m1, m9 ; t24
+ psubsw m9, m13, m5 ; t26
+ paddsw m13, m5 ; t27
+ psubsw m5, m3, m11 ; t29
+ paddsw m3, m11 ; t28
ITX_MULSUB_2W 5, 7, 8, 11, 15, m4017, 799 ; t18a, t29a
ITX_MULSUB_2W 9, 4, 8, 11, 15, 3406, 2276 ; t21a, t26a
ITX_MULSUB_2W 6, 10, 8, 11, 15, m2276, 3406 ; t22a, t25a
- psubw m8, m14, m2 ; t20a
- paddw m14, m2 ; t23a
- psubw m2, m1, m13 ; t27a
- paddw m1, m13 ; t24a
- psubw m13, m6, m9 ; t21
- paddw m6, m9 ; t22
- psubw m9, m10, m4 ; t26
- paddw m10, m4 ; t25
+ psubsw m8, m14, m2 ; t20a
+ paddsw m14, m2 ; t23a
+ psubsw m2, m1, m13 ; t27a
+ paddsw m1, m13 ; t24a
+ psubsw m13, m6, m9 ; t21
+ paddsw m6, m9 ; t22
+ psubsw m9, m10, m4 ; t26
+ paddsw m10, m4 ; t25
ITX_MULSUB_2W 2, 8, 4, 11, 15, m3784, 1567 ; t20, t27
ITX_MULSUB_2W 9, 13, 4, 11, 15, m3784, 1567 ; t21a, t26a
mova m4, [rsp+gprsize+32*0] ; in31
@@ -4090,31 +4090,31 @@
mova [rsp+gprsize+32*2], m1 ; t24a
ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091 ; t16a, t31a
ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751 ; t17a, t30a
- psubw m1, m0, m14 ; t17
- paddw m0, m14 ; t16
- psubw m14, m4, m6 ; t30
- paddw m4, m6 ; t31
+ psubsw m1, m0, m14 ; t17
+ paddsw m0, m14 ; t16
+ psubsw m14, m4, m6 ; t30
+ paddsw m4, m6 ; t31
ITX_MULSUB_2W 14, 1, 6, 11, 15, 799, 4017 ; t17a, t30a
- psubw m6, m0, m12 ; t19a
- paddw m0, m12 ; t16a
- psubw m12, m4, m3 ; t28a
- paddw m4, m3 ; t31a
- psubw m3, m14, m5 ; t18
- paddw m14, m5 ; t17
- psubw m5, m1, m7 ; t29
- paddw m1, m7 ; t30
+ psubsw m6, m0, m12 ; t19a
+ paddsw m0, m12 ; t16a
+ psubsw m12, m4, m3 ; t28a
+ paddsw m4, m3 ; t31a
+ psubsw m3, m14, m5 ; t18
+ paddsw m14, m5 ; t17
+ psubsw m5, m1, m7 ; t29
+ paddsw m1, m7 ; t30
ITX_MULSUB_2W 5, 3, 7, 11, 15, 1567, 3784 ; t18a, t29a
ITX_MULSUB_2W 12, 6, 7, 11, 15, 1567, 3784 ; t19, t28
- psubw m7, m1, m10 ; t25a
- paddw m1, m10 ; t30a
- psubw m10, m5, m9 ; t21
- paddw m5, m9 ; t18
- psubw m9, m12, m2 ; t20a
- paddw m12, m2 ; t19a
- psubw m2, m3, m13 ; t26
- paddw m3, m13 ; t29
- psubw m13, m6, m8 ; t27a
- paddw m6, m8 ; t28a
+ psubsw m7, m1, m10 ; t25a
+ paddsw m1, m10 ; t30a
+ psubsw m10, m5, m9 ; t21
+ paddsw m5, m9 ; t18
+ psubsw m9, m12, m2 ; t20a
+ paddsw m12, m2 ; t19a
+ psubsw m2, m3, m13 ; t26
+ paddsw m3, m13 ; t29
+ psubsw m13, m6, m8 ; t27a
+ paddsw m6, m8 ; t28a
mova [tmp1q-32*2], m5
mova [tmp1q-32*1], m12
mova [tmp2q+32*0], m6
@@ -4124,12 +4124,12 @@
mova m6, [rsp+gprsize+32*1] ; t23
mova m3, [rsp+gprsize+32*2] ; t24a
vpbroadcastd m8, [o(pw_2896x8)]
- psubw m1, m14, m5 ; t22a
- paddw m14, m5 ; t17a
- psubw m5, m0, m6 ; t23
- paddw m0, m6 ; t16
- psubw m6, m4, m3 ; t24
- paddw m4, m3 ; t31
+ psubsw m1, m14, m5 ; t22a
+ paddsw m14, m5 ; t17a
+ psubsw m5, m0, m6 ; t23
+ paddsw m0, m6 ; t16
+ psubsw m6, m4, m3 ; t24
+ paddsw m4, m3 ; t31
mova [tmp1q-32*4], m0
mova [tmp1q-32*3], m14
mova [tmp2q+32*3], m4
@@ -4242,13 +4242,13 @@
; Perform the final sumsub step and YMM lane shuffling
%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
mova m%3, [tmp2q+32*( 3-%1)]
- psubw m%4, m%1, m%3
- paddw m%1, m%3
+ psubsw m%4, m%1, m%3
+ paddsw m%1, m%3
mova m%3, [tmp1q+32*(11-%2)]
mova [tmp1q+32*(11-%2)+16], xm%4
vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
- paddw m%4, m%2, m%3
- psubw m%2, m%3
+ paddsw m%4, m%2, m%3
+ psubsw m%2, m%3
mova [tmp1q+32*(11-%2)], xm%2
vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
vperm2i128 m%2, m%1, m%4, 0x31
@@ -4709,12 +4709,12 @@
mova m%5, [tmp1q-32*(45-%1)]
mova m%4, [tmp2q-32*(20+%1)]
%endif
- psubw m%6, m%5, m%4 ; idct32 out31-n
- paddw m%5, m%4 ; idct32 out 0+n
- psubw m%4, m%6, m%3 ; out32+n
- paddw m%6, m%3 ; out31-n
- psubw m%3, m%5, m%2 ; out63-n
- paddw m%5, m%2 ; out 0+n
+ psubsw m%6, m%5, m%4 ; idct32 out31-n
+ paddsw m%5, m%4 ; idct32 out 0+n
+ psubsw m%4, m%6, m%3 ; out32+n
+ paddsw m%6, m%3 ; out31-n
+ psubsw m%3, m%5, m%2 ; out63-n
+ paddsw m%5, m%2 ; out 0+n
%if %0 == 6 ; pass 1
%if %1 & 1
mova [tmp2q-32*(19-%1)], m%4
@@ -4949,25 +4949,25 @@
pmulhrsw m2, m13 ; t34a
pmulhrsw m8, m3 ; t60a
pmulhrsw m3, m12 ; t35a
- psubw m12, m0, m1 ; t33
- paddw m0, m1 ; t32
- psubw m1, m3, m2 ; t34
- paddw m3, m2 ; t35
- psubw m2, m8, m9 ; t61
- paddw m8, m9 ; t60
- psubw m9, m11, m10 ; t62
- paddw m11, m10 ; t63
+ psubsw m12, m0, m1 ; t33
+ paddsw m0, m1 ; t32
+ psubsw m1, m3, m2 ; t34
+ paddsw m3, m2 ; t35
+ psubsw m2, m8, m9 ; t61
+ paddsw m8, m9 ; t60
+ psubsw m9, m11, m10 ; t62
+ paddsw m11, m10 ; t63
ITX_MULSUB_2W 2, 1, 10, 13, 15, m4076, 401 ; t34a, t61a
vpbroadcastd m14, [o(pw_401_4076)]
ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
- psubw m10, m0, m3 ; t35a
- paddw m0, m3 ; t32a
- psubw m3, m11, m8 ; t60a
- paddw m11, m8 ; t63a
- psubw m8, m9, m2 ; t34
- paddw m9, m2 ; t33
- psubw m2, m12, m1 ; t61
- paddw m12, m1 ; t62
+ psubsw m10, m0, m3 ; t35a
+ paddsw m0, m3 ; t32a
+ psubsw m3, m11, m8 ; t60a
+ paddsw m11, m8 ; t63a
+ psubsw m8, m9, m2 ; t34
+ paddsw m9, m2 ; t33
+ psubsw m2, m12, m1 ; t61
+ paddsw m12, m1 ; t62
mova [tmp1q-32*4], m0
mova [tmp1q-32*3], m9
mova [tmp2q+32*2], m12
@@ -4996,25 +4996,25 @@
pmulhrsw m6, m9 ; t38a
pmulhrsw m0, m7 ; t56a
pmulhrsw m7, m8 ; t39a
- psubw m8, m4, m5 ; t37
- paddw m4, m5 ; t36
- psubw m5, m7, m6 ; t38
- paddw m7, m6 ; t39
- psubw m6, m0, m1 ; t57
- paddw m0, m1 ; t56
- psubw m1, m3, m2 ; t58
- paddw m3, m2 ; t59
+ psubsw m8, m4, m5 ; t37
+ paddsw m4, m5 ; t36
+ psubsw m5, m7, m6 ; t38
+ paddsw m7, m6 ; t39
+ psubsw m6, m0, m1 ; t57
+ paddsw m0, m1 ; t56
+ psubsw m1, m3, m2 ; t58
+ paddsw m3, m2 ; t59
ITX_MULSUB_2W 6, 5, 2, 9, 15, m2598, 3166 ; t38a, t57a
vpbroadcastd m10, [o(pw_3166_2598)]
ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9 ; t37a, t58a
- psubw m2, m7, m4 ; t36a
- paddw m7, m4 ; t39a
- psubw m4, m0, m3 ; t59a
- paddw m0, m3 ; t56a
- psubw m3, m6, m1 ; t37
- paddw m6, m1 ; t38
- psubw m1, m5, m8 ; t58
- paddw m5, m8 ; t57
+ psubsw m2, m7, m4 ; t36a
+ paddsw m7, m4 ; t39a
+ psubsw m4, m0, m3 ; t59a
+ paddsw m0, m3 ; t56a
+ psubsw m3, m6, m1 ; t37
+ paddsw m6, m1 ; t38
+ psubsw m1, m5, m8 ; t58
+ paddsw m5, m8 ; t57
mova [tmp1q+32*2], m6
mova [tmp1q+32*3], m7
mova [tmp2q-32*4], m0
@@ -5056,24 +5056,24 @@
mova m3, [tmp2q-32* 4] ; t47a
mova m6, [tmp1q+32*11] ; t56a
mova m7, [tmp2q+32*12] ; t63a
- psubw m8, m0, m1 ; t39
- paddw m0, m1 ; t32
- psubw m1, m3, m2 ; t40
- paddw m3, m2 ; t47
- psubw m2, m4, m5 ; t55
- paddw m4, m5 ; t48
- psubw m5, m7, m6 ; t56
- paddw m7, m6 ; t63
+ psubsw m8, m0, m1 ; t39
+ paddsw m0, m1 ; t32
+ psubsw m1, m3, m2 ; t40
+ paddsw m3, m2 ; t47
+ psubsw m2, m4, m5 ; t55
+ paddsw m4, m5 ; t48
+ psubsw m5, m7, m6 ; t56
+ paddsw m7, m6 ; t63
ITX_MULSUB_2W 5, 8, 6, 9, 15, 11, 12 ; t39a, t56a
ITX_MULSUB_2W 2, 1, 6, 9, 15, 12, 13 ; t40a, t55a
- psubw m6, m0, m3 ; t47a
- paddw m0, m3 ; t32a
- psubw m3, m7, m4 ; t48a
- paddw m7, m4 ; t63a
- psubw m4, m5, m2 ; t40
- paddw m5, m2 ; t39
- psubw m2, m8, m1 ; t55
- paddw m8, m1 ; t56
+ psubsw m6, m0, m3 ; t47a
+ paddsw m0, m3 ; t32a
+ psubsw m3, m7, m4 ; t48a
+ paddsw m7, m4 ; t63a
+ psubsw m4, m5, m2 ; t40
+ paddsw m5, m2 ; t39
+ psubsw m2, m8, m1 ; t55
+ paddsw m8, m1 ; t56
psubw m1, m2, m4 ; t40a
paddw m2, m4 ; t55a
psubw m4, m3, m6 ; t47
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -135,8 +135,8 @@
pmulhrsw m0, [qw_2896x8] ;high: t1 ;low: t0
%endif
- psubw m1, m0, m2 ;high: out2 ;low: out3
- paddw m0, m2 ;high: out1 ;low: out0
+ psubsw m1, m0, m2 ;high: out2 ;low: out3
+ paddsw m0, m2 ;high: out1 ;low: out0
%endmacro
%macro IADST4_1D_PACKED 0