ref: a9315f5fde02530f64358375c3d2444a506b3a58
parent: e2702eaf5f13d5f93be75084a5bfecc77a67c001
author: Henrik Gramner <gramner@twoorioles.com>
date: Wed Sep 4 18:06:58 EDT 2019
x86: Increase precision of the final inverse ADST transform stages

16-bit precision is sufficient for the second pass, but the first pass
requires 32-bit precision to correctly handle some esoteric edge cases.
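Why the order of operations matters can be sketched with a small standalone
C snippet (illustrative only, not part of the patch; the sample inputs are
made up). The old first pass added the two 16-bit terms with a saturating
paddsw and then applied a pmulhrsw rounding multiply with a pw_2896x8
constant; the new code multiplies both terms by 2896 with pmaddwd, rounds
with pd_2048 and shifts right by 12 before packing, so the addition happens
at 32-bit precision:

    #include <stdint.h>
    #include <stdio.h>

    static int16_t sat16(int32_t v) {
        return v > 32767 ? 32767 : v < -32768 ? -32768 : (int16_t)v;
    }

    /* Old path: saturating 16-bit add, then (x * 2896*8 + 2^14) >> 15,
     * which is what pmulhrsw with a pw_2896x8 constant computes. */
    static int16_t scale_2896_16bit(int16_t a, int16_t b) {
        int16_t sum = sat16((int32_t)a + b);
        return (int16_t)(((int32_t)sum * (2896 * 8) + (1 << 14)) >> 15);
    }

    /* New path: pmaddwd with a (2896, 2896) coefficient pair, add pd_2048,
     * psrad by 12, then saturate when packing back to 16 bits. */
    static int16_t scale_2896_32bit(int16_t a, int16_t b) {
        int32_t sum = 2896 * (int32_t)a + 2896 * (int32_t)b;
        return sat16((sum + 2048) >> 12);
    }

    int main(void) {
        /* Hypothetical near-maximal first-pass intermediates: the 16-bit
         * sum saturates, so the two orderings disagree. */
        int16_t a = 30000, b = 20000;
        printf("16-bit: %d, 32-bit: %d\n",
               scale_2896_16bit(a, b), scale_2896_32bit(a, b));
        return 0;
    }

In the second pass the result is clipped to the pixel range right after this
stage, so saturating to 16 bits there yields the same final output and the
cheaper pmulhrsw variant is kept.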
--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -47,9 +47,11 @@
pw_3803_1321: dw 3803, 1321
pw_m1321_2482: dw -1321, 2482
pw_2482_3344: dw 2482, 3344
+pw_m3344_3344: dw -3344, 3344
pw_m3803_3344: dw -3803, 3344
pw_m3803_m6688: dw -3803, -6688
-%define pw_3344x8 iadst4_dconly2b
+COEF_PAIR 2896, 2896
+pw_2896_m2896: dw 2896, -2896
pw_5: times 2 dw 5
pw_2048: times 2 dw 2048
@@ -464,13 +466,15 @@
%macro IADST4_1D_PACKED 0
punpcklwd m2, m1, m0
punpckhwd m3, m1, m0
- psubw m0, m1
- punpckhqdq m1, m1
- paddw m1, m0 ; in0 - in2 + in3
+ vpbroadcastd m5, [o(pw_m3344_3344)]
vpbroadcastd m0, [o(pw_3803_1321)]
vpbroadcastd m4, [o(pw_m1321_2482)]
+ pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2
+ psrld m5, 16
pmaddwd m0, m2
pmaddwd m2, m4
+ pmaddwd m5, m3 ; 3344*in0
+ paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3
vpbroadcastd m4, [o(pw_2482_3344)]
vpbroadcastd m5, [o(pw_m3803_3344)]
pmaddwd m4, m3
@@ -478,19 +482,16 @@
paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
vpbroadcastd m0, [o(pw_m3803_m6688)]
pmaddwd m3, m0
- vpbroadcastd m0, [o(pw_3344x8)]
- pmulhrsw m1, m0 ; out2 ____
vpbroadcastd m0, [o(pd_2048)]
paddd m2, m0
+ paddd m1, m0
paddd m0, m4
paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
paddd m2, m4
paddd m2, m3
- psrad m0, 12
- psrad m5, 12
- psrad m2, 12
+ REPX {psrad x, 12}, m1, m2, m0, m5
packssdw m0, m5 ; out0 out1
- packssdw m2, m2 ; out3 out3
+ packssdw m1, m2 ; out2 out3
%endmacro
INV_TXFM_4X4_FN dct, dct, 0
@@ -524,7 +525,7 @@
mova m0, [cq+16*0]
mova m1, [cq+16*1]
call .main
- punpckhwd m3, m0, m2
+ punpckhwd m3, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
@@ -531,7 +532,6 @@
jmp tx2q
.pass2:
call .main
- vpblendd m1, m1, m2, 0x0c ; out2 out3
.end:
pxor m2, m2
mova [cq+16*0], m2
@@ -552,14 +552,13 @@
mova m0, [cq+16*0]
mova m1, [cq+16*1]
call m(iadst_4x4_internal).main
- punpcklwd m1, m0
- punpckhwd m2, m0
- punpcklwd m0, m2, m1
- punpckhwd m1, m2, m1
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
jmp tx2q
.pass2:
call m(iadst_4x4_internal).main
- vpblendd m1, m1, m2, 0x0c ; out2 out3
.end:
pxor m2, m2
mova [cq+16*0], m2
@@ -710,12 +709,55 @@
paddsw m1, m5 ; out3 out2
%endmacro
-%macro IADST8_1D_PACKED 0
+%macro IADST8_1D_PACKED 1 ; pass
vpbroadcastd m6, [o(pd_2048)]
punpckhwd m0, m4, m3 ; 0 7
punpckhwd m1, m5, m2 ; 2 5
punpcklwd m2, m5 ; 4 3
punpcklwd m3, m4 ; 6 1
+%if %1 == 1
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
+ psubsw m4, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
+%if mmsize > 16
+ vbroadcasti128 m2, [o(deint_shuf)]
+%else
+ mova m2, [o(deint_shuf)]
+%endif
+ pshuflw m1, m1, q2301
+ pshufhw m1, m1, q2301
+ psubsw m3, m0, m1 ; t3 t2
+ paddsw m0, m1 ; -out7 out0
+ psubsw m1, m4, m5 ; t7 t6
+ paddsw m4, m5 ; out6 -out1
+ pshufb m0, m2
+ pshufb m4, m2
+ vpbroadcastd m5, [o(pw_m2896_2896)]
+ pmaddwd m2, m5, m3
+ pmaddwd m5, m1
+ paddd m2, m6
+ paddd m5, m6
+ psrad m2, 12
+ psrad m5, 12
+ packssdw m2, m5 ; out4 -out5
+ vpbroadcastd m5, [o(pw_2896_2896)]
+ pmaddwd m3, m5
+ pmaddwd m1, m5
+ paddd m3, m6
+ paddd m1, m6
+ psrad m3, 12
+ psrad m1, 12
+ packssdw m1, m3 ; out2 -out3
+ punpcklqdq m3, m4, m0 ; out6 -out7
+ punpckhqdq m0, m4 ; out0 -out1
+%else
ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a
ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
@@ -743,6 +785,7 @@
pmulhrsw m2, m5 ; out4 -out5
pshufd m1, m1, q1032
pmulhrsw m1, m5 ; out2 -out3
+%endif
%endmacro
INIT_YMM avx2
@@ -790,7 +833,7 @@
pmulhrsw m0, m2
pmulhrsw m1, m2
call m(iadst_8x4_internal).main
- punpckhwd m3, m0, m2
+ punpckhwd m3, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
@@ -800,7 +843,7 @@
vextracti128 xm3, m1, 1
pshufd xm4, xm0, q1032
pshufd xm5, xm1, q1032
- call .main
+ call .main_pass2
vpbroadcastd m4, [o(pw_2048)]
vinserti128 m0, m0, xm2, 1
vinserti128 m1, m1, xm3, 1
@@ -822,9 +865,13 @@
WRITE_4X8 0, 1
RET
ALIGN function_align
-.main:
- WRAP_XMM IADST8_1D_PACKED
+.main_pass1:
+ WRAP_XMM IADST8_1D_PACKED 1
ret
+ALIGN function_align
+.main_pass2:
+ WRAP_XMM IADST8_1D_PACKED 2
+ ret
INV_TXFM_4X8_FN flipadst, dct, 0
INV_TXFM_4X8_FN flipadst, adst
@@ -839,7 +886,7 @@
pmulhrsw m1, m2
call m(iadst_8x4_internal).main
punpcklwd m3, m1, m0
- punpckhwd m1, m2, m0
+ punpckhwd m1, m0
punpcklwd m0, m1, m3
punpckhwd m1, m3
jmp tx2q
@@ -848,7 +895,7 @@
vextracti128 xm3, m1, 1
pshufd xm4, xm0, q1032
pshufd xm5, xm1, q1032
- call m(iadst_4x8_internal).main
+ call m(iadst_4x8_internal).main_pass2
vpbroadcastd m5, [o(pw_2048)]
vinserti128 m3, m3, xm1, 1
vinserti128 m2, m2, xm0, 1
@@ -1099,8 +1146,13 @@
jmp tx2q
.pass2:
call .main
- pshufd m1, m1, q1032
+ vpbroadcastd m5, [o(pw_2896x8)]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m5 ; out8 -out11 -out9 out10
vpbroadcastd m5, [o(pw_2048)]
+ pshufd m1, m1, q1032
vpblendd m4, m1, m0, 0x33
vpblendd m0, m0, m2, 0x33
vpblendd m2, m2, m3, 0x33
@@ -1176,7 +1228,6 @@
vinserti128 m0, m0, xm2, 1 ; t1 t0 t9a t8a
vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14
vinserti128 m4, m4, xm1, 1 ; t4a t5a t12 t13
- vpbroadcastd m5, [o(pw_2896x8)]
pshufd m2, m2, q1032 ; t6a t7a t14 t15
psubsw m1, m0, m3 ; t3a t2a t11 t10
paddsw m0, m3 ; -out15 out0 out14 -out1
@@ -1184,11 +1235,22 @@
psubsw m4, m2 ; t6 t7 t14a t15a
shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a
vpblendd m4, m4, m1, 0x33 ; t3a t7 t11 t15a
- paddsw m1, m2, m4
- psubsw m2, m4
- pmulhrsw m1, m5 ; -out7 out4 out6 -out5
- pmulhrsw m2, m5 ; out8 -out11 -out9 out10
ret
+ALIGN function_align
+.main_pass1_end:
+ vpbroadcastd m5, [o(pw_m2896_2896)]
+ vpbroadcastd m6, [o(pw_2896_2896)]
+ punpcklwd m1, m4, m2
+ punpckhwd m4, m2
+ pmaddwd m2, m5, m4
+ pmaddwd m4, m6
+ pmaddwd m5, m1
+ pmaddwd m1, m6
+ REPX {paddd x, m8}, m5, m1, m2, m4
+ REPX {psrad x, 12}, m5, m2, m1, m4
+ packssdw m2, m5 ; -out11 out8 out10 -out9
+ packssdw m1, m4 ; -out7 out4 out6 -out5
+ ret
INV_TXFM_4X16_FN flipadst, dct, 0
INV_TXFM_4X16_FN flipadst, adst
@@ -1214,8 +1276,13 @@
jmp tx2q
.pass2:
call m(iadst_4x16_internal).main
- pshufd m1, m1, q1032
+ vpbroadcastd m5, [o(pw_2896x8)]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m5 ; out8 -out11 -out9 out10
vpbroadcastd m6, [o(pw_2048)]
+ pshufd m1, m1, q1032
vpblendd m4, m0, m2, 0x33
vpblendd m0, m0, m1, 0xcc
vpblendd m1, m1, m3, 0xcc
@@ -1381,7 +1448,7 @@
pmulhrsw xm2, xm0, [cq+16*2]
pmulhrsw xm4, xm0
pmulhrsw xm5, xm0
- call m(iadst_4x8_internal).main
+ call m(iadst_4x8_internal).main_pass1
vinserti128 m0, m0, xm2, 1
vinserti128 m1, m1, xm3, 1
punpckhwd m2, m0, m1
@@ -1393,7 +1460,6 @@
jmp tx2q
.pass2:
call .main
- vpblendd m1, m1, m2, 0xcc
.end:
vpermq m0, m0, q3120
vpermq m1, m1, q3120
@@ -1427,7 +1493,7 @@
pmulhrsw xm2, xm0, [cq+16*2]
pmulhrsw xm4, xm0
pmulhrsw xm5, xm0
- call m(iadst_4x8_internal).main
+ call m(iadst_4x8_internal).main_pass1
vinserti128 m3, m3, xm1, 1
vinserti128 m2, m2, xm0, 1
punpckhwd m1, m3, m2
@@ -1439,7 +1505,7 @@
jmp tx2q
.pass2:
call m(iadst_8x4_internal).main
- vpblendd m2, m2, m1, 0x33
+ mova m2, m1
vpermq m1, m0, q2031
vpermq m0, m2, q2031
jmp m(iadst_8x4_internal).end2
@@ -1580,7 +1646,7 @@
vpermq m3, [cq+32*3], q3120 ; 6 7
vpermq m5, [cq+32*1], q1302 ; 3 2
vpermq m2, [cq+32*2], q3120 ; 4 5
- call .main
+ call .main_pass1
vpbroadcastd m5, [o(pw_16384)]
punpcklwd m4, m0, m1
punpckhwd m0, m1
@@ -1604,7 +1670,7 @@
.pass2:
pshufd m4, m0, q1032
pshufd m5, m1, q1032
- call .main
+ call .main_pass2
vpbroadcastd m5, [o(pw_2048)]
vpbroadcastd xm4, [o(pw_4096)]
psubw m4, m5 ; lower half = 2048, upper half = -2048
@@ -1629,9 +1695,13 @@
WRITE_8X4 2, 3, 4, 5
RET
ALIGN function_align
-.main:
- IADST8_1D_PACKED
+.main_pass1:
+ IADST8_1D_PACKED 1
ret
+ALIGN function_align
+.main_pass2:
+ IADST8_1D_PACKED 2
+ ret
INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
@@ -1643,7 +1713,7 @@
vpermq m3, [cq+32*3], q3120 ; 6 7
vpermq m5, [cq+32*1], q1302 ; 3 2
vpermq m2, [cq+32*2], q3120 ; 4 5
- call m(iadst_8x8_internal).main
+ call m(iadst_8x8_internal).main_pass1
vpbroadcastd m5, [o(pw_16384)]
punpckhwd m4, m3, m2
punpcklwd m3, m2
@@ -1667,7 +1737,7 @@
.pass2:
pshufd m4, m0, q1032
pshufd m5, m1, q1032
- call m(iadst_8x8_internal).main
+ call m(iadst_8x8_internal).main_pass2
vpbroadcastd m4, [o(pw_2048)]
vpbroadcastd xm5, [o(pw_4096)]
psubw m4, m5 ; lower half = -2048, upper half = 2048
@@ -1867,6 +1937,7 @@
cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_8X16_LOAD_COEFS
call m(iadst_16x8_internal).main
+ call m(iadst_16x8_internal).main_pass1_end
vpbroadcastd m10, [o(pw_16384)]
pslld m9, m10, 17
psubw m10, m9 ; 16384, -16384
@@ -1874,6 +1945,7 @@
ALIGN function_align
.pass2:
call .main
+ call .main_pass2_end
vpbroadcastd m9, [o(pw_2048)]
vpbroadcastd xm8, [o(pw_4096)]
psubw m8, m9
@@ -1930,39 +2002,73 @@
paddsw m4, m6 ; t8a t9a
vpbroadcastd m11, [o(pw_m3784_1567)]
vpbroadcastd m12, [o(pw_1567_3784)]
- ITX_MUL2X_PACK 3, 6, _, 10, 11, 12, 4 ; t4a t5a
+ ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a
psubw m6, m9, m11 ; pw_3784_m1567
- ITX_MUL2X_PACK 8, 12, _, 10, 12, 6, 4 ; t6a t7a
+ ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a
vpbroadcastd m11, [o(pw_m1567_3784)]
vpbroadcastd m12, [o(pw_3784_1567)]
- ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 4 ; t15 t14
+ ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14
psubw m6, m9, m11 ; pw_1567_m3784
- ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 4 ; t13 t12
- vbroadcasti128 m11, [o(deint_shuf)]
- vpbroadcastd m12, [o(pw_2896x8)]
- psubsw m6, m0, m1 ; t3a t2a
+ ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12
+ vbroadcasti128 m12, [o(deint_shuf)]
+ paddsw m6, m4, m7 ; -out1 out14
+ psubsw m4, m7 ; t10 t11
+ psubsw m11, m3, m8 ; t7 t6
+ paddsw m8, m3 ; out12 -out3
+ psubsw m3, m0, m1 ; t3a t2a
paddsw m0, m1 ; -out15 out0
paddsw m1, m2, m5 ; -out13 out2
psubsw m5, m2 ; t15a t14a
- paddsw m2, m4, m7 ; -out1 out14
- psubsw m4, m7 ; t10 t11
- psubsw m7, m3, m8 ; t6 t7
- paddsw m8, m3 ; -out3 out12
- REPX {pshufb x, m11}, m6, m4, m0, m2
- vpblendd m3, m6, m4, 0xcc ; t3a t11
- shufps m6, m6, m4, q1032 ; t2a t10
- vpblendd m4, m5, m7, 0xcc ; t15a t7
- shufps m5, m5, m7, q1032 ; t14a t6
- shufps m7, m2, m0, q1032 ; out14 -out15
- vpblendd m0, m0, m2, 0x33 ; -out1 out0
- paddsw m2, m5, m4 ; -out5 out4
- psubsw m5, m4 ; out10 -out11
- psubsw m4, m6, m3 ; out8 -out9
- paddsw m3, m6 ; -out7 out6
- shufps m6, m8, m1, q1032 ; out12 -out13
- vpblendd m1, m1, m8, 0x33 ; -out3 out2
- REPX {pmulhrsw x, m12}, m2, m3, m4, m5
+ pshufb m0, m12
+ pshufb m6, m12
+ pshufb m8, m12
+ pshufb m1, m12
+ shufps m7, m6, m0, q1032 ; out14 -out15
+ vpblendd m0, m6, 0x33 ; -out1 out0
+ punpcklqdq m6, m8, m1 ; out12 -out13
+ punpckhqdq m1, m8, m1 ; -out3 out2
ret
+ALIGN function_align
+.main_pass1_end:
+ vpbroadcastd m8, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ pmaddwd m9, m8, m11 ; -out11
+ pmaddwd m2, m12, m5 ; -out5
+ pmaddwd m5, m8 ; out10
+ pmaddwd m11, m12 ; out4
+ REPX {paddd x, m10}, m9, m5, m2, m11
+ REPX {psrad x, 12 }, m9, m5, m2, m11
+ packssdw m5, m9 ; out10 -out11
+ packssdw m2, m11 ; -out5 out4
+ pmaddwd m11, m8, m3 ; out8
+ vpbroadcastd m8, [o(pw_2896_m2896)]
+ pmaddwd m3, m12 ; -out7
+ pmaddwd m8, m4 ; -out9
+ pmaddwd m4, m12 ; out6
+ REPX {paddd x, m10}, m11, m3, m8, m4
+ REPX {psrad x, 12 }, m11, m3, m8, m4
+ packssdw m3, m4 ; -out7 out6
+ packssdw m4, m11, m8 ; out8 -out9
+ vpbroadcastd m10, [o(pw_16384)]
+ pxor m9, m9
+ ret
+ALIGN function_align
+.main_pass2_end:
+ vpbroadcastd m8, [o(pw_2896x8)]
+ pshufb m2, m11, m12
+ pshufb m5, m12
+ pshufb m3, m12
+ pshufb m4, m12
+ punpcklqdq m11, m5, m2 ; t15a t7
+ punpckhqdq m5, m2 ; t14a t6
+ shufps m2, m3, m4, q1032 ; t2a t10
+ vpblendd m3, m4, 0xcc ; t3a t11
+ psubsw m4, m2, m3 ; out8 -out9
+ paddsw m3, m2 ; -out7 out6
+ paddsw m2, m5, m11 ; -out5 out4
+ psubsw m5, m11 ; out10 -out11
+ REPX {pmulhrsw x, m8}, m2, m3, m4, m5
+ ret
INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, adst
@@ -1972,6 +2078,7 @@
cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_8X16_LOAD_COEFS
call m(iadst_16x8_internal).main
+ call m(iadst_16x8_internal).main_pass1_end
vpbroadcastd m9, [o(pw_16384)]
pslld m10, m9, 17
psubw m10, m9 ; -16384, 16384
@@ -1990,6 +2097,7 @@
jmp m(idct_8x16_internal).pass1_end2
.pass2:
call m(iadst_8x16_internal).main
+ call m(iadst_8x16_internal).main_pass2_end
vpbroadcastd m8, [o(pw_2048)]
vpbroadcastd xm9, [o(pw_4096)]
psubw m8, m9
@@ -2232,7 +2340,7 @@
vpermq m1, [cq+32*1], q1230
vpermq m2, [cq+32*2], q2103
call m(iadst_4x16_internal).main2
- pshufd m2, m2, q1032
+ call m(iadst_4x16_internal).main_pass1_end
punpcklwd m4, m3, m1
punpcklwd m5, m2, m0
punpckhwd m0, m1
@@ -2276,13 +2384,15 @@
RET
ALIGN function_align
.main:
+ vpbroadcastd m6, [o(pw_m3344_3344)]
vpbroadcastd m7, [o(pw_3803_1321)]
vpbroadcastd m8, [o(pw_m1321_2482)]
vpbroadcastd m9, [o(pw_2482_3344)]
punpcklwd m4, m2, m0 ; in2 in0 l
- psubw m6, m0, m2
punpckhwd m2, m0 ; in2 in0 h
- paddw m6, m3 ; t2
+ psrld m5, m6, 16
+ pmaddwd m10, m6, m4 ; t2:02 l
+ pmaddwd m6, m2 ; t2:02 h
pmaddwd m0, m7, m4 ; t0:02 l
pmaddwd m7, m2 ; t0:02 h
pmaddwd m4, m8 ; t1:02 l
@@ -2289,7 +2399,11 @@
pmaddwd m8, m2 ; t1:02 h
punpckhwd m2, m3, m1 ; in3 in1 h
punpcklwd m3, m1 ; in3 in1 l
+ pmaddwd m1, m5, m2 ; t2:3 h
+ pmaddwd m5, m3 ; t2:3 l
+ paddd m6, m1
vpbroadcastd m1, [o(pd_2048)]
+ paddd m10, m5
pmaddwd m5, m9, m3
pmaddwd m9, m2
paddd m0, m1
@@ -2299,6 +2413,8 @@
vpbroadcastd m9, [o(pw_m3803_3344)]
pmaddwd m5, m9, m2
pmaddwd m9, m3
+ paddd m10, m1 ; t2 + 2048 l
+ paddd m6, m1 ; t2 + 2048 h
paddd m5, m1 ; t1:13 + 2048 h
paddd m1, m9 ; t1:13 + 2048 l
vpbroadcastd m9, [o(pw_m3803_m6688)]
@@ -2310,12 +2426,11 @@
paddd m4, m0
paddd m2, m8 ; t0 + t1 - t3 + 2048 h
paddd m3, m4 ; t0 + t1 - t3 + 2048 l
- REPX {psrad x, 12}, m0, m7, m5, m1, m2, m3
+ REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3
packssdw m0, m7
packssdw m1, m5
packssdw m3, m2
- vpbroadcastd m2, [o(pw_3344x8)]
- pmulhrsw m2, m6
+ packssdw m2, m10, m6
ret
INV_TXFM_16X4_FN flipadst, dct
@@ -2329,7 +2444,7 @@
vpermq m1, [cq+32*1], q1230
vpermq m2, [cq+32*2], q2103
call m(iadst_4x16_internal).main2
- pshufd m2, m2, q1032
+ call m(iadst_4x16_internal).main_pass1_end
punpckhwd m4, m3, m2
punpckhwd m5, m1, m0
punpcklwd m0, m2
@@ -2552,7 +2667,7 @@
cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_16X8_LOAD_COEFS 1302
call m(iadst_8x16_internal).main2
- vpbroadcastd m10, [o(pw_16384)]
+ call m(iadst_8x16_internal).main_pass1_end
psubw m11, m9, m10
punpcklwd m8, m0, m2
punpckhwd m0, m2
@@ -2567,7 +2682,7 @@
ALIGN function_align
.pass2:
call .main
- vpbroadcastd m9, [o(pw_2048)]
+ call .main_pass2_end
pxor m8, m8
psubw m8, m9
REPX {pmulhrsw x, m9}, m0, m2, m4, m6
@@ -2591,7 +2706,6 @@
ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
psubsw m9, m6, m8 ; t7
paddsw m6, m8 ; out6
- vpbroadcastd m8, [o(pw_2896x8)]
psubsw m3, m7, m5 ; t3
paddsw m7, m5 ; -out7
psubsw m5, m0, m2 ; t2
@@ -2598,6 +2712,35 @@
paddsw m0, m2 ; out0
psubsw m2, m1, m4 ; t6
paddsw m1, m4 ; -out1
+ ret
+ALIGN function_align
+.main_pass1_end:
+ vpbroadcastd m11, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ punpckhwd m4, m3, m5
+ punpcklwd m3, m5
+ pmaddwd m5, m11, m4
+ pmaddwd m4, m12
+ pmaddwd m8, m11, m3
+ pmaddwd m3, m12
+ REPX {paddd x, m10}, m5, m4, m8, m3
+ REPX {psrad x, 12 }, m5, m8, m4, m3
+ packssdw m3, m4 ; -out3
+ packssdw m4, m8, m5 ; out4
+ punpcklwd m5, m9, m2
+ punpckhwd m9, m2
+ pmaddwd m2, m12, m5
+ pmaddwd m5, m11
+ pmaddwd m12, m9
+ pmaddwd m11, m9
+ REPX {paddd x, m10}, m2, m5, m12, m11
+ REPX {psrad x, 12 }, m2, m12, m5, m11
+ packssdw m2, m12 ; out2
+ packssdw m5, m11 ; -out5
+ ret
+ALIGN function_align
+.main_pass2_end:
+ vpbroadcastd m8, [o(pw_2896x8)]
psubsw m4, m5, m3
paddsw m3, m5
psubsw m5, m2, m9
@@ -2606,6 +2749,7 @@
pmulhrsw m3, m8 ; -out3
pmulhrsw m4, m8 ; out4
pmulhrsw m5, m8 ; -out5
+ vpbroadcastd m9, [o(pw_2048)]
ret
INV_TXFM_16X8_FN flipadst, dct
@@ -2616,7 +2760,7 @@
cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_16X8_LOAD_COEFS 1302
call m(iadst_8x16_internal).main2
- vpbroadcastd m10, [o(pw_16384)]
+ call m(iadst_8x16_internal).main_pass1_end
psubw m9, m10
punpcklwd m8, m6, m4
punpckhwd m6, m4
@@ -2655,7 +2799,7 @@
jmp tx2q
.pass2:
call m(iadst_16x8_internal).main
- vpbroadcastd m9, [o(pw_2048)]
+ call m(iadst_16x8_internal).main_pass2_end
pxor m8, m8
psubw m8, m9
pmulhrsw m10, m7, m8
@@ -2986,8 +3130,12 @@
cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ITX_16X16_LOAD_COEFS
call .main
- vpbroadcastd m1, [o(pw_8192)]
- REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+ call .main_pass1_end
+ pmulhrsw m0, m1, [cq+32*0]
+ pmulhrsw m2, m1, [cq+32*1]
+ REPX {pmulhrsw x, m1}, m4, m6, m8, m10
+ pmulhrsw m12, m1, [cq+32*2]
+ pmulhrsw m14, m1, [cq+32*3]
vextracti128 [rsp+16*5], m8, 1
mova [rsp+16*1], xm8
pxor m8, m8
@@ -2996,7 +3144,7 @@
ALIGN function_align
.pass2:
call .main
- vpbroadcastd m1, [o(pw_2048)]
+ call .main_pass2_end
REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
mova [rsp+32*0], m6
pxor m6, m6
@@ -3081,6 +3229,62 @@
paddsw m0, m12 ; out0
paddsw m12, m8, m5 ; out12
psubsw m8, m5 ; t7
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova [cq+32*0], m0
+ mova [cq+32*1], m2
+ mova [cq+32*2], m12
+ mova [cq+32*3], m14
+ vpbroadcastd m14, [pw_m2896_2896]
+ vpbroadcastd m12, [pw_2896_2896]
+ vpbroadcastd m2, [pd_2048]
+ punpcklwd m5, m11, m10
+ punpckhwd m11, m10
+ pmaddwd m10, m14, m5
+ pmaddwd m0, m14, m11
+ pmaddwd m5, m12
+ pmaddwd m11, m12
+ REPX {paddd x, m2}, m10, m0, m5, m11
+ REPX {psrad x, 12}, m10, m0, m5, m11
+ packssdw m10, m0 ; out10
+ packssdw m5, m11 ; -out5
+ punpcklwd m11, m8, m4
+ punpckhwd m8, m4
+ pmaddwd m4, m12, m11
+ pmaddwd m0, m12, m8
+ pmaddwd m11, m14
+ pmaddwd m8, m14
+ REPX {paddd x, m2}, m4, m0, m11, m8
+ REPX {psrad x, 12}, m4, m0, m11, m8
+ packssdw m4, m0 ; out4
+ packssdw m11, m8 ; -out11
+ punpcklwd m8, m9, m7
+ punpckhwd m9, m7
+ pmaddwd m7, m12, m8
+ pmaddwd m0, m12, m9
+ pmaddwd m8, m14
+ pmaddwd m9, m14
+ REPX {paddd x, m2}, m7, m0, m8, m9
+ REPX {psrad x, 12}, m7, m0, m8, m9
+ packssdw m7, m0 ; -out7
+ packssdw m8, m9 ; out8
+ punpckhwd m0, m6, m1
+ punpcklwd m6, m1
+ pmaddwd m1, m14, m0
+ pmaddwd m9, m14, m6
+ pmaddwd m0, m12
+ pmaddwd m6, m12
+ REPX {paddd x, m2}, m1, m9, m0, m6
+ REPX {psrad x, 12}, m1, m9, m0, m6
+ packssdw m9, m1 ; -out9
+ packssdw m6, m0 ; out6
+ vpbroadcastd m1, [o(pw_8192)]
+ ret
+ALIGN function_align
+.main_pass2_end:
+ ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
+ ; 16-bit here will produce the same result as using 32-bit intermediates.
paddsw m5, m10, m11 ; -out5
psubsw m10, m11 ; out10
psubsw m11, m4, m8 ; -out11
@@ -3091,6 +3295,7 @@
paddsw m6, m1 ; out6
vpbroadcastd m1, [o(pw_2896x8)]
REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
+ vpbroadcastd m1, [o(pw_2048)]
ret
INV_TXFM_16X16_FN flipadst, dct
@@ -3100,16 +3305,16 @@
cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ITX_16X16_LOAD_COEFS
call m(iadst_16x16_internal).main
- vpbroadcastd m1, [o(pw_8192)]
+ call m(iadst_16x16_internal).main_pass1_end
pmulhrsw m6, m1
+ pmulhrsw m2, m1, m8
mova [rsp+32*2], m6
pmulhrsw m6, m1, m4
pmulhrsw m4, m1, m10
- pmulhrsw m10, m1, m12
- pmulhrsw m12, m1, m2
- pmulhrsw m2, m1, m8
- pmulhrsw m8, m1, m14
- pmulhrsw m14, m1, m0
+ pmulhrsw m8, m1, [cq+32*3]
+ pmulhrsw m10, m1, [cq+32*2]
+ pmulhrsw m12, m1, [cq+32*1]
+ pmulhrsw m14, m1, [cq+32*0]
pxor m0, m0
psubw m0, m1
REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15
@@ -3136,7 +3341,7 @@
jmp m(idct_16x16_internal).pass1_end3
.pass2:
call m(iadst_16x16_internal).main
- vpbroadcastd m1, [o(pw_2048)]
+ call m(iadst_16x16_internal).main_pass2_end
pmulhrsw m0, m1
pmulhrsw m8, m1
mova [rsp+32*0], m0
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -43,8 +43,11 @@
pw_2482_m1321: times 4 dw 2482, -1321
pw_3344_2482: times 4 dw 3344, 2482
pw_3344_m3803: times 4 dw 3344, -3803
+pw_3344_m3344: times 4 dw 3344, -3344
+pw_0_3344: times 4 dw 0, 3344
pw_m6688_m3803: times 4 dw -6688, -3803
+COEF_PAIR 2896, 2896
COEF_PAIR 1567, 3784
COEF_PAIR 799, 4017
COEF_PAIR 3406, 2276
@@ -126,7 +129,6 @@
pw_4085x8: times 8 dw 4085*8
pw_m301x8: times 8 dw -301*8
-
iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424
iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
@@ -200,7 +202,6 @@
ret
%endmacro
-
; flags: 1 = swap, 2: coef_regs
%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
%if %6 & 2
@@ -239,35 +240,6 @@
paddsw m0, m2 ;high: out1 ;low: out0
%endmacro
-
-%macro IADST4_1D_PACKED 0
- punpcklwd m2, m0, m1 ;unpacked in0 in2
- punpckhwd m3, m0, m1 ;unpacked in1 in3
- psubw m0, m1
- punpckhqdq m1, m1 ;
- paddw m1, m0 ;low: in0 - in2 + in3
-
- pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
- pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
- pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
- pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
- paddd m4, m0 ;t0 + t3
- pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
- pmulhrsw m1, [o(pw_3344x8)] ;low: out2
- mova m0, [o(pd_2048)]
- paddd m2, m0
- paddd m0, m4 ;t0 + t3 + 2048
- paddd m5, m2 ;t1 + t3 + 2048
- paddd m2, m4
- paddd m2, m3 ;t0 + t1 - t3 + 2048
-
- psrad m0, 12 ;out0
- psrad m5, 12 ;out1
- psrad m2, 12 ;out3
- packssdw m0, m5 ;high: out1 ;low: out0
- packssdw m2, m2 ;high: out3 ;low: out3
-%endmacro
-
%macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack
cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
%undef cmp
@@ -392,15 +364,14 @@
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
call .main
- punpckhwd m3, m0, m2
+ punpckhwd m2, m0, m1
punpcklwd m0, m1
- punpckhwd m1, m0, m3 ;high: in3 ;low :in2
- punpcklwd m0, m3 ;high: in1 ;low: in0
+ punpckhwd m1, m0, m2 ;high: in3 ;low :in2
+ punpcklwd m0, m2 ;high: in1 ;low: in0
jmp tx2q
.pass2:
call .main
- punpcklqdq m1, m2 ;out2 out3
.end:
pxor m2, m2
@@ -412,7 +383,28 @@
ALIGN function_align
.main:
- IADST4_1D_PACKED
+ punpcklwd m2, m0, m1 ;unpacked in0 in2
+ punpckhwd m0, m1 ;unpacked in1 in3
+ mova m3, m0
+ pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2
+ pmaddwd m0, [o(pw_0_3344)] ;3344 * in3
+ paddd m1, m0 ;t2
+ pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
+ paddd m4, m0 ;t0 + t3
+ pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+ mova m0, [o(pd_2048)]
+ paddd m1, m0 ;t2 + 2048
+ paddd m2, m0
+ paddd m0, m4 ;t0 + t3 + 2048
+ paddd m5, m2 ;t1 + t3 + 2048
+ paddd m2, m4
+ paddd m2, m3 ;t0 + t1 - t3 + 2048
+ REPX {psrad x, 12}, m1, m0, m5, m2
+ packssdw m0, m5 ;high: out1 ;low: out0
+ packssdw m1, m2 ;high: out3 ;low: out2
ret
INV_TXFM_4X4_FN flipadst, dct, 0
@@ -424,16 +416,14 @@
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
call m(iadst_4x4_internal).main
- punpcklwd m1, m0
- punpckhwd m2, m0
- punpcklwd m0, m2, m1 ;high: in3 ;low :in2
- punpckhwd m2, m1 ;high: in1 ;low: in0
- mova m1, m2
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2 ;high: in3 ;low :in2
+ punpckhwd m1, m2 ;high: in1 ;low: in0
jmp tx2q
.pass2:
call m(iadst_4x4_internal).main
- punpcklqdq m1, m2 ;out2 out3
.end:
pxor m2, m2
@@ -584,99 +574,6 @@
mova m%4, m%5
%endmacro
-%macro IADST4_1D 0
- mova m4, m2
- psubw m2, m0, m4
- paddw m2, m3 ;low: in0 - in2 + in3
-
- punpckhwd m6, m0, m4 ;unpacked in0 in2
- punpckhwd m7, m1, m3 ;unpacked in1 in3
- punpcklwd m0, m4 ;unpacked in0 in2
- punpcklwd m1, m3 ;unpacked in1 in3
-
- pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
- pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
- pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
- pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
- paddd m3, m4 ;t0 + t3
-
- pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
- pmulhrsw m2, [o(pw_3344x8)] ;out2
- mova m4, [o(pd_2048)]
- paddd m0, m4
- paddd m4, m3 ;t0 + t3 + 2048
- paddd m5, m0 ;t1 + t3 + 2048
- paddd m3, m0
- paddd m3, m1 ;t0 + t1 - t3 + 2048
-
- psrad m4, 12 ;out0
- psrad m5, 12 ;out1
- psrad m3, 12 ;out3
- packssdw m0, m4, m5 ;low: out0 high: out1
-
- pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
- pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
- pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
- pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
- paddd m1, m4 ;t0 + t3
- pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
-
- mova m4, [o(pd_2048)]
- paddd m6, m4
- paddd m4, m1 ;t0 + t3 + 2048
- paddd m5, m6 ;t1 + t3 + 2048
- paddd m1, m6
- paddd m1, m7 ;t0 + t1 - t3 + 2048
-
- psrad m4, 12 ;out0
- psrad m5, 12 ;out1
- psrad m1, 12 ;out3
- packssdw m3, m1 ;out3
- packssdw m4, m5 ;low: out0 high: out1
-
- punpckhqdq m1, m0, m4 ;out1
- punpcklqdq m0, m4 ;out0
-%endmacro
-
-%macro IADST8_1D_PACKED 0
- mova m6, [o(pd_2048)]
- punpckhwd m4, m3, m0 ;unpacked in7 in0
- punpckhwd m5, m2, m1 ;unpacked in5 in2
- punpcklwd m1, m2 ;unpacked in3 in4
- punpcklwd m0, m3 ;unpacked in1 in6
- ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a
- ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a
- ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a
- ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a
-
- psubsw m3, m4, m1 ;low: t4 high: t5
- paddsw m4, m1 ;low: t0 high: t1
- psubsw m2, m5, m0 ;low: t6 high: t7
- paddsw m5, m0 ;low: t2 high: t3
-
- shufps m1, m3, m2, q1032
- punpckhwd m2, m1
- punpcklwd m3, m1
- ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a
- ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a
-
- psubsw m1, m4, m5 ;low: t2 high: t3
- paddsw m4, m5 ;low: out0 high: -out7
- psubsw m5, m3, m2 ;low: t7 high: t6
- paddsw m3, m2 ;low: out6 high: -out1
- shufps m0, m4, m3, q3210 ;low: out0 high: -out1
- shufps m3, m4, q3210 ;low: out6 high: -out7
-
- shufps m4, m1, m5, q1032 ;low: t3 high: t7
- shufps m1, m5, q3210 ;low: t2 high: t6
- mova m5, [o(pw_2896x8)]
- psubsw m2, m1, m4 ;low: t2-t3 high: t6-t7
- paddsw m1, m4 ;low: t2+t3 high: t6+t7
- pmulhrsw m2, m5 ;low: out4 high: -out5
- shufps m1, m1, q1032
- pmulhrsw m1, m5 ;low: out2 high: -out3
-%endmacro
-
%macro WRITE_4X8 4 ;row[1-4]
WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4
lea dstq, [dstq+strideq*4]
@@ -838,7 +735,48 @@
ALIGN function_align
.main:
- IADST8_1D_PACKED
+ mova m6, [o(pd_2048)]
+ punpckhwd m4, m3, m0 ;unpacked in7 in0
+ punpckhwd m5, m2, m1 ;unpacked in5 in2
+ punpcklwd m1, m2 ;unpacked in3 in4
+ punpcklwd m0, m3 ;unpacked in1 in6
+ ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a
+ ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a
+ ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a
+ ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a
+
+ psubsw m3, m4, m1 ;low: t4 high: t5
+ paddsw m4, m1 ;low: t0 high: t1
+ psubsw m2, m5, m0 ;low: t6 high: t7
+ paddsw m5, m0 ;low: t2 high: t3
+
+ shufps m1, m3, m2, q1032
+ punpckhwd m2, m1
+ punpcklwd m3, m1
+ ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a
+ ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a
+
+ psubsw m1, m4, m5 ;low: t2 high: t3
+ paddsw m4, m5 ;low: out0 high: -out7
+ psubsw m5, m3, m2 ;low: t7 high: t6
+ paddsw m3, m2 ;low: out6 high: -out1
+ shufps m0, m4, m3, q3210 ;low: out0 high: -out1
+ shufps m3, m4, q3210 ;low: out6 high: -out7
+
+ mova m2, [o(pw_2896_m2896)]
+ mova m7, [o(pw_2896_2896)]
+ shufps m4, m1, m5, q1032 ;low: t3 high: t7
+ shufps m1, m5, q3210 ;low: t2 high: t6
+ punpcklwd m5, m1, m4
+ punpckhwd m1, m4
+ pmaddwd m4, m2, m1 ;-out5
+ pmaddwd m2, m5 ; out4
+ pmaddwd m1, m7 ; out2
+ pmaddwd m5, m7 ;-out3
+ REPX {paddd x, m6}, m4, m2, m1, m5
+ REPX {psrad x, 12}, m4, m2, m1, m5
+ packssdw m1, m5 ;low: out2 high: -out3
+ packssdw m2, m4 ;low: out4 high: -out5
ret
INV_TXFM_4X8_FN flipadst, dct, 0
@@ -1109,7 +1047,67 @@
ALIGN function_align
.main:
- IADST4_1D
+ punpckhwd m6, m0, m2 ;unpacked in0 in2
+ punpcklwd m0, m2 ;unpacked in0 in2
+ punpckhwd m7, m1, m3 ;unpacked in1 in3
+ punpcklwd m1, m3 ;unpacked in1 in3
+
+ mova m2, [o(pw_3344_m3344)]
+ mova m4, [o(pw_0_3344)]
+ pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2
+ pmaddwd m5, m4, m7 ;3344 * in3
+ pmaddwd m2, m0
+ pmaddwd m4, m1
+ paddd m3, m5
+ paddd m2, m4
+ mova m4, [o(pd_2048)]
+ paddd m3, m4 ;t2 + 2048
+ paddd m2, m4
+ psrad m3, 12
+ psrad m2, 12
+ packssdw m2, m3 ;out2
+
+ pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
+ paddd m3, m4 ;t0 + t3
+
+ pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+ mova m4, [o(pd_2048)]
+ paddd m0, m4
+ paddd m4, m3 ;t0 + t3 + 2048
+ paddd m5, m0 ;t1 + t3 + 2048
+ paddd m3, m0
+ paddd m3, m1 ;t0 + t1 - t3 + 2048
+
+ psrad m4, 12 ;out0
+ psrad m5, 12 ;out1
+ psrad m3, 12 ;out3
+ packssdw m0, m4, m5 ;low: out0 high: out1
+
+ pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
+ paddd m1, m4 ;t0 + t3
+ pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+
+ mova m4, [o(pd_2048)]
+ paddd m6, m4
+ paddd m4, m1 ;t0 + t3 + 2048
+ paddd m5, m6 ;t1 + t3 + 2048
+ paddd m1, m6
+ paddd m1, m7 ;t0 + t1 - t3 + 2048
+
+ psrad m4, 12 ;out0
+ psrad m5, 12 ;out1
+ psrad m1, 12 ;out3
+ packssdw m3, m1 ;out3
+ packssdw m4, m5 ;low: out0 high: out1
+
+ punpckhqdq m1, m0, m4 ;out1
+ punpcklqdq m0, m4 ;out0
ret
INV_TXFM_8X4_FN flipadst, dct
@@ -1423,6 +1421,7 @@
.pass1:
call .main
+ call .main_pass1_end
.pass1_end:
mova m7, [o(pw_16384)]
@@ -1441,6 +1440,7 @@
.pass2_main:
call .main
+ call .main_pass2_end
.end:
mova m7, [o(pw_2048)]
@@ -1491,6 +1491,53 @@
psubsw m5, m6 ;t6
paddsw m6, m2, m7 ;out6
psubsw m2, m7 ;t7
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova [rsp+gprsize*2+16*1], m1
+ mova [rsp+gprsize*2+16*2], m6
+ punpckhwd m1, m4, m3
+ punpcklwd m4, m3
+ punpckhwd m7, m5, m2
+ punpcklwd m5, m2
+ mova m2, [o(pw_2896_2896)]
+ mova m6, [o(pd_2048)]
+ pmaddwd m3, m2, m7
+ pmaddwd m2, m5
+ paddd m3, m6
+ paddd m2, m6
+ psrad m3, 12
+ psrad m2, 12
+ packssdw m2, m3 ;out2
+ mova m3, [o(pw_2896_m2896)]
+ pmaddwd m7, m3
+ pmaddwd m5, m3
+ paddd m7, m6
+ paddd m5, m6
+ psrad m7, 12
+ psrad m5, 12
+ packssdw m5, m7 ;-out5
+ mova m3, [o(pw_2896_2896)]
+ pmaddwd m7, m3, m1
+ pmaddwd m3, m4
+ paddd m7, m6
+ paddd m3, m6
+ psrad m7, 12
+ psrad m3, 12
+ packssdw m3, m7 ;-out3
+ mova m7, [o(pw_2896_m2896)]
+ pmaddwd m1, m7
+ pmaddwd m4, m7
+ paddd m1, m6
+ paddd m4, m6
+ psrad m1, 12
+ psrad m4, 12
+ packssdw m4, m1 ;out4
+ mova m1, [rsp+gprsize*2+16*1]
+ mova m6, [rsp+gprsize*2+16*2]
+ ret
+ALIGN function_align
+.main_pass2_end:
paddsw m7, m4, m3 ;t2 + t3
psubsw m4, m3 ;t2 - t3
paddsw m3, m5, m2 ;t6 + t7
@@ -1513,6 +1560,7 @@
.pass1:
call m(iadst_8x8_internal).main
+ call m(iadst_8x8_internal).main_pass1_end
.pass1_end:
mova m7, [o(pw_m16384)]
@@ -1542,6 +1590,7 @@
.pass2_main:
call m(iadst_8x8_internal).main
+ call m(iadst_8x8_internal).main_pass2_end
.end:
mova m7, [o(pw_2048)]
@@ -1753,6 +1802,7 @@
.pass2:
call m(iadst_16x4_internal).main
+ call m(iadst_16x4_internal).main_pass2_end
punpcklqdq m6, m5, m4 ;low: -out5 high: -out7
punpckhqdq m4, m5 ;low: out8 high: out10
@@ -1820,6 +1870,7 @@
.pass2:
call m(iadst_16x4_internal).main
+ call m(iadst_16x4_internal).main_pass2_end
punpckhqdq m6, m5, m4 ;low: out5 high: out7
punpcklqdq m4, m5 ;low: -out8 high: -out10
@@ -2160,6 +2211,7 @@
cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_7ROWS coeffq, 16
call .main
+ call .main_pass1_end
punpckhwd m6, m7, m0 ;packed -out11, -out15
punpcklwd m0, m7 ;packed out0, out4
@@ -2193,69 +2245,65 @@
ALIGN function_align
.main:
mova [coeffq+16*6], m0
- pshufd m1, m1, q1032
+ pshufd m0, m1, q1032
pshufd m2, m2, q1032
- punpckhwd m0, m6, m1 ;packed in13, in2
- punpcklwd m1, m6 ;packed in3, in12
- punpckhwd m6, m5, m2 ;packed in11, in4
+ punpckhwd m1, m6, m0 ;packed in13, in2
+ punpcklwd m0, m6 ;packed in3, in12
+ punpckhwd m7, m5, m2 ;packed in11, in4
punpcklwd m2, m5 ;packed in5, in10
- mova m7, [o(pd_2048)]
- ITX_MUL2X_PACK 0, 5, 7, 995, 3973 ;low:t2 high:t3
- ITX_MUL2X_PACK 6, 5, 7, 1751, 3703 ;low:t4 high:t5
- ITX_MUL2X_PACK 2, 5, 7, 3513, 2106 ;low:t10 high:t11
- ITX_MUL2X_PACK 1, 5, 7, 3857, 1380 ;low:t12 high:t13
- psubsw m5, m0, m2 ;low:t10a high:t11a
- paddsw m0, m2 ;low:t2a high:t3a
- psubsw m2, m6, m1 ;low:t12a high:t13a
- paddsw m6, m1 ;low:t4a high:t5a
- punpcklqdq m1, m5
- punpckhwd m1, m5 ;packed t10a, t11a
+ mova m6, [o(pd_2048)]
+ ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3
+ ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5
+ ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11
+ ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13
+ psubsw m5, m1, m2 ;low:t10a high:t11a
+ paddsw m1, m2 ;low:t2a high:t3a
+ psubsw m2, m7, m0 ;low:t12a high:t13a
+ paddsw m7, m0 ;low:t4a high:t5a
+ punpcklqdq m0, m5
+ punpckhwd m0, m5 ;packed t10a, t11a
punpcklqdq m5, m2
punpckhwd m2, m5 ;packed t13a, t12a
- ITX_MUL2X_PACK 1, 5, 7, 3406, 2276 ;low:t10 high:t11
- ITX_MUL2X_PACK 2, 5, 7, 4017, 799, 1 ;low:t12 high:t13
- mova [coeffq+16*4], m0
- mova [coeffq+16*5], m6
- mova m0, [coeffq+16*6]
- mova m6, [coeffq+16*7]
- pshufd m0, m0, q1032
+ ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11
+ ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13
+ mova [coeffq+16*4], m1
+ mova [coeffq+16*5], m7
+ mova m1, [coeffq+16*6]
+ mova m7, [coeffq+16*7]
+ pshufd m1, m1, q1032
pshufd m3, m3, q1032
- punpckhwd m5, m6, m0 ;packed in15, in0
- punpcklwd m0, m6 ;packed in1, in14
- punpckhwd m6, m4, m3 ;packed in9, in6
+ punpckhwd m5, m7, m1 ;packed in15, in0
+ punpcklwd m1, m7 ;packed in1, in14
+ punpckhwd m7, m4, m3 ;packed in9, in6
punpcklwd m3, m4 ;packed in7, in8
- ITX_MUL2X_PACK 5, 4, 7, 201, 4091 ;low:t0 high:t1
- ITX_MUL2X_PACK 6, 4, 7, 2440, 3290 ;low:t6 high:t7
- ITX_MUL2X_PACK 3, 4, 7, 3035, 2751 ;low:t8 high:t9
- ITX_MUL2X_PACK 0, 4, 7, 4052, 601 ;low:t14 high:t15
+ ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1
+ ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7
+ ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9
+ ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15
psubsw m4, m5, m3 ;low:t8a high:t9a
paddsw m5, m3 ;low:t0a high:t1a
- psubsw m3, m6, m0 ;low:t14a high:t15a
- paddsw m6, m0 ;low:t6a high:t7a
- punpcklqdq m0, m4
- punpckhwd m0, m4 ;packed t8a, t9a
+ psubsw m3, m7, m1 ;low:t14a high:t15a
+ paddsw m7, m1 ;low:t6a high:t7a
+ punpcklqdq m1, m4
+ punpckhwd m1, m4 ;packed t8a, t9a
punpcklqdq m4, m3
punpckhwd m3, m4 ;packed t15a, t14a
- ITX_MUL2X_PACK 0, 4, 7, 799, 4017 ;low:t8 high:t9
- ITX_MUL2X_PACK 3, 4, 7, 2276, 3406, 1 ;low:t14 high:t15
- psubsw m4, m0, m2 ;low:t12a high:t13a
- paddsw m0, m2 ;low:t8a high:t9a
- psubsw m2, m1, m3 ;low:t14a high:t15a
- paddsw m1, m3 ;low:t10a high:t11a
- punpcklqdq m3, m4
- punpckhwd m3, m4 ;packed t12a, t13a
- punpcklqdq m4, m2
- punpckhwd m2, m4 ;packed t15a, t14a
- ITX_MUL2X_PACK 3, 4, 7, 1567, 3784 ;low:t12 high:t13
- ITX_MUL2X_PACK 2, 4, 7, 3784, 1567, 1 ;low:t14 high:t15
- psubsw m4, m0, m1 ;low:t10 high:t11
- paddsw m0, m1 ;low:-out1 high:out14
+ ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9
+ ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15
+ paddsw m4, m1, m2 ;low:t8a high:t9a
+ psubsw m1, m2 ;low:t12a high:t13a
+ psubsw m2, m0, m3 ;low:t14a high:t15a
+ paddsw m0, m3 ;low:t10a high:t11a
+ punpcklqdq m3, m1
+ punpckhwd m3, m1 ;packed t12a, t13a
+ punpcklqdq m1, m2
+ punpckhwd m2, m1 ;packed t15a, t14a
+ ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13
+ ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15
psubsw m1, m3, m2 ;low:t14a high:t15a
paddsw m3, m2 ;low:out2 high:-out13
- punpckhqdq m2, m4, m1 ;low:t11 high:t15a
- punpcklqdq m4, m1 ;low:t10 high:t14a
- psubsw m1, m4, m2
- paddsw m2, m4
+ psubsw m2, m4, m0 ;low:t10 high:t11
+ paddsw m0, m4 ;low:-out1 high:out14
mova [coeffq+16*6], m0
mova [coeffq+16*7], m3
mova m0, [coeffq+16*4]
@@ -2262,19 +2310,68 @@
mova m3, [coeffq+16*5]
psubsw m4, m5, m3 ;low:t4 high:t5
paddsw m5, m3 ;low:t0 high:t1
- psubsw m3, m0 ,m6 ;low:t6 high:t7
- paddsw m0, m6 ;low:t2 high:t3
- punpcklqdq m6, m4
- punpckhwd m6, m4 ;packed t4, t5
+ psubsw m3, m0, m7 ;low:t6 high:t7
+ paddsw m0, m7 ;low:t2 high:t3
+ punpcklqdq m7, m4
+ punpckhwd m7, m4 ;packed t4, t5
punpcklqdq m4, m3
punpckhwd m3, m4 ;packed t7, t6
- ITX_MUL2X_PACK 6, 4, 7, 1567, 3784 ;low:t4a high:t5a
- ITX_MUL2X_PACK 3, 4, 7, 3784, 1567, 1 ;low:t6a high:t7a
+ ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a
+ ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a
psubsw m4, m5, m0 ;low:t2a high:t3a
paddsw m0, m5 ;low:out0 high:-out15
- psubsw m5, m6, m3 ;low:t6 high:t7
- paddsw m3, m6 ;low:-out3 high:out12
+ psubsw m5, m7, m3 ;low:t6 high:t7
+ paddsw m3, m7 ;low:-out3 high:out12
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova m7, [o(deint_shuf1)]
+ mova [coeffq+16*4], m0
+ mova [coeffq+16*5], m3
+ mova m0, [o(pw_2896_m2896)]
+ mova m3, [o(pw_2896_2896)]
+ pshufb m1, m7 ;t14a t15a
+ pshufb m2, m7 ;t10 t11
+ pshufb m4, m7 ;t2a t3a
+ pshufb m5, m7 ;t6 t7
+ pmaddwd m7, m0, m2
+ pmaddwd m2, m3
+ paddd m7, m6
+ paddd m2, m6
+ psrad m7, 12
+ psrad m2, 12
+ packssdw m2, m7 ;low:out6 high:-out9
+ pmaddwd m7, m0, m4
+ pmaddwd m4, m3
+ paddd m7, m6
+ paddd m4, m6
+ psrad m7, 12
+ psrad m4, 12
+ packssdw m4, m7 ;low:-out7 high:out8
+ pmaddwd m7, m3, m5
+ pmaddwd m5, m0
+ paddd m7, m6
+ paddd m5, m6
+ psrad m7, 12
+ psrad m5, 12
+ packssdw m7, m5 ;low:out4 high:-out11
+ pmaddwd m5, m3, m1
+ pmaddwd m1, m0
+ paddd m5, m6
+ paddd m1, m6
+ psrad m5, 12
+ psrad m1, 12
+ packssdw m5, m1 ;low:-out5 high:out10
+ mova m0, [coeffq+16*4]
+ mova m3, [coeffq+16*5]
+ ret
+ALIGN function_align
+.main_pass2_end:
mova m7, [o(pw_2896x8)]
+ punpckhqdq m6, m2, m1 ;low:t11 high:t15a
+ punpcklqdq m2, m1 ;low:t10 high:t14a
+ psubsw m1, m2, m6
+ paddsw m2, m6
punpckhqdq m6, m4, m5 ;low:t3a high:t7
punpcklqdq m4, m5 ;low:t2a high:t6
psubsw m5, m4, m6
@@ -2298,6 +2395,7 @@
cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_7ROWS coeffq, 16
call m(iadst_16x4_internal).main
+ call m(iadst_16x4_internal).main_pass1_end
punpcklwd m6, m7, m0 ;packed out11, out15
punpckhwd m0, m7 ;packed -out0, -out4
@@ -2360,7 +2458,7 @@
%endmacro
%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*12
+ INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*16
%ifidn %1_%2, dct_dct
pshuflw m0, [coeffq], q0000
punpcklwd m0, m0
@@ -2548,6 +2646,7 @@
mova m7, [coeffq+16*11]
call m(iadst_16x8_internal).main
+ call m(iadst_16x8_internal).main_pass2_end
mov r3, dstq
lea dstq, [dstq+strideq*8]
@@ -2599,6 +2698,7 @@
mova m7, [coeffq+16*11]
call m(iadst_16x8_internal).main
+ call m(iadst_16x8_internal).main_pass2_end
jmp m(iflipadst_8x8_internal).end
.end:
@@ -2652,7 +2752,7 @@
%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 16x8, 8, 16*12
+ INV_TXFM_FN %1, %2, %3, 16x8, 8, 16*16
%ifidn %1_%2, dct_dct
movd m1, [o(pw_2896x8)]
pmulhrsw m0, m1, [coeffq]
@@ -2893,6 +2993,7 @@
pmulhrsw m7, [coeffq+16*13]
call .main
+ call .main_pass1_end
mov r3, tx2q
lea tx2q, [o(m(iadst_16x8_internal).pass1_end)]
jmp m(iadst_8x8_internal).pass1_end
@@ -2998,23 +3099,15 @@
mova [rsp+gprsize*2+16*6], m3 ;-out3
psubsw m3, m0, m4 ;t7
paddsw m0, m4 ;out12
- mova m7, [o(pw_2896x8)]
- psubsw m4, m2, m3
- paddsw m2, m3
+ mova [rsp+gprsize*2+16*12], m3
mova m3, [rsp+gprsize*2+16*7] ;t3
- pmulhrsw m4, m7 ;-out11
- pmulhrsw m2, m7 ;out4
- mova [rsp+gprsize*2+16*7], m2 ;out4
+ mova [rsp+gprsize*2+16* 7], m2 ;out4
psubsw m2, m5, m3 ;t3a
paddsw m5, m3 ;-out15
- psubsw m3, m1, m2
- paddsw m1, m2
+ mova [rsp+gprsize*2+16*11], m2
mova m2, [rsp+gprsize*2+32*5] ;t15
- pmulhrsw m3, m7 ;out8
- pmulhrsw m1, m7 ;-out7
- mova [rsp+gprsize*2+32*5 ], m1 ;-out7
+ mova [rsp+gprsize*2+16*10], m1 ;-out7
mova m1, [rsp+gprsize*2+16*0] ;t11
- mova [rsp+gprsize*2+16*11], m3 ;out8
mova [rsp+gprsize*2+16*0 ], m5 ;-out15
mova m3, [rsp+gprsize*2+16*1] ;t10
mova [rsp+gprsize*2+16*1 ], m4 ;-out11
@@ -3044,26 +3137,106 @@
paddsw m2, m6 ;-out1
paddsw m6, m4, m1 ;out14
psubsw m4, m1 ;t11
- psubsw m1, m3, m4
- paddsw m3, m4
- pmulhrsw m1, m7 ;-out9
- pmulhrsw m3, m7 ;out6
- mova [rsp+gprsize*2+16*4], m2 ;-out1
+ mova [rsp+gprsize*2+16*14], m4
+ mova [rsp+gprsize*2+16* 4], m2 ;-out1
mova m4, [rsp+gprsize*2+16*8] ;t14
mova m2, [rsp+gprsize*2+16*9] ;t15
- mova [rsp+gprsize*2+16*9], m3 ;out6
+ mova [rsp+gprsize*2+16* 9], m3 ;out6
psubsw m3, m0, m4 ;t14a
paddsw m0, m4 ;out2
psubsw m4, m5, m2 ;t15a
paddsw m5, m2 ;-out13
+ mova [rsp+gprsize*2+16* 5], m0 ;out2
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova m0, [rsp+gprsize*2+16*14]
+ mova [rsp+gprsize*2+16*14], m5
+ mova [rsp+gprsize*2+16*15], m6
+ mova m5, [o(pw_2896_2896)]
+ mova m6, [o(pw_2896_m2896)]
+ mova m7, [o(pd_2048)]
+ punpcklwd m2, m3, m4
+ punpckhwd m3, m4
+ pmaddwd m4, m5, m2
+ pmaddwd m2, m6
+ pmaddwd m1, m5, m3
+ pmaddwd m3, m6
+ REPX {paddd x, m7}, m4, m2, m1, m3
+ REPX {psrad x, 12}, m4, m1, m2, m3
+ packssdw m4, m1 ;-out5
+ packssdw m2, m3 ;out10
+ mova [rsp+gprsize*2+16* 8], m4
+ mova m3, [rsp+gprsize*2+16* 9]
+ punpcklwd m1, m3, m0
+ punpckhwd m3, m0
+ pmaddwd m0, m5, m1
+ pmaddwd m1, m6
+ pmaddwd m4, m5, m3
+ pmaddwd m3, m6
+ REPX {paddd x, m7}, m0, m1, m4, m3
+ REPX {psrad x, 12}, m0, m4, m1, m3
+ packssdw m0, m4 ;out6
+ packssdw m1, m3 ;-out9
+ mova [rsp+gprsize*2+16* 9], m0
+ mova m0, [rsp+gprsize*2+16* 7]
+ mova m4, [rsp+gprsize*2+16*12]
+ punpcklwd m3, m0, m4
+ punpckhwd m0, m4
+ pmaddwd m4, m5, m3
+ pmaddwd m3, m6
+ pmaddwd m5, m0
+ pmaddwd m0, m6
+ REPX {paddd x, m7}, m4, m3, m5, m0
+ REPX {psrad x, 12}, m4, m5, m3, m0
+ packssdw m4, m5 ;out4
+ packssdw m3, m0 ;-out11
+ mova [rsp+gprsize*2+16* 7], m4
+ mova m4, [rsp+gprsize*2+16*10]
+ mova m5, [rsp+gprsize*2+16*11]
+ punpcklwd m0, m4, m5
+ punpckhwd m4, m5
+ pmaddwd m5, m0, [o(pw_2896_2896)]
+ pmaddwd m0, m6
+ pmaddwd m6, m4
+ pmaddwd m4, [o(pw_2896_2896)]
+ REPX {paddd x, m7}, m5, m0, m6, m4
+ REPX {psrad x, 12}, m0, m6, m5, m4
+ packssdw m0, m6 ;out8
+ packssdw m5, m4 ;-out7
+ mova [rsp+gprsize*2+16*10], m5
+ mova m4, [rsp+gprsize*2+16* 2] ;out12
+ mova m5, [rsp+gprsize*2+16*14] ;-out13
+ mova m6, [rsp+gprsize*2+16*15] ;out14
+ ret
+ALIGN function_align
+.main_pass2_end:
+ mova m7, [o(pw_2896x8)]
+ mova m1, [rsp+gprsize*2+16* 9]
+ mova m2, [rsp+gprsize*2+16*14]
+ paddsw m0, m1, m2
+ psubsw m1, m2
+ pmulhrsw m0, m7 ;out6
+ pmulhrsw m1, m7 ;-out9
+ mova [rsp+gprsize*2+16* 9], m0
psubsw m2, m3, m4
paddsw m3, m4
- mova [rsp+gprsize*2+16*5], m0 ;out2
- pmulhrsw m3, m7 ;-out5
pmulhrsw m2, m7 ;out10
- mova [rsp+gprsize*2+16*8], m3 ;-out5
- mova m0, [rsp+gprsize*2+16*11] ;out8
- mova m3, [rsp+gprsize*2+16*1 ] ;-out11
+ pmulhrsw m3, m7 ;-out5
+ mova [rsp+gprsize*2+16* 8], m3
+ mova m3, [rsp+gprsize*2+16* 7]
+ mova m4, [rsp+gprsize*2+16*12]
+ paddsw m0, m3, m4
+ psubsw m3, m4
+ pmulhrsw m0, m7 ;out4
+ pmulhrsw m3, m7 ;-out11
+ mova [rsp+gprsize*2+16* 7], m0
+ mova m0, [rsp+gprsize*2+16*10]
+ paddsw m4, m0, [rsp+gprsize*2+16*11]
+ psubsw m0, [rsp+gprsize*2+16*11]
+ pmulhrsw m4, m7 ;-out7
+ pmulhrsw m0, m7 ;out8
+ mova [rsp+gprsize*2+16*10], m4
mova m4, [rsp+gprsize*2+16*2 ] ;out12
ret
@@ -3100,6 +3273,7 @@
pmulhrsw m7, [coeffq+16*13]
call m(iadst_16x8_internal).main
+ call m(iadst_16x8_internal).main_pass1_end
mova m7, [rsp+gprsize+16*0]
SAVE_8ROWS coeffq+16*0, 32
@@ -3184,7 +3358,7 @@
%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 16x16, 8, 16*12
+ INV_TXFM_FN %1, %2, %3, 16x16, 8, 16*16
%ifidn %1_%2, dct_dct
movd m1, [o(pw_2896x8)]
pmulhrsw m0, m1, [coeffq]
@@ -3423,6 +3597,7 @@
cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_16X16_ADST_LOAD_ODD_COEFS
call m(iadst_16x8_internal).main
+ call m(iadst_16x8_internal).main_pass1_end
mov r3, tx2q
lea tx2q, [o(m(iadst_16x16_internal).pass1_end)]
@@ -3441,6 +3616,7 @@
SAVE_8ROWS coeffq+16*1, 32
ITX_16X16_ADST_LOAD_EVEN_COEFS
call m(iadst_16x8_internal).main
+ call m(iadst_16x8_internal).main_pass1_end
lea tx2q, [o(m(iadst_16x16_internal).pass1_end2)]
mova m7, [o(pw_8192)]
@@ -3496,6 +3672,7 @@
cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_16X16_ADST_LOAD_ODD_COEFS
call m(iadst_16x8_internal).main
+ call m(iadst_16x8_internal).main_pass1_end
mov r3, tx2q
lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end)]
@@ -3514,6 +3691,7 @@
SAVE_8ROWS coeffq+16*17, 32
ITX_16X16_ADST_LOAD_EVEN_COEFS
call m(iadst_16x8_internal).main
+ call m(iadst_16x8_internal).main_pass1_end
mova m7, [rsp+gprsize+16*0]
SAVE_8ROWS coeffq+16*0, 32