shithub: dav1d

Download patch

ref: a9315f5fde02530f64358375c3d2444a506b3a58
parent: e2702eaf5f13d5f93be75084a5bfecc77a67c001
author: Henrik Gramner <gramner@twoorioles.com>
date: Wed Sep 4 18:06:58 EDT 2019

x86: Increase precision of the final inverse ADST transform stages

16-bit precision is sufficient for the second pass, but the first pass
requires 32-bit precision to correctly handle some esoteric edge cases.

--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -47,9 +47,11 @@
 pw_3803_1321:   dw  3803,  1321
 pw_m1321_2482:  dw -1321,  2482
 pw_2482_3344:   dw  2482,  3344
+pw_m3344_3344:  dw -3344,  3344
 pw_m3803_3344:  dw -3803,  3344
 pw_m3803_m6688: dw -3803, -6688
-%define pw_3344x8 iadst4_dconly2b
+COEF_PAIR           2896,  2896
+pw_2896_m2896:  dw  2896, -2896
 
 pw_5:      times 2 dw 5
 pw_2048:   times 2 dw 2048
@@ -464,13 +466,15 @@
 %macro IADST4_1D_PACKED 0
     punpcklwd            m2, m1, m0
     punpckhwd            m3, m1, m0
-    psubw                m0, m1
-    punpckhqdq           m1, m1
-    paddw                m1, m0 ; in0 - in2 + in3
+    vpbroadcastd         m5, [o(pw_m3344_3344)]
     vpbroadcastd         m0, [o(pw_3803_1321)]
     vpbroadcastd         m4, [o(pw_m1321_2482)]
+    pmaddwd              m1, m5, m2 ; 3344*in3 - 3344*in2
+    psrld                m5, 16
     pmaddwd              m0, m2
     pmaddwd              m2, m4
+    pmaddwd              m5, m3 ; 3344*in0
+    paddd                m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3
     vpbroadcastd         m4, [o(pw_2482_3344)]
     vpbroadcastd         m5, [o(pw_m3803_3344)]
     pmaddwd              m4, m3
@@ -478,19 +482,16 @@
     paddd                m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
     vpbroadcastd         m0, [o(pw_m3803_m6688)]
     pmaddwd              m3, m0
-    vpbroadcastd         m0, [o(pw_3344x8)]
-    pmulhrsw             m1, m0 ; out2 ____
     vpbroadcastd         m0, [o(pd_2048)]
     paddd                m2, m0
+    paddd                m1, m0
     paddd                m0, m4
     paddd                m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
     paddd                m2, m4
     paddd                m2, m3
-    psrad                m0, 12
-    psrad                m5, 12
-    psrad                m2, 12
+    REPX      {psrad x, 12}, m1, m2, m0, m5
     packssdw             m0, m5 ; out0 out1
-    packssdw             m2, m2 ; out3 out3
+    packssdw             m1, m2 ; out2 out3
 %endmacro
 
 INV_TXFM_4X4_FN dct, dct,      0
@@ -524,7 +525,7 @@
     mova                 m0, [cq+16*0]
     mova                 m1, [cq+16*1]
     call .main
-    punpckhwd            m3, m0, m2
+    punpckhwd            m3, m0, m1
     punpcklwd            m0, m1
     punpckhwd            m1, m0, m3
     punpcklwd            m0, m3
@@ -531,7 +532,6 @@
     jmp                tx2q
 .pass2:
     call .main
-    vpblendd             m1, m1, m2, 0x0c ; out2 out3
 .end:
     pxor                 m2, m2
     mova          [cq+16*0], m2
@@ -552,14 +552,13 @@
     mova                 m0, [cq+16*0]
     mova                 m1, [cq+16*1]
     call m(iadst_4x4_internal).main
-    punpcklwd            m1, m0
-    punpckhwd            m2, m0
-    punpcklwd            m0, m2, m1
-    punpckhwd            m1, m2, m1
+    punpcklwd            m2, m1, m0
+    punpckhwd            m1, m0
+    punpcklwd            m0, m1, m2
+    punpckhwd            m1, m2
     jmp                tx2q
 .pass2:
     call m(iadst_4x4_internal).main
-    vpblendd             m1, m1, m2, 0x0c ; out2 out3
 .end:
     pxor                 m2, m2
     mova          [cq+16*0], m2
@@ -710,12 +709,55 @@
     paddsw               m1, m5     ; out3 out2
 %endmacro
 
-%macro IADST8_1D_PACKED 0
+%macro IADST8_1D_PACKED 1 ; pass
     vpbroadcastd         m6, [o(pd_2048)]
     punpckhwd            m0, m4, m3 ; 0 7
     punpckhwd            m1, m5, m2 ; 2 5
     punpcklwd            m2, m5     ; 4 3
     punpcklwd            m3, m4     ; 6 1
+%if %1 == 1
+    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076, 3 ; t1a t0a
+    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
+    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
+    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
+    psubsw               m4, m0, m2 ; t5 t4
+    paddsw               m0, m2     ; t1 t0
+    psubsw               m5, m1, m3 ; t6 t7
+    paddsw               m1, m3     ; t2 t3
+    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
+    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
+%if mmsize > 16
+    vbroadcasti128       m2, [o(deint_shuf)]
+%else
+    mova                 m2, [o(deint_shuf)]
+%endif
+    pshuflw              m1, m1, q2301
+    pshufhw              m1, m1, q2301
+    psubsw               m3, m0, m1        ; t3 t2
+    paddsw               m0, m1            ; -out7  out0
+    psubsw               m1, m4, m5        ; t7 t6
+    paddsw               m4, m5            ;  out6 -out1
+    pshufb               m0, m2
+    pshufb               m4, m2
+    vpbroadcastd         m5, [o(pw_m2896_2896)]
+    pmaddwd              m2, m5, m3
+    pmaddwd              m5, m1
+    paddd                m2, m6
+    paddd                m5, m6
+    psrad                m2, 12
+    psrad                m5, 12
+    packssdw             m2, m5            ; out4 -out5
+    vpbroadcastd         m5, [o(pw_2896_2896)]
+    pmaddwd              m3, m5
+    pmaddwd              m1, m5
+    paddd                m3, m6
+    paddd                m1, m6
+    psrad                m3, 12
+    psrad                m1, 12
+    packssdw             m1, m3            ; out2 -out3
+    punpcklqdq           m3, m4, m0        ; out6 -out7
+    punpckhqdq           m0, m4            ; out0 -out1
+%else
     ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076 ; t0a t1a
     ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612 ; t2a t3a
     ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598 ; t4a t5a
@@ -743,6 +785,7 @@
     pmulhrsw             m2, m5            ; out4 -out5
     pshufd               m1, m1, q1032
     pmulhrsw             m1, m5            ; out2 -out3
+%endif
 %endmacro
 
 INIT_YMM avx2
@@ -790,7 +833,7 @@
     pmulhrsw             m0, m2
     pmulhrsw             m1, m2
     call m(iadst_8x4_internal).main
-    punpckhwd            m3, m0, m2
+    punpckhwd            m3, m0, m1
     punpcklwd            m0, m1
     punpckhwd            m1, m0, m3
     punpcklwd            m0, m3
@@ -800,7 +843,7 @@
     vextracti128        xm3, m1, 1
     pshufd              xm4, xm0, q1032
     pshufd              xm5, xm1, q1032
-    call .main
+    call .main_pass2
     vpbroadcastd         m4, [o(pw_2048)]
     vinserti128          m0, m0, xm2, 1
     vinserti128          m1, m1, xm3, 1
@@ -822,9 +865,13 @@
     WRITE_4X8             0, 1
     RET
 ALIGN function_align
-.main:
-    WRAP_XMM IADST8_1D_PACKED
+.main_pass1:
+    WRAP_XMM IADST8_1D_PACKED 1
     ret
+ALIGN function_align
+.main_pass2:
+    WRAP_XMM IADST8_1D_PACKED 2
+    ret
 
 INV_TXFM_4X8_FN flipadst, dct,      0
 INV_TXFM_4X8_FN flipadst, adst
@@ -839,7 +886,7 @@
     pmulhrsw             m1, m2
     call m(iadst_8x4_internal).main
     punpcklwd            m3, m1, m0
-    punpckhwd            m1, m2, m0
+    punpckhwd            m1, m0
     punpcklwd            m0, m1, m3
     punpckhwd            m1, m3
     jmp                tx2q
@@ -848,7 +895,7 @@
     vextracti128        xm3, m1, 1
     pshufd              xm4, xm0, q1032
     pshufd              xm5, xm1, q1032
-    call m(iadst_4x8_internal).main
+    call m(iadst_4x8_internal).main_pass2
     vpbroadcastd         m5, [o(pw_2048)]
     vinserti128          m3, m3, xm1, 1
     vinserti128          m2, m2, xm0, 1
@@ -1099,8 +1146,13 @@
     jmp                tx2q
 .pass2:
     call .main
-    pshufd               m1, m1, q1032
+    vpbroadcastd         m5, [o(pw_2896x8)]
+    paddsw               m1, m2, m4
+    psubsw               m2, m4
+    pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
+    pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
     vpbroadcastd         m5, [o(pw_2048)]
+    pshufd               m1, m1, q1032
     vpblendd             m4, m1, m0, 0x33
     vpblendd             m0, m0, m2, 0x33
     vpblendd             m2, m2, m3, 0x33
@@ -1176,7 +1228,6 @@
     vinserti128          m0, m0, xm2, 1    ; t1   t0   t9a  t8a
     vperm2i128           m2, m4, m1, 0x31  ; t7a  t6a  t15  t14
     vinserti128          m4, m4, xm1, 1    ; t4a  t5a  t12  t13
-    vpbroadcastd         m5, [o(pw_2896x8)]
     pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
     psubsw               m1, m0, m3        ; t3a t2a t11 t10
     paddsw               m0, m3     ; -out15  out0   out14 -out1
@@ -1184,11 +1235,22 @@
     psubsw               m4, m2            ; t6 t7 t14a t15a
     shufps               m2, m1, m4, q1032 ; t2a t6  t10 t14a
     vpblendd             m4, m4, m1, 0x33  ; t3a t7  t11 t15a
-    paddsw               m1, m2, m4
-    psubsw               m2, m4
-    pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
-    pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
     ret
+ALIGN function_align
+.main_pass1_end:
+    vpbroadcastd         m5, [o(pw_m2896_2896)]
+    vpbroadcastd         m6, [o(pw_2896_2896)]
+    punpcklwd            m1, m4, m2
+    punpckhwd            m4, m2
+    pmaddwd              m2, m5, m4
+    pmaddwd              m4, m6
+    pmaddwd              m5, m1
+    pmaddwd              m1, m6
+    REPX      {paddd x, m8}, m5, m1, m2, m4
+    REPX      {psrad x, 12}, m5, m2, m1, m4
+    packssdw             m2, m5     ; -out11  out8   out10 -out9
+    packssdw             m1, m4     ; -out7   out4   out6  -out5
+    ret
 
 INV_TXFM_4X16_FN flipadst, dct,      0
 INV_TXFM_4X16_FN flipadst, adst
@@ -1214,8 +1276,13 @@
     jmp                tx2q
 .pass2:
     call m(iadst_4x16_internal).main
-    pshufd               m1, m1, q1032
+    vpbroadcastd         m5, [o(pw_2896x8)]
+    paddsw               m1, m2, m4
+    psubsw               m2, m4
+    pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
+    pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
     vpbroadcastd         m6, [o(pw_2048)]
+    pshufd               m1, m1, q1032
     vpblendd             m4, m0, m2, 0x33
     vpblendd             m0, m0, m1, 0xcc
     vpblendd             m1, m1, m3, 0xcc
@@ -1381,7 +1448,7 @@
     pmulhrsw            xm2, xm0, [cq+16*2]
     pmulhrsw            xm4, xm0
     pmulhrsw            xm5, xm0
-    call m(iadst_4x8_internal).main
+    call m(iadst_4x8_internal).main_pass1
     vinserti128        m0, m0, xm2, 1
     vinserti128        m1, m1, xm3, 1
     punpckhwd          m2, m0, m1
@@ -1393,7 +1460,6 @@
     jmp              tx2q
 .pass2:
     call .main
-    vpblendd             m1, m1, m2, 0xcc
 .end:
     vpermq               m0, m0, q3120
     vpermq               m1, m1, q3120
@@ -1427,7 +1493,7 @@
     pmulhrsw            xm2, xm0, [cq+16*2]
     pmulhrsw            xm4, xm0
     pmulhrsw            xm5, xm0
-    call m(iadst_4x8_internal).main
+    call m(iadst_4x8_internal).main_pass1
     vinserti128          m3, m3, xm1, 1
     vinserti128          m2, m2, xm0, 1
     punpckhwd            m1, m3, m2
@@ -1439,7 +1505,7 @@
     jmp                tx2q
 .pass2:
     call m(iadst_8x4_internal).main
-    vpblendd             m2, m2, m1, 0x33
+    mova                 m2, m1
     vpermq               m1, m0, q2031
     vpermq               m0, m2, q2031
     jmp m(iadst_8x4_internal).end2
@@ -1580,7 +1646,7 @@
     vpermq               m3, [cq+32*3], q3120 ; 6 7
     vpermq               m5, [cq+32*1], q1302 ; 3 2
     vpermq               m2, [cq+32*2], q3120 ; 4 5
-    call .main
+    call .main_pass1
     vpbroadcastd         m5, [o(pw_16384)]
     punpcklwd            m4, m0, m1
     punpckhwd            m0, m1
@@ -1604,7 +1670,7 @@
 .pass2:
     pshufd               m4, m0, q1032
     pshufd               m5, m1, q1032
-    call .main
+    call .main_pass2
     vpbroadcastd         m5, [o(pw_2048)]
     vpbroadcastd        xm4, [o(pw_4096)]
     psubw                m4, m5 ; lower half = 2048, upper half = -2048
@@ -1629,9 +1695,13 @@
     WRITE_8X4             2, 3, 4, 5
     RET
 ALIGN function_align
-.main:
-    IADST8_1D_PACKED
+.main_pass1:
+    IADST8_1D_PACKED 1
     ret
+ALIGN function_align
+.main_pass2:
+    IADST8_1D_PACKED 2
+    ret
 
 INV_TXFM_8X8_FN flipadst, dct
 INV_TXFM_8X8_FN flipadst, adst
@@ -1643,7 +1713,7 @@
     vpermq               m3, [cq+32*3], q3120 ; 6 7
     vpermq               m5, [cq+32*1], q1302 ; 3 2
     vpermq               m2, [cq+32*2], q3120 ; 4 5
-    call m(iadst_8x8_internal).main
+    call m(iadst_8x8_internal).main_pass1
     vpbroadcastd         m5, [o(pw_16384)]
     punpckhwd            m4, m3, m2
     punpcklwd            m3, m2
@@ -1667,7 +1737,7 @@
 .pass2:
     pshufd               m4, m0, q1032
     pshufd               m5, m1, q1032
-    call m(iadst_8x8_internal).main
+    call m(iadst_8x8_internal).main_pass2
     vpbroadcastd         m4, [o(pw_2048)]
     vpbroadcastd        xm5, [o(pw_4096)]
     psubw                m4, m5 ; lower half = -2048, upper half = 2048
@@ -1867,6 +1937,7 @@
 cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
     ITX_8X16_LOAD_COEFS
     call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
     vpbroadcastd        m10, [o(pw_16384)]
     pslld                m9, m10, 17
     psubw               m10, m9 ; 16384, -16384
@@ -1874,6 +1945,7 @@
 ALIGN function_align
 .pass2:
     call .main
+    call .main_pass2_end
     vpbroadcastd         m9, [o(pw_2048)]
     vpbroadcastd        xm8, [o(pw_4096)]
     psubw                m8, m9
@@ -1930,39 +2002,73 @@
     paddsw               m4, m6     ; t8a  t9a
     vpbroadcastd        m11, [o(pw_m3784_1567)]
     vpbroadcastd        m12, [o(pw_1567_3784)]
-    ITX_MUL2X_PACK        3, 6, _, 10, 11, 12, 4 ; t4a t5a
+    ITX_MUL2X_PACK        3, 6, _, 10, 12, 11, 6 ; t5a t4a
     psubw                m6, m9, m11 ; pw_3784_m1567
-    ITX_MUL2X_PACK        8, 12, _, 10, 12, 6, 4 ; t6a t7a
+    ITX_MUL2X_PACK        8, 6, _, 10, 6, 12, 6  ; t7a t6a
     vpbroadcastd        m11, [o(pw_m1567_3784)]
     vpbroadcastd        m12, [o(pw_3784_1567)]
-    ITX_MUL2X_PACK        2, 6, _, 10, 11, 12, 4 ; t15 t14
+    ITX_MUL2X_PACK        2, 6, _, 10, 11, 12, 6 ; t15 t14
     psubw                m6, m9, m11 ; pw_1567_m3784
-    ITX_MUL2X_PACK        5, 12, _, 10, 12, 6, 4 ; t13 t12
-    vbroadcasti128      m11, [o(deint_shuf)]
-    vpbroadcastd        m12, [o(pw_2896x8)]
-    psubsw               m6, m0, m1        ;  t3a    t2a
+    ITX_MUL2X_PACK        5, 12, _, 10, 12, 6, 6 ; t13 t12
+    vbroadcasti128      m12, [o(deint_shuf)]
+    paddsw               m6, m4, m7        ; -out1  out14
+    psubsw               m4, m7            ;  t10    t11
+    psubsw              m11, m3, m8        ;  t7     t6
+    paddsw               m8, m3            ;  out12 -out3
+    psubsw               m3, m0, m1        ;  t3a    t2a
     paddsw               m0, m1            ; -out15  out0
     paddsw               m1, m2, m5        ; -out13  out2
     psubsw               m5, m2            ;  t15a   t14a
-    paddsw               m2, m4, m7        ; -out1  out14
-    psubsw               m4, m7            ;  t10    t11
-    psubsw               m7, m3, m8        ;  t6     t7
-    paddsw               m8, m3            ; -out3   out12
-    REPX    {pshufb x, m11}, m6, m4, m0, m2
-    vpblendd             m3, m6, m4, 0xcc  ;  t3a    t11
-    shufps               m6, m6, m4, q1032 ;  t2a    t10
-    vpblendd             m4, m5, m7, 0xcc  ;  t15a   t7
-    shufps               m5, m5, m7, q1032 ;  t14a   t6
-    shufps               m7, m2, m0, q1032 ;  out14 -out15
-    vpblendd             m0, m0, m2, 0x33  ; -out1   out0
-    paddsw               m2, m5, m4        ; -out5   out4
-    psubsw               m5, m4            ;  out10 -out11
-    psubsw               m4, m6, m3        ;  out8  -out9
-    paddsw               m3, m6            ; -out7   out6
-    shufps               m6, m8, m1, q1032 ;  out12 -out13
-    vpblendd             m1, m1, m8, 0x33  ; -out3   out2
-    REPX  {pmulhrsw x, m12}, m2, m3, m4, m5
+    pshufb               m0, m12
+    pshufb               m6, m12
+    pshufb               m8, m12
+    pshufb               m1, m12
+    shufps               m7, m6, m0, q1032 ;  out14 -out15
+    vpblendd             m0, m6, 0x33      ; -out1   out0
+    punpcklqdq           m6, m8, m1        ;  out12 -out13
+    punpckhqdq           m1, m8, m1        ; -out3   out2
     ret
+ALIGN function_align
+.main_pass1_end: ; pass-1 tail: final 2896 rotations via pmaddwd (32-bit sums), round, >>12, repack to words
+    vpbroadcastd         m8, [o(pw_m2896_2896)]
+    vpbroadcastd        m12, [o(pw_2896_2896)]
+    pmaddwd              m9, m8, m11       ; -out11
+    pmaddwd              m2, m12, m5       ; -out5
+    pmaddwd              m5, m8            ;  out10
+    pmaddwd             m11, m12           ;  out4
+    REPX     {paddd x, m10}, m9, m5, m2, m11 ; m10 is an input: rounding term (presumably pd_2048 -- confirm in .main)
+    REPX     {psrad x, 12 }, m9, m5, m2, m11
+    packssdw             m5, m9            ;  out10 -out11
+    packssdw             m2, m11           ; -out5   out4
+    pmaddwd             m11, m8, m3        ;  out8
+    vpbroadcastd         m8, [o(pw_2896_m2896)]
+    pmaddwd              m3, m12           ; -out7
+    pmaddwd              m8, m4            ; -out9
+    pmaddwd              m4, m12           ;  out6
+    REPX     {paddd x, m10}, m11, m3, m8, m4
+    REPX     {psrad x, 12 }, m11, m3, m8, m4
+    packssdw             m3, m4            ; -out7   out6
+    packssdw             m4, m11, m8       ;  out8  -out9
+    vpbroadcastd        m10, [o(pw_16384)] ; m10 = pw_16384 and m9 = 0 are left for the
+    pxor                 m9, m9            ; callers, which use both right after the call
+    ret
+ALIGN function_align
+.main_pass2_end:
+    vpbroadcastd         m8, [o(pw_2896x8)]
+    pshufb               m2, m11, m12
+    pshufb               m5, m12
+    pshufb               m3, m12
+    pshufb               m4, m12
+    punpcklqdq          m11, m5, m2        ;  t15a   t7
+    punpckhqdq           m5, m2            ;  t14a   t6
+    shufps               m2, m3, m4, q1032 ;  t2a    t10
+    vpblendd             m3, m4, 0xcc      ;  t3a    t11
+    psubsw               m4, m2, m3        ;  out8  -out9
+    paddsw               m3, m2            ; -out7   out6
+    paddsw               m2, m5, m11       ; -out5   out4
+    psubsw               m5, m11           ;  out10 -out11
+    REPX   {pmulhrsw x, m8}, m2, m3, m4, m5
+    ret
 
 INV_TXFM_8X16_FN flipadst, dct
 INV_TXFM_8X16_FN flipadst, adst
@@ -1972,6 +2078,7 @@
 cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
     ITX_8X16_LOAD_COEFS
     call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
     vpbroadcastd         m9, [o(pw_16384)]
     pslld               m10, m9, 17
     psubw               m10, m9 ; -16384, 16384
@@ -1990,6 +2097,7 @@
     jmp m(idct_8x16_internal).pass1_end2
 .pass2:
     call m(iadst_8x16_internal).main
+    call m(iadst_8x16_internal).main_pass2_end
     vpbroadcastd         m8, [o(pw_2048)]
     vpbroadcastd        xm9, [o(pw_4096)]
     psubw                m8, m9
@@ -2232,7 +2340,7 @@
     vpermq               m1, [cq+32*1], q1230
     vpermq               m2, [cq+32*2], q2103
     call m(iadst_4x16_internal).main2
-    pshufd               m2, m2, q1032
+    call m(iadst_4x16_internal).main_pass1_end
     punpcklwd            m4, m3, m1
     punpcklwd            m5, m2, m0
     punpckhwd            m0, m1
@@ -2276,13 +2384,15 @@
     RET
 ALIGN function_align
 .main:
+    vpbroadcastd         m6, [o(pw_m3344_3344)]
     vpbroadcastd         m7, [o(pw_3803_1321)]
     vpbroadcastd         m8, [o(pw_m1321_2482)]
     vpbroadcastd         m9, [o(pw_2482_3344)]
     punpcklwd            m4, m2, m0 ; in2 in0 l
-    psubw                m6, m0, m2
     punpckhwd            m2, m0     ; in2 in0 h
-    paddw                m6, m3     ; t2
+    psrld                m5, m6, 16
+    pmaddwd             m10, m6, m4 ; t2:02 l
+    pmaddwd              m6, m2     ; t2:02 h
     pmaddwd              m0, m7, m4 ; t0:02 l
     pmaddwd              m7, m2     ; t0:02 h
     pmaddwd              m4, m8     ; t1:02 l
@@ -2289,7 +2399,11 @@
     pmaddwd              m8, m2     ; t1:02 h
     punpckhwd            m2, m3, m1 ; in3 in1 h
     punpcklwd            m3, m1     ; in3 in1 l
+    pmaddwd              m1, m5, m2 ; t2:3 h
+    pmaddwd              m5, m3     ; t2:3 l
+    paddd                m6, m1
     vpbroadcastd         m1, [o(pd_2048)]
+    paddd               m10, m5
     pmaddwd              m5, m9, m3
     pmaddwd              m9, m2
     paddd                m0, m1
@@ -2299,6 +2413,8 @@
     vpbroadcastd         m9, [o(pw_m3803_3344)]
     pmaddwd              m5, m9, m2
     pmaddwd              m9, m3
+    paddd               m10, m1     ; t2 + 2048 l
+    paddd                m6, m1     ; t2 + 2048 h
     paddd                m5, m1     ; t1:13 + 2048 h
     paddd                m1, m9     ; t1:13 + 2048 l
     vpbroadcastd         m9, [o(pw_m3803_m6688)]
@@ -2310,12 +2426,11 @@
     paddd                m4, m0
     paddd                m2, m8     ; t0 + t1 - t3 + 2048 h
     paddd                m3, m4     ; t0 + t1 - t3 + 2048 l
-    REPX      {psrad x, 12}, m0, m7, m5, m1, m2, m3
+    REPX      {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3
     packssdw             m0, m7
     packssdw             m1, m5
     packssdw             m3, m2
-    vpbroadcastd         m2, [o(pw_3344x8)]
-    pmulhrsw             m2, m6
+    packssdw             m2, m10, m6
     ret
 
 INV_TXFM_16X4_FN flipadst, dct
@@ -2329,7 +2444,7 @@
     vpermq               m1, [cq+32*1], q1230
     vpermq               m2, [cq+32*2], q2103
     call m(iadst_4x16_internal).main2
-    pshufd               m2, m2, q1032
+    call m(iadst_4x16_internal).main_pass1_end
     punpckhwd            m4, m3, m2
     punpckhwd            m5, m1, m0
     punpcklwd            m0, m2
@@ -2552,7 +2667,7 @@
 cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
     ITX_16X8_LOAD_COEFS 1302
     call m(iadst_8x16_internal).main2
-    vpbroadcastd        m10, [o(pw_16384)]
+    call m(iadst_8x16_internal).main_pass1_end
     psubw               m11, m9, m10
     punpcklwd            m8, m0, m2
     punpckhwd            m0, m2
@@ -2567,7 +2682,7 @@
 ALIGN function_align
 .pass2:
     call .main
-    vpbroadcastd         m9, [o(pw_2048)]
+    call .main_pass2_end
     pxor                 m8, m8
     psubw                m8, m9
     REPX   {pmulhrsw x, m9}, m0, m2, m4, m6
@@ -2591,7 +2706,6 @@
     ITX_MULSUB_2W         4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
     psubsw               m9, m6, m8 ;  t7
     paddsw               m6, m8     ;  out6
-    vpbroadcastd         m8, [o(pw_2896x8)]
     psubsw               m3, m7, m5 ;  t3
     paddsw               m7, m5     ; -out7
     psubsw               m5, m0, m2 ;  t2
@@ -2598,6 +2712,35 @@
     paddsw               m0, m2     ;  out0
     psubsw               m2, m1, m4 ;  t6
     paddsw               m1, m4     ; -out1
+    ret
+ALIGN function_align
+.main_pass1_end:
+    vpbroadcastd        m11, [o(pw_m2896_2896)]
+    vpbroadcastd        m12, [o(pw_2896_2896)]
+    punpckhwd            m4, m3, m5
+    punpcklwd            m3, m5
+    pmaddwd              m5, m11, m4
+    pmaddwd              m4, m12
+    pmaddwd              m8, m11, m3
+    pmaddwd              m3, m12
+    REPX     {paddd x, m10}, m5, m4, m8, m3
+    REPX     {psrad x, 12 }, m5, m8, m4, m3
+    packssdw             m3, m4     ; -out3
+    packssdw             m4, m8, m5 ;  out4
+    punpcklwd            m5, m9, m2
+    punpckhwd            m9, m2
+    pmaddwd              m2, m12, m5
+    pmaddwd              m5, m11
+    pmaddwd             m12, m9
+    pmaddwd             m11, m9
+    REPX     {paddd x, m10}, m2, m5, m12, m11
+    REPX     {psrad x, 12 }, m2, m12, m5, m11
+    packssdw             m2, m12    ;  out2
+    packssdw             m5, m11    ; -out5
+    ret
+ALIGN function_align
+.main_pass2_end:
+    vpbroadcastd         m8, [o(pw_2896x8)]
     psubsw               m4, m5, m3
     paddsw               m3, m5
     psubsw               m5, m2, m9
@@ -2606,6 +2749,7 @@
     pmulhrsw             m3, m8     ; -out3
     pmulhrsw             m4, m8     ;  out4
     pmulhrsw             m5, m8     ; -out5
+    vpbroadcastd         m9, [o(pw_2048)]
     ret
 
 INV_TXFM_16X8_FN flipadst, dct
@@ -2616,7 +2760,7 @@
 cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
     ITX_16X8_LOAD_COEFS 1302
     call m(iadst_8x16_internal).main2
-    vpbroadcastd        m10, [o(pw_16384)]
+    call m(iadst_8x16_internal).main_pass1_end
     psubw                m9, m10
     punpcklwd            m8, m6, m4
     punpckhwd            m6, m4
@@ -2655,7 +2799,7 @@
     jmp                tx2q
 .pass2:
     call m(iadst_16x8_internal).main
-    vpbroadcastd         m9, [o(pw_2048)]
+    call m(iadst_16x8_internal).main_pass2_end
     pxor                 m8, m8
     psubw                m8, m9
     pmulhrsw            m10, m7, m8
@@ -2986,8 +3130,12 @@
 cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
     ITX_16X16_LOAD_COEFS
     call .main
-    vpbroadcastd         m1, [o(pw_8192)]
-    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+    call .main_pass1_end
+    pmulhrsw             m0, m1, [cq+32*0]
+    pmulhrsw             m2, m1, [cq+32*1]
+    REPX   {pmulhrsw x, m1}, m4, m6, m8, m10
+    pmulhrsw            m12, m1, [cq+32*2]
+    pmulhrsw            m14, m1, [cq+32*3]
     vextracti128 [rsp+16*5], m8, 1
     mova         [rsp+16*1], xm8
     pxor                 m8, m8
@@ -2996,7 +3144,7 @@
 ALIGN function_align
 .pass2:
     call .main
-    vpbroadcastd         m1, [o(pw_2048)]
+    call .main_pass2_end
     REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
     mova         [rsp+32*0], m6
     pxor                 m6, m6
@@ -3081,6 +3229,62 @@
     paddsw               m0, m12      ;  out0
     paddsw              m12, m8, m5   ;  out12
     psubsw               m8, m5       ;  t7
+    ret
+ALIGN function_align
+.main_pass1_end: ; pass-1 tail: final 2896 rotations via pmaddwd (32-bit sums), round, >>12, repack to words
+    mova          [cq+32*0], m0  ; spill m0/m2/m12/m14 to cq: the registers are
+    mova          [cq+32*1], m2  ; reused below for constants and temporaries,
+    mova          [cq+32*2], m12 ; and the callers read these values back from
+    mova          [cq+32*3], m14 ; [cq+32*0..3] when applying the m1 scale
+    vpbroadcastd        m14, [o(pw_m2896_2896)]
+    vpbroadcastd        m12, [o(pw_2896_2896)]
+    vpbroadcastd         m2, [o(pd_2048)] ; rounding term added before each >> 12
+    punpcklwd            m5, m11, m10
+    punpckhwd           m11, m10
+    pmaddwd             m10, m14, m5
+    pmaddwd              m0, m14, m11
+    pmaddwd              m5, m12
+    pmaddwd             m11, m12
+    REPX      {paddd x, m2}, m10, m0, m5, m11
+    REPX      {psrad x, 12}, m10, m0, m5, m11
+    packssdw            m10, m0  ;  out10
+    packssdw             m5, m11 ; -out5
+    punpcklwd           m11, m8, m4
+    punpckhwd            m8, m4
+    pmaddwd              m4, m12, m11
+    pmaddwd              m0, m12, m8
+    pmaddwd             m11, m14
+    pmaddwd              m8, m14
+    REPX      {paddd x, m2}, m4, m0, m11, m8
+    REPX      {psrad x, 12}, m4, m0, m11, m8
+    packssdw             m4, m0  ;  out4
+    packssdw            m11, m8  ; -out11
+    punpcklwd            m8, m9, m7
+    punpckhwd            m9, m7
+    pmaddwd              m7, m12, m8
+    pmaddwd              m0, m12, m9
+    pmaddwd              m8, m14
+    pmaddwd              m9, m14
+    REPX      {paddd x, m2}, m7, m0, m8, m9
+    REPX      {psrad x, 12}, m7, m0, m8, m9
+    packssdw             m7, m0  ; -out7
+    packssdw             m8, m9  ;  out8
+    punpckhwd            m0, m6, m1
+    punpcklwd            m6, m1
+    pmaddwd              m1, m14, m0
+    pmaddwd              m9, m14, m6
+    pmaddwd              m0, m12
+    pmaddwd              m6, m12
+    REPX      {paddd x, m2}, m1, m9, m0, m6
+    REPX      {psrad x, 12}, m1, m9, m0, m6
+    packssdw             m9, m1  ; -out9
+    packssdw             m6, m0  ;  out6
+    vpbroadcastd         m1, [o(pw_8192)] ; pass-1 scale factor, returned in m1 for the callers' pmulhrsw
+    ret
+ALIGN function_align
+.main_pass2_end:
+    ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
+    ; 16-bit here will produce the same result as using 32-bit intermediates.
     paddsw               m5, m10, m11 ; -out5
     psubsw              m10, m11      ;  out10
     psubsw              m11, m4, m8   ; -out11
@@ -3091,6 +3295,7 @@
     paddsw               m6, m1       ;  out6
     vpbroadcastd         m1, [o(pw_2896x8)]
     REPX   {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
+    vpbroadcastd         m1, [o(pw_2048)]
     ret
 
 INV_TXFM_16X16_FN flipadst, dct
@@ -3100,16 +3305,16 @@
 cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
     ITX_16X16_LOAD_COEFS
     call m(iadst_16x16_internal).main
-    vpbroadcastd         m1, [o(pw_8192)]
+    call m(iadst_16x16_internal).main_pass1_end
     pmulhrsw             m6, m1
+    pmulhrsw             m2, m1, m8
     mova         [rsp+32*2], m6
     pmulhrsw             m6, m1, m4
     pmulhrsw             m4, m1, m10
-    pmulhrsw            m10, m1, m12
-    pmulhrsw            m12, m1, m2
-    pmulhrsw             m2, m1, m8
-    pmulhrsw             m8, m1, m14
-    pmulhrsw            m14, m1, m0
+    pmulhrsw             m8, m1, [cq+32*3]
+    pmulhrsw            m10, m1, [cq+32*2]
+    pmulhrsw            m12, m1, [cq+32*1]
+    pmulhrsw            m14, m1, [cq+32*0]
     pxor                 m0, m0
     psubw                m0, m1
     REPX   {pmulhrsw x, m0}, m3, m5, m7, m11, m15
@@ -3136,7 +3341,7 @@
     jmp m(idct_16x16_internal).pass1_end3
 .pass2:
     call m(iadst_16x16_internal).main
-    vpbroadcastd         m1, [o(pw_2048)]
+    call m(iadst_16x16_internal).main_pass2_end
     pmulhrsw             m0, m1
     pmulhrsw             m8, m1
     mova         [rsp+32*0], m0
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -43,8 +43,11 @@
 pw_2482_m1321:  times 4 dw  2482, -1321
 pw_3344_2482:   times 4 dw  3344,  2482
 pw_3344_m3803:  times 4 dw  3344, -3803
+pw_3344_m3344:  times 4 dw  3344, -3344
+pw_0_3344:      times 4 dw     0,  3344 ; pmaddwd with (in1, in3) word pairs gives 3344*in3
 pw_m6688_m3803: times 4 dw -6688, -3803
 
+COEF_PAIR 2896, 2896
 COEF_PAIR 1567, 3784
 COEF_PAIR  799, 4017
 COEF_PAIR 3406, 2276
@@ -126,7 +129,6 @@
 pw_4085x8:      times 8 dw  4085*8
 pw_m301x8:      times 8 dw  -301*8
 
-
 iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424
 iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
 iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
@@ -200,7 +202,6 @@
     ret
 %endmacro
 
-
 ; flags: 1 = swap, 2: coef_regs
 %macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
 %if %6 & 2
@@ -239,35 +240,6 @@
     paddsw               m0, m2                ;high: out1 ;low: out0
 %endmacro
 
-
-%macro IADST4_1D_PACKED 0
-    punpcklwd            m2, m0, m1                ;unpacked in0 in2
-    punpckhwd            m3, m0, m1                ;unpacked in1 in3
-    psubw                m0, m1
-    punpckhqdq           m1, m1                    ;
-    paddw                m1, m0                    ;low: in0 - in2 + in3
-
-    pmaddwd              m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
-    pmaddwd              m2, [o(pw_2482_m1321)]    ;2482 * in0 - 1321 * in2
-    pmaddwd              m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
-    pmaddwd              m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
-    paddd                m4, m0                    ;t0 + t3
-    pmaddwd              m3, [o(pw_m6688_m3803)]   ;-2 * 3344 * in1 - 3803 * in3
-    pmulhrsw             m1, [o(pw_3344x8)]        ;low: out2
-    mova                 m0, [o(pd_2048)]
-    paddd                m2, m0
-    paddd                m0, m4                    ;t0 + t3 + 2048
-    paddd                m5, m2                    ;t1 + t3 + 2048
-    paddd                m2, m4
-    paddd                m2, m3                    ;t0 + t1 - t3 + 2048
-
-    psrad                m0, 12                    ;out0
-    psrad                m5, 12                    ;out1
-    psrad                m2, 12                    ;out3
-    packssdw             m0, m5                    ;high: out1 ;low: out0
-    packssdw             m2, m2                    ;high: out3 ;low: out3
-%endmacro
-
 %macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack
 cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
     %undef cmp
@@ -392,15 +364,14 @@
     mova                 m0, [coeffq+16*0]
     mova                 m1, [coeffq+16*1]
     call .main
-    punpckhwd            m3, m0, m2
+    punpckhwd            m2, m0, m1
     punpcklwd            m0, m1
-    punpckhwd            m1, m0, m3       ;high: in3 ;low :in2
-    punpcklwd            m0, m3           ;high: in1 ;low: in0
+    punpckhwd            m1, m0, m2       ;high: in3 ;low :in2
+    punpcklwd            m0, m2           ;high: in1 ;low: in0
     jmp                tx2q
 
 .pass2:
     call .main
-    punpcklqdq            m1, m2          ;out2 out3
 
 .end:
     pxor                 m2, m2
@@ -412,7 +383,28 @@
 
 ALIGN function_align
 .main:
-    IADST4_1D_PACKED
+    punpcklwd            m2, m0, m1                ;unpacked in0 in2
+    punpckhwd            m0, m1                    ;unpacked in1 in3
+    mova                 m3, m0                    ;keep the in1 in3 pairs, m0 is reused below
+    pmaddwd              m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2
+    pmaddwd              m0, [o(pw_0_3344)]        ;3344 * in3
+    paddd                m1, m0                    ;t2
+    pmaddwd              m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+    pmaddwd              m2, [o(pw_2482_m1321)]    ;2482 * in0 - 1321 * in2
+    pmaddwd              m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+    pmaddwd              m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
+    paddd                m4, m0                    ;t0 + t3
+    pmaddwd              m3, [o(pw_m6688_m3803)]   ;-2 * 3344 * in1 - 3803 * in3
+    mova                 m0, [o(pd_2048)]          ;rounding term for the >> 12 below
+    paddd                m1, m0                    ;t2 + 2048
+    paddd                m2, m0
+    paddd                m0, m4                    ;t0 + t3 + 2048
+    paddd                m5, m2                    ;t1 + t3 + 2048
+    paddd                m2, m4
+    paddd                m2, m3                    ;t0 + t1 - t3 + 2048
+    REPX      {psrad x, 12}, m1, m0, m5, m2        ;scale the 32-bit sums back down
+    packssdw             m0, m5                    ;high: out1 ;low: out0
+    packssdw             m1, m2                    ;high: out3 ;low: out2
     ret
 
 INV_TXFM_4X4_FN flipadst, dct,      0
@@ -424,16 +416,14 @@
     mova                 m0, [coeffq+16*0]
     mova                 m1, [coeffq+16*1]
     call m(iadst_4x4_internal).main
-    punpcklwd            m1, m0
-    punpckhwd            m2, m0
-    punpcklwd            m0, m2, m1            ;high: in3 ;low :in2
-    punpckhwd            m2, m1                ;high: in1 ;low: in0
-    mova                 m1, m2
+    punpcklwd            m2, m1, m0
+    punpckhwd            m1, m0
+    punpcklwd            m0, m1, m2            ;high: in3 ;low :in2
+    punpckhwd            m1, m2                ;high: in1 ;low: in0
     jmp                tx2q
 
 .pass2:
     call m(iadst_4x4_internal).main
-    punpcklqdq            m1, m2               ;out2 out3
 
 .end:
     pxor                 m2, m2
@@ -584,99 +574,6 @@
     mova                m%4, m%5
 %endmacro
 
-%macro IADST4_1D 0
-    mova                 m4, m2
-    psubw                m2, m0, m4
-    paddw                m2, m3                        ;low: in0 - in2 + in3
-
-    punpckhwd            m6, m0, m4                    ;unpacked in0 in2
-    punpckhwd            m7, m1, m3                    ;unpacked in1 in3
-    punpcklwd            m0, m4                        ;unpacked in0 in2
-    punpcklwd            m1, m3                        ;unpacked in1 in3
-
-    pmaddwd              m4, m0, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
-    pmaddwd              m0, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
-    pmaddwd              m3, m1, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
-    pmaddwd              m5, m1, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
-    paddd                m3, m4                        ;t0 + t3
-
-    pmaddwd              m1, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
-    pmulhrsw             m2, [o(pw_3344x8)]            ;out2
-    mova                 m4, [o(pd_2048)]
-    paddd                m0, m4
-    paddd                m4, m3                        ;t0 + t3 + 2048
-    paddd                m5, m0                        ;t1 + t3 + 2048
-    paddd                m3, m0
-    paddd                m3, m1                        ;t0 + t1 - t3 + 2048
-
-    psrad                m4, 12                        ;out0
-    psrad                m5, 12                        ;out1
-    psrad                m3, 12                        ;out3
-    packssdw             m0, m4, m5                    ;low: out0  high: out1
-
-    pmaddwd              m4, m6, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
-    pmaddwd              m6, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
-    pmaddwd              m1, m7, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
-    pmaddwd              m5, m7, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
-    paddd                m1, m4                        ;t0 + t3
-    pmaddwd              m7, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
-
-    mova                 m4, [o(pd_2048)]
-    paddd                m6, m4
-    paddd                m4, m1                        ;t0 + t3 + 2048
-    paddd                m5, m6                        ;t1 + t3 + 2048
-    paddd                m1, m6
-    paddd                m1, m7                        ;t0 + t1 - t3 + 2048
-
-    psrad                m4, 12                        ;out0
-    psrad                m5, 12                        ;out1
-    psrad                m1, 12                        ;out3
-    packssdw             m3, m1                        ;out3
-    packssdw             m4, m5                        ;low: out0  high: out1
-
-    punpckhqdq           m1, m0, m4                    ;out1
-    punpcklqdq           m0, m4                        ;out0
-%endmacro
-
-%macro IADST8_1D_PACKED 0
-    mova                 m6, [o(pd_2048)]
-    punpckhwd            m4, m3, m0                ;unpacked in7 in0
-    punpckhwd            m5, m2, m1                ;unpacked in5 in2
-    punpcklwd            m1, m2                    ;unpacked in3 in4
-    punpcklwd            m0, m3                    ;unpacked in1 in6
-    ITX_MUL2X_PACK        4, 2, 6,  401, 4076      ;low:  t0a   high:  t1a
-    ITX_MUL2X_PACK        5, 2, 6, 1931, 3612      ;low:  t2a   high:  t3a
-    ITX_MUL2X_PACK        1, 2, 6, 3166, 2598      ;low:  t4a   high:  t5a
-    ITX_MUL2X_PACK        0, 2, 6, 3920, 1189      ;low:  t6a   high:  t7a
-
-    psubsw               m3, m4, m1                ;low:  t4    high:  t5
-    paddsw               m4, m1                    ;low:  t0    high:  t1
-    psubsw               m2, m5, m0                ;low:  t6    high:  t7
-    paddsw               m5, m0                    ;low:  t2    high:  t3
-
-    shufps               m1, m3, m2, q1032
-    punpckhwd            m2, m1
-    punpcklwd            m3, m1
-    ITX_MUL2X_PACK        3, 0, 6, 1567, 3784, 1   ;low:  t5a   high:  t4a
-    ITX_MUL2X_PACK        2, 0, 6, 3784, 1567      ;low:  t7a   high:  t6a
-
-    psubsw               m1, m4, m5                ;low:  t2    high:  t3
-    paddsw               m4, m5                    ;low:  out0  high: -out7
-    psubsw               m5, m3, m2                ;low:  t7    high:  t6
-    paddsw               m3, m2                    ;low:  out6  high: -out1
-    shufps               m0, m4, m3, q3210         ;low:  out0  high: -out1
-    shufps               m3, m4, q3210             ;low:  out6  high: -out7
-
-    shufps               m4, m1, m5, q1032         ;low:  t3    high:  t7
-    shufps               m1, m5, q3210             ;low:  t2    high:  t6
-    mova                 m5, [o(pw_2896x8)]
-    psubsw               m2, m1, m4                ;low:  t2-t3 high:  t6-t7
-    paddsw               m1, m4                    ;low:  t2+t3 high:  t6+t7
-    pmulhrsw             m2, m5                    ;low:  out4  high: -out5
-    shufps               m1, m1, q1032
-    pmulhrsw             m1, m5                    ;low:  out2  high: -out3
-%endmacro
-
 %macro WRITE_4X8 4 ;row[1-4]
     WRITE_4X4             0, 1, 4, 5, 6, %1, %2, %3, %4
     lea                dstq, [dstq+strideq*4]
@@ -838,7 +735,48 @@
 
 ALIGN function_align
 .main:
-    IADST8_1D_PACKED
+    mova                 m6, [o(pd_2048)]          ;rounding term, also reused by the final stage
+    punpckhwd            m4, m3, m0                ;unpacked in7 in0
+    punpckhwd            m5, m2, m1                ;unpacked in5 in2
+    punpcklwd            m1, m2                    ;unpacked in3 in4
+    punpcklwd            m0, m3                    ;unpacked in1 in6
+    ITX_MUL2X_PACK        4, 2, 6,  401, 4076      ;low:  t0a   high:  t1a
+    ITX_MUL2X_PACK        5, 2, 6, 1931, 3612      ;low:  t2a   high:  t3a
+    ITX_MUL2X_PACK        1, 2, 6, 3166, 2598      ;low:  t4a   high:  t5a
+    ITX_MUL2X_PACK        0, 2, 6, 3920, 1189      ;low:  t6a   high:  t7a
+
+    psubsw               m3, m4, m1                ;low:  t4    high:  t5
+    paddsw               m4, m1                    ;low:  t0    high:  t1
+    psubsw               m2, m5, m0                ;low:  t6    high:  t7
+    paddsw               m5, m0                    ;low:  t2    high:  t3
+
+    shufps               m1, m3, m2, q1032
+    punpckhwd            m2, m1
+    punpcklwd            m3, m1
+    ITX_MUL2X_PACK        3, 0, 6, 1567, 3784, 1   ;low:  t5a   high:  t4a
+    ITX_MUL2X_PACK        2, 0, 6, 3784, 1567      ;low:  t7a   high:  t6a
+
+    psubsw               m1, m4, m5                ;low:  t2    high:  t3
+    paddsw               m4, m5                    ;low:  out0  high: -out7
+    psubsw               m5, m3, m2                ;low:  t7    high:  t6
+    paddsw               m3, m2                    ;low:  out6  high: -out1
+    shufps               m0, m4, m3, q3210         ;low:  out0  high: -out1
+    shufps               m3, m4, q3210             ;low:  out6  high: -out7
+
+    mova                 m2, [o(pw_2896_m2896)]    ;(2896, -2896) pairs: a - b terms
+    mova                 m7, [o(pw_2896_2896)]     ;used for the a + b terms
+    shufps               m4, m1, m5, q1032         ;low:  t3    high:  t7
+    shufps               m1, m5, q3210             ;low:  t2    high:  t6
+    punpcklwd            m5, m1, m4                ;unpacked t2 t3
+    punpckhwd            m1, m4                    ;unpacked t6 t7
+    pmaddwd              m4, m2, m1                ;-out5
+    pmaddwd              m2, m5                    ; out4
+    pmaddwd              m1, m7                    ; out2
+    pmaddwd              m5, m7                    ;-out3
+    REPX      {paddd x, m6}, m4, m2, m1, m5        ;+ 2048
+    REPX      {psrad x, 12}, m4, m2, m1, m5        ;>> 12
+    packssdw             m1, m5                    ;low:  out2  high: -out3
+    packssdw             m2, m4                    ;low:  out4  high: -out5
     ret
 
 INV_TXFM_4X8_FN flipadst, dct,      0
@@ -1109,7 +1047,67 @@
 
 ALIGN function_align
 .main:
-    IADST4_1D
+    punpckhwd            m6, m0, m2                    ;unpacked in0 in2 (high half)
+    punpcklwd            m0, m2                        ;unpacked in0 in2 (low half)
+    punpckhwd            m7, m1, m3                    ;unpacked in1 in3 (high half)
+    punpcklwd            m1, m3                        ;unpacked in1 in3 (low half)
+
+    mova                 m2, [o(pw_3344_m3344)]
+    mova                 m4, [o(pw_0_3344)]
+    pmaddwd              m3, m2, m6                    ;3344 * in0 - 3344 * in2
+    pmaddwd              m5, m4, m7                    ;3344 * in3
+    pmaddwd              m2, m0                        ;same product for the low half
+    pmaddwd              m4, m1                        ;same product for the low half
+    paddd                m3, m5                        ;t2 (high half)
+    paddd                m2, m4                        ;t2 (low half)
+    mova                 m4, [o(pd_2048)]
+    paddd                m3, m4                        ;t2 + 2048
+    paddd                m2, m4                        ;t2 + 2048
+    psrad                m3, 12
+    psrad                m2, 12
+    packssdw             m2, m3                        ;out2
+
+    pmaddwd              m4, m0, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
+    pmaddwd              m0, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
+    pmaddwd              m3, m1, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
+    pmaddwd              m5, m1, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
+    paddd                m3, m4                        ;t0 + t3
+
+    pmaddwd              m1, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
+    mova                 m4, [o(pd_2048)]
+    paddd                m0, m4
+    paddd                m4, m3                        ;t0 + t3 + 2048
+    paddd                m5, m0                        ;t1 + t3 + 2048
+    paddd                m3, m0
+    paddd                m3, m1                        ;t0 + t1 - t3 + 2048
+
+    psrad                m4, 12                        ;out0
+    psrad                m5, 12                        ;out1
+    psrad                m3, 12                        ;out3
+    packssdw             m0, m4, m5                    ;low: out0  high: out1
+
+    pmaddwd              m4, m6, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
+    pmaddwd              m6, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
+    pmaddwd              m1, m7, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
+    pmaddwd              m5, m7, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
+    paddd                m1, m4                        ;t0 + t3
+    pmaddwd              m7, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
+
+    mova                 m4, [o(pd_2048)]
+    paddd                m6, m4
+    paddd                m4, m1                        ;t0 + t3 + 2048
+    paddd                m5, m6                        ;t1 + t3 + 2048
+    paddd                m1, m6
+    paddd                m1, m7                        ;t0 + t1 - t3 + 2048
+
+    psrad                m4, 12                        ;out0
+    psrad                m5, 12                        ;out1
+    psrad                m1, 12                        ;out3
+    packssdw             m3, m1                        ;out3
+    packssdw             m4, m5                        ;low: out0  high: out1
+
+    punpckhqdq           m1, m0, m4                    ;out1
+    punpcklqdq           m0, m4                        ;out0
     ret
 
 INV_TXFM_8X4_FN flipadst, dct
@@ -1423,6 +1421,7 @@
 
 .pass1:
     call .main
+    call .main_pass1_end
 
 .pass1_end:
     mova                    m7, [o(pw_16384)]
@@ -1441,6 +1440,7 @@
 
 .pass2_main:
     call .main
+    call .main_pass2_end
 
 .end:
     mova                    m7, [o(pw_2048)]
@@ -1491,6 +1491,53 @@
     psubsw                  m5, m6                        ;t6
     paddsw                  m6, m2, m7                    ;out6
     psubsw                  m2, m7                        ;t7
+    ret
+ALIGN function_align
+.main_pass1_end:
+    mova  [rsp+gprsize*2+16*1], m1                        ;save m1, reloaded before ret
+    mova  [rsp+gprsize*2+16*2], m6                        ;save m6, reloaded before ret
+    punpckhwd               m1, m4, m3                    ;unpacked t2 t3 (high half)
+    punpcklwd               m4, m3                        ;unpacked t2 t3 (low half)
+    punpckhwd               m7, m5, m2                    ;unpacked t6 t7 (high half)
+    punpcklwd               m5, m2                        ;unpacked t6 t7 (low half)
+    mova                    m2, [o(pw_2896_2896)]
+    mova                    m6, [o(pd_2048)]              ;rounding term for the >> 12 below
+    pmaddwd                 m3, m2, m7
+    pmaddwd                 m2, m5
+    paddd                   m3, m6
+    paddd                   m2, m6
+    psrad                   m3, 12
+    psrad                   m2, 12
+    packssdw                m2, m3                        ;out2
+    mova                    m3, [o(pw_2896_m2896)]
+    pmaddwd                 m7, m3
+    pmaddwd                 m5, m3
+    paddd                   m7, m6
+    paddd                   m5, m6
+    psrad                   m7, 12
+    psrad                   m5, 12
+    packssdw                m5, m7                        ;-out5
+    mova                    m3, [o(pw_2896_2896)]
+    pmaddwd                 m7, m3, m1
+    pmaddwd                 m3, m4
+    paddd                   m7, m6
+    paddd                   m3, m6
+    psrad                   m7, 12
+    psrad                   m3, 12
+    packssdw                m3, m7                        ;-out3
+    mova                    m7, [o(pw_2896_m2896)]
+    pmaddwd                 m1, m7
+    pmaddwd                 m4, m7
+    paddd                   m1, m6
+    paddd                   m4, m6
+    psrad                   m1, 12
+    psrad                   m4, 12
+    packssdw                m4, m1                        ;out4 (2896 * t2 - 2896 * t3)
+    mova                    m1, [rsp+gprsize*2+16*1]
+    mova                    m6, [rsp+gprsize*2+16*2]
+    ret
+ALIGN function_align
+.main_pass2_end:
     paddsw                  m7, m4, m3                    ;t2 + t3
     psubsw                  m4, m3                        ;t2 - t3
     paddsw                  m3, m5, m2                    ;t6 + t7
@@ -1513,6 +1560,7 @@
 
 .pass1:
     call m(iadst_8x8_internal).main
+    call m(iadst_8x8_internal).main_pass1_end
 
 .pass1_end:
     mova                    m7, [o(pw_m16384)]
@@ -1542,6 +1590,7 @@
 
 .pass2_main:
     call m(iadst_8x8_internal).main
+    call m(iadst_8x8_internal).main_pass2_end
 
 .end:
     mova                    m7, [o(pw_2048)]
@@ -1753,6 +1802,7 @@
 
 .pass2:
     call m(iadst_16x4_internal).main
+    call m(iadst_16x4_internal).main_pass2_end
 
     punpcklqdq            m6, m5, m4                ;low: -out5  high: -out7
     punpckhqdq            m4, m5                    ;low:  out8  high:  out10
@@ -1820,6 +1870,7 @@
 
 .pass2:
     call m(iadst_16x4_internal).main
+    call m(iadst_16x4_internal).main_pass2_end
 
     punpckhqdq            m6, m5, m4                ;low:  out5  high:  out7
     punpcklqdq            m4, m5                    ;low: -out8  high: -out10
@@ -2160,6 +2211,7 @@
 cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_7ROWS        coeffq, 16
     call .main
+    call .main_pass1_end
 
     punpckhwd             m6, m7, m0                 ;packed -out11, -out15
     punpcklwd             m0, m7                     ;packed   out0,   out4
@@ -2193,69 +2245,65 @@
 ALIGN function_align
 .main:
     mova       [coeffq+16*6], m0
-    pshufd                m1, m1, q1032
+    pshufd                m0, m1, q1032
     pshufd                m2, m2, q1032
-    punpckhwd             m0, m6, m1                 ;packed in13,  in2
-    punpcklwd             m1, m6                     ;packed  in3, in12
-    punpckhwd             m6, m5, m2                 ;packed in11,  in4
+    punpckhwd             m1, m6, m0                 ;packed in13,  in2
+    punpcklwd             m0, m6                     ;packed  in3, in12
+    punpckhwd             m7, m5, m2                 ;packed in11,  in4
     punpcklwd             m2, m5                     ;packed  in5, in10
-    mova                  m7, [o(pd_2048)]
-    ITX_MUL2X_PACK         0, 5, 7,  995, 3973       ;low:t2   high:t3
-    ITX_MUL2X_PACK         6, 5, 7, 1751, 3703       ;low:t4   high:t5
-    ITX_MUL2X_PACK         2, 5, 7, 3513, 2106       ;low:t10  high:t11
-    ITX_MUL2X_PACK         1, 5, 7, 3857, 1380       ;low:t12  high:t13
-    psubsw                m5, m0, m2                 ;low:t10a high:t11a
-    paddsw                m0, m2                     ;low:t2a  high:t3a
-    psubsw                m2, m6, m1                 ;low:t12a high:t13a
-    paddsw                m6, m1                     ;low:t4a  high:t5a
-    punpcklqdq            m1, m5
-    punpckhwd             m1, m5                     ;packed t10a, t11a
+    mova                  m6, [o(pd_2048)]
+    ITX_MUL2X_PACK         1, 5, 6,  995, 3973       ;low:t2   high:t3
+    ITX_MUL2X_PACK         7, 5, 6, 1751, 3703       ;low:t4   high:t5
+    ITX_MUL2X_PACK         2, 5, 6, 3513, 2106       ;low:t10  high:t11
+    ITX_MUL2X_PACK         0, 5, 6, 3857, 1380       ;low:t12  high:t13
+    psubsw                m5, m1, m2                 ;low:t10a high:t11a
+    paddsw                m1, m2                     ;low:t2a  high:t3a
+    psubsw                m2, m7, m0                 ;low:t12a high:t13a
+    paddsw                m7, m0                     ;low:t4a  high:t5a
+    punpcklqdq            m0, m5
+    punpckhwd             m0, m5                     ;packed t10a, t11a
     punpcklqdq            m5, m2
     punpckhwd             m2, m5                     ;packed t13a, t12a
-    ITX_MUL2X_PACK         1, 5, 7, 3406, 2276       ;low:t10  high:t11
-    ITX_MUL2X_PACK         2, 5, 7, 4017,  799, 1    ;low:t12  high:t13
-    mova       [coeffq+16*4], m0
-    mova       [coeffq+16*5], m6
-    mova                  m0, [coeffq+16*6]
-    mova                  m6, [coeffq+16*7]
-    pshufd                m0, m0, q1032
+    ITX_MUL2X_PACK         0, 5, 6, 3406, 2276       ;low:t10  high:t11
+    ITX_MUL2X_PACK         2, 5, 6, 4017,  799, 1    ;low:t12  high:t13
+    mova       [coeffq+16*4], m1
+    mova       [coeffq+16*5], m7
+    mova                  m1, [coeffq+16*6]
+    mova                  m7, [coeffq+16*7]
+    pshufd                m1, m1, q1032
     pshufd                m3, m3, q1032
-    punpckhwd             m5, m6, m0                 ;packed in15,  in0
-    punpcklwd             m0, m6                     ;packed  in1, in14
-    punpckhwd             m6, m4, m3                 ;packed  in9,  in6
+    punpckhwd             m5, m7, m1                 ;packed in15,  in0
+    punpcklwd             m1, m7                     ;packed  in1, in14
+    punpckhwd             m7, m4, m3                 ;packed  in9,  in6
     punpcklwd             m3, m4                     ;packed  in7,  in8
-    ITX_MUL2X_PACK         5, 4, 7,  201, 4091       ;low:t0    high:t1
-    ITX_MUL2X_PACK         6, 4, 7, 2440, 3290       ;low:t6    high:t7
-    ITX_MUL2X_PACK         3, 4, 7, 3035, 2751       ;low:t8    high:t9
-    ITX_MUL2X_PACK         0, 4, 7, 4052,  601       ;low:t14   high:t15
+    ITX_MUL2X_PACK         5, 4, 6,  201, 4091       ;low:t0    high:t1
+    ITX_MUL2X_PACK         7, 4, 6, 2440, 3290       ;low:t6    high:t7
+    ITX_MUL2X_PACK         3, 4, 6, 3035, 2751       ;low:t8    high:t9
+    ITX_MUL2X_PACK         1, 4, 6, 4052,  601       ;low:t14   high:t15
     psubsw                m4, m5, m3                 ;low:t8a   high:t9a
     paddsw                m5, m3                     ;low:t0a   high:t1a
-    psubsw                m3, m6, m0                 ;low:t14a  high:t15a
-    paddsw                m6, m0                     ;low:t6a   high:t7a
-    punpcklqdq            m0, m4
-    punpckhwd             m0, m4                     ;packed  t8a,  t9a
+    psubsw                m3, m7, m1                 ;low:t14a  high:t15a
+    paddsw                m7, m1                     ;low:t6a   high:t7a
+    punpcklqdq            m1, m4
+    punpckhwd             m1, m4                     ;packed  t8a,  t9a
     punpcklqdq            m4, m3
     punpckhwd             m3, m4                     ;packed t15a, t14a
-    ITX_MUL2X_PACK         0, 4, 7,  799, 4017       ;low:t8    high:t9
-    ITX_MUL2X_PACK         3, 4, 7, 2276, 3406, 1    ;low:t14   high:t15
-    psubsw                m4, m0, m2                 ;low:t12a  high:t13a
-    paddsw                m0, m2                     ;low:t8a   high:t9a
-    psubsw                m2, m1, m3                 ;low:t14a  high:t15a
-    paddsw                m1, m3                     ;low:t10a  high:t11a
-    punpcklqdq            m3, m4
-    punpckhwd             m3, m4                     ;packed t12a, t13a
-    punpcklqdq            m4, m2
-    punpckhwd             m2, m4                     ;packed t15a, t14a
-    ITX_MUL2X_PACK         3, 4, 7, 1567, 3784       ;low:t12   high:t13
-    ITX_MUL2X_PACK         2, 4, 7, 3784, 1567, 1    ;low:t14   high:t15
-    psubsw                m4, m0, m1                 ;low:t10   high:t11
-    paddsw                m0, m1                     ;low:-out1 high:out14
+    ITX_MUL2X_PACK         1, 4, 6,  799, 4017       ;low:t8    high:t9
+    ITX_MUL2X_PACK         3, 4, 6, 2276, 3406, 1    ;low:t14   high:t15
+    paddsw                m4, m1, m2                 ;low:t8a   high:t9a
+    psubsw                m1, m2                     ;low:t12a  high:t13a
+    psubsw                m2, m0, m3                 ;low:t14a  high:t15a
+    paddsw                m0, m3                     ;low:t10a  high:t11a
+    punpcklqdq            m3, m1
+    punpckhwd             m3, m1                     ;packed t12a, t13a
+    punpcklqdq            m1, m2
+    punpckhwd             m2, m1                     ;packed t15a, t14a
+    ITX_MUL2X_PACK         3, 1, 6, 1567, 3784       ;low:t12   high:t13
+    ITX_MUL2X_PACK         2, 1, 6, 3784, 1567, 1    ;low:t14   high:t15
     psubsw                m1, m3, m2                 ;low:t14a  high:t15a
     paddsw                m3, m2                     ;low:out2  high:-out13
-    punpckhqdq            m2, m4, m1                 ;low:t11   high:t15a
-    punpcklqdq            m4, m1                     ;low:t10   high:t14a
-    psubsw                m1, m4, m2
-    paddsw                m2, m4
+    psubsw                m2, m4, m0                 ;low:t10   high:t11
+    paddsw                m0, m4                     ;low:-out1 high:out14
     mova       [coeffq+16*6], m0
     mova       [coeffq+16*7], m3
     mova                  m0, [coeffq+16*4]
@@ -2262,19 +2310,68 @@
     mova                  m3, [coeffq+16*5]
     psubsw                m4, m5, m3                 ;low:t4    high:t5
     paddsw                m5, m3                     ;low:t0    high:t1
-    psubsw                m3, m0 ,m6                 ;low:t6    high:t7
-    paddsw                m0, m6                     ;low:t2    high:t3
-    punpcklqdq            m6, m4
-    punpckhwd             m6, m4                     ;packed t4, t5
+    psubsw                m3, m0, m7                 ;low:t6    high:t7
+    paddsw                m0, m7                     ;low:t2    high:t3
+    punpcklqdq            m7, m4
+    punpckhwd             m7, m4                     ;packed t4, t5
     punpcklqdq            m4, m3
     punpckhwd             m3, m4                     ;packed t7, t6
-    ITX_MUL2X_PACK         6, 4, 7, 1567, 3784       ;low:t4a   high:t5a
-    ITX_MUL2X_PACK         3, 4, 7, 3784, 1567, 1    ;low:t6a   high:t7a
+    ITX_MUL2X_PACK         7, 4, 6, 1567, 3784       ;low:t4a   high:t5a
+    ITX_MUL2X_PACK         3, 4, 6, 3784, 1567, 1    ;low:t6a   high:t7a
     psubsw                m4, m5, m0                 ;low:t2a   high:t3a
     paddsw                m0, m5                     ;low:out0  high:-out15
-    psubsw                m5, m6, m3                 ;low:t6    high:t7
-    paddsw                m3, m6                     ;low:-out3 high:out12
+    psubsw                m5, m7, m3                 ;low:t6    high:t7
+    paddsw                m3, m7                     ;low:-out3 high:out12
+    ret
+ALIGN function_align
+.main_pass1_end:
+    mova                  m7, [o(deint_shuf1)]       ;shuffle mask for the pshufb below
+    mova       [coeffq+16*4], m0                     ;save m0, reloaded before ret
+    mova       [coeffq+16*5], m3                     ;save m3, reloaded before ret
+    mova                  m0, [o(pw_2896_m2896)]     ;(2896, -2896) pairs: a - b terms
+    mova                  m3, [o(pw_2896_2896)]      ;used for the a + b terms
+    pshufb                m1, m7                     ;t14a t15a
+    pshufb                m2, m7                     ;t10  t11
+    pshufb                m4, m7                     ;t2a  t3a
+    pshufb                m5, m7                     ;t6   t7
+    pmaddwd               m7, m0, m2
+    pmaddwd               m2, m3
+    paddd                 m7, m6                     ;NOTE(review): relies on m6 still holding pd_2048 from .main - confirm
+    paddd                 m2, m6
+    psrad                 m7, 12
+    psrad                 m2, 12
+    packssdw              m2, m7                     ;low:out6  high:-out9
+    pmaddwd               m7, m0, m4
+    pmaddwd               m4, m3
+    paddd                 m7, m6
+    paddd                 m4, m6
+    psrad                 m7, 12
+    psrad                 m4, 12
+    packssdw              m4, m7                     ;low:-out7 high:out8
+    pmaddwd               m7, m3, m5
+    pmaddwd               m5, m0
+    paddd                 m7, m6
+    paddd                 m5, m6
+    psrad                 m7, 12
+    psrad                 m5, 12
+    packssdw              m7, m5                     ;low:out4  high:-out11
+    pmaddwd               m5, m3, m1
+    pmaddwd               m1, m0
+    paddd                 m5, m6
+    paddd                 m1, m6
+    psrad                 m5, 12
+    psrad                 m1, 12
+    packssdw              m5, m1                     ;low:-out5 high:out10
+    mova                  m0, [coeffq+16*4]
+    mova                  m3, [coeffq+16*5]
+    ret
+ALIGN function_align
+.main_pass2_end:
     mova                  m7, [o(pw_2896x8)]
+    punpckhqdq            m6, m2, m1                 ;low:t11   high:t15a
+    punpcklqdq            m2, m1                     ;low:t10   high:t14a
+    psubsw                m1, m2, m6
+    paddsw                m2, m6
     punpckhqdq            m6, m4, m5                 ;low:t3a   high:t7
     punpcklqdq            m4, m5                     ;low:t2a   high:t6
     psubsw                m5, m4, m6
@@ -2298,6 +2395,7 @@
 cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_7ROWS        coeffq, 16
     call m(iadst_16x4_internal).main
+    call m(iadst_16x4_internal).main_pass1_end
 
     punpcklwd             m6, m7, m0                 ;packed  out11,  out15
     punpckhwd             m0, m7                     ;packed  -out0,  -out4
@@ -2360,7 +2458,7 @@
 %endmacro
 
 %macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x16, 8, 16*12
+    INV_TXFM_FN          %1, %2, %3, 8x16, 8, 16*16
 %ifidn %1_%2, dct_dct
     pshuflw              m0, [coeffq], q0000
     punpcklwd            m0, m0
@@ -2548,6 +2646,7 @@
     mova                    m7, [coeffq+16*11]
 
     call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass2_end
 
     mov                    r3, dstq
     lea                  dstq, [dstq+strideq*8]
@@ -2599,6 +2698,7 @@
     mova                    m7, [coeffq+16*11]
 
     call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass2_end
     jmp  m(iflipadst_8x8_internal).end
 
 .end:
@@ -2652,7 +2752,7 @@
 
 
 %macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x8, 8, 16*12
+    INV_TXFM_FN          %1, %2, %3, 16x8, 8, 16*16
 %ifidn %1_%2, dct_dct
     movd                 m1, [o(pw_2896x8)]
     pmulhrsw             m0, m1, [coeffq]
@@ -2893,6 +2993,7 @@
     pmulhrsw                m7,     [coeffq+16*13]
 
     call .main
+    call .main_pass1_end
     mov                    r3, tx2q
     lea                  tx2q, [o(m(iadst_16x8_internal).pass1_end)]
     jmp m(iadst_8x8_internal).pass1_end
@@ -2998,23 +3099,15 @@
     mova  [rsp+gprsize*2+16*6], m3                       ;-out3
     psubsw                  m3, m0, m4                   ;t7
     paddsw                  m0, m4                       ;out12
-    mova                    m7, [o(pw_2896x8)]
-    psubsw                  m4, m2, m3
-    paddsw                  m2, m3
+    mova [rsp+gprsize*2+16*12], m3
     mova                    m3, [rsp+gprsize*2+16*7]     ;t3
-    pmulhrsw                m4, m7                       ;-out11
-    pmulhrsw                m2, m7                       ;out4
-    mova  [rsp+gprsize*2+16*7], m2                       ;out4
+    mova [rsp+gprsize*2+16* 7], m2                       ;out4
     psubsw                  m2, m5, m3                   ;t3a
     paddsw                  m5, m3                       ;-out15
-    psubsw                  m3, m1, m2
-    paddsw                  m1, m2
+    mova [rsp+gprsize*2+16*11], m2
     mova                    m2, [rsp+gprsize*2+32*5]     ;t15
-    pmulhrsw                m3, m7                       ;out8
-    pmulhrsw                m1, m7                       ;-out7
-    mova [rsp+gprsize*2+32*5 ], m1                       ;-out7
+    mova [rsp+gprsize*2+16*10], m1                       ;-out7
     mova                    m1, [rsp+gprsize*2+16*0]     ;t11
-    mova [rsp+gprsize*2+16*11], m3                       ;out8
     mova [rsp+gprsize*2+16*0 ], m5                       ;-out15
     mova                    m3, [rsp+gprsize*2+16*1]     ;t10
     mova [rsp+gprsize*2+16*1 ], m4                       ;-out11
@@ -3044,26 +3137,106 @@
     paddsw                  m2, m6                       ;-out1
     paddsw                  m6, m4, m1                   ;out14
     psubsw                  m4, m1                       ;t11
-    psubsw                  m1, m3, m4
-    paddsw                  m3, m4
-    pmulhrsw                m1, m7                       ;-out9
-    pmulhrsw                m3, m7                       ;out6
-    mova  [rsp+gprsize*2+16*4], m2                       ;-out1
+    mova [rsp+gprsize*2+16*14], m4
+    mova [rsp+gprsize*2+16* 4], m2                       ;-out1
     mova                    m4, [rsp+gprsize*2+16*8]     ;t14
     mova                    m2, [rsp+gprsize*2+16*9]     ;t15
-    mova  [rsp+gprsize*2+16*9], m3                       ;out6
+    mova [rsp+gprsize*2+16* 9], m3                       ;out6
     psubsw                  m3, m0, m4                   ;t14a
     paddsw                  m0, m4                       ;out2
     psubsw                  m4, m5, m2                   ;t15a
     paddsw                  m5, m2                       ;-out13
+    mova [rsp+gprsize*2+16* 5], m0                       ;out2
+    ret
+ALIGN function_align
+.main_pass1_end:                                         ;pass-1 tail: final 2896-scaled butterflies done in 32 bits via pmaddwd
+    mova                    m0, [rsp+gprsize*2+16*14]    ;t11, spilled by .main
+    mova [rsp+gprsize*2+16*14], m5                       ;park -out13; reloaded before ret
+    mova [rsp+gprsize*2+16*15], m6                       ;park out14; reloaded before ret
+    mova                    m5, [o(pw_2896_2896)]        ;pmaddwd multiplier for a+b (word pair 2896,2896 per its name)
+    mova                    m6, [o(pw_2896_m2896)]       ;pmaddwd multiplier for a-b (word pair 2896,-2896 per its name)
+    mova                    m7, [o(pd_2048)]             ;rounding term added before each >>12
+    punpcklwd               m2, m3, m4                   ;t14a/t15a word pairs, low half
+    punpckhwd               m3, m4                       ;t14a/t15a word pairs, high half
+    pmaddwd                 m4, m5, m2                   ;sum, low dwords
+    pmaddwd                 m2, m6                       ;difference, low dwords
+    pmaddwd                 m1, m5, m3                   ;sum, high dwords
+    pmaddwd                 m3, m6                       ;difference, high dwords
+    REPX         {paddd x, m7}, m4, m2, m1, m3
+    REPX         {psrad x, 12}, m4, m1, m2, m3
+    packssdw                m4, m1                       ;-out5
+    packssdw                m2, m3                       ;out10
+    mova [rsp+gprsize*2+16* 8], m4                       ;-out5 goes to slot 8; m2 keeps out10
+    mova                    m3, [rsp+gprsize*2+16* 9]    ;first operand of out6/-out9, stored by .main
+    punpcklwd               m1, m3, m0                   ;pair with t11, low half
+    punpckhwd               m3, m0                       ;pair with t11, high half
+    pmaddwd                 m0, m5, m1                   ;sum, low dwords
+    pmaddwd                 m1, m6                       ;difference, low dwords
+    pmaddwd                 m4, m5, m3                   ;sum, high dwords
+    pmaddwd                 m3, m6                       ;difference, high dwords
+    REPX         {paddd x, m7}, m0, m1, m4, m3
+    REPX         {psrad x, 12}, m0, m4, m1, m3
+    packssdw                m0, m4                       ;out6
+    packssdw                m1, m3                       ;-out9
+    mova [rsp+gprsize*2+16* 9], m0                       ;out6 goes to slot 9; m1 keeps -out9
+    mova                    m0, [rsp+gprsize*2+16* 7]    ;first operand of out4/-out11, stored by .main
+    mova                    m4, [rsp+gprsize*2+16*12]    ;t7, spilled by .main
+    punpcklwd               m3, m0, m4                   ;pair with t7, low half
+    punpckhwd               m0, m4                       ;pair with t7, high half
+    pmaddwd                 m4, m5, m3                   ;sum, low dwords
+    pmaddwd                 m3, m6                       ;difference, low dwords
+    pmaddwd                 m5, m0                       ;sum, high dwords (overwrites the multiplier held in m5)
+    pmaddwd                 m0, m6                       ;difference, high dwords
+    REPX         {paddd x, m7}, m4, m3, m5, m0
+    REPX         {psrad x, 12}, m4, m5, m3, m0
+    packssdw                m4, m5                       ;out4
+    packssdw                m3, m0                       ;-out11
+    mova [rsp+gprsize*2+16* 7], m4                       ;out4 goes to slot 7; m3 keeps -out11
+    mova                    m4, [rsp+gprsize*2+16*10]    ;first operand of -out7/out8, stored by .main
+    mova                    m5, [rsp+gprsize*2+16*11]    ;t3a, spilled by .main
+    punpcklwd               m0, m4, m5                   ;pair with t3a, low half
+    punpckhwd               m4, m5                       ;pair with t3a, high half
+    pmaddwd                 m5, m0, [o(pw_2896_2896)]    ;sum, low dwords (m5 no longer holds the multiplier, so it is read from memory)
+    pmaddwd                 m0, m6                       ;difference, low dwords
+    pmaddwd                 m6, m4                       ;difference, high dwords (overwrites the multiplier held in m6)
+    pmaddwd                 m4, [o(pw_2896_2896)]        ;sum, high dwords
+    REPX         {paddd x, m7}, m5, m0, m6, m4
+    REPX         {psrad x, 12}, m0, m6, m5, m4
+    packssdw                m0, m6                       ;out8
+    packssdw                m5, m4                       ;-out7
+    mova [rsp+gprsize*2+16*10], m5                       ;-out7 goes to slot 10; m0 keeps out8
+    mova                    m4, [rsp+gprsize*2+16* 2]    ;out12
+    mova                    m5, [rsp+gprsize*2+16*14]    ;-out13
+    mova                    m6, [rsp+gprsize*2+16*15]    ;out14
+    ret
+ALIGN function_align
+.main_pass2_end:                                         ;pass-2 tail: same final butterflies, kept in saturating 16-bit arithmetic
+    mova                    m7, [o(pw_2896x8)]           ;pmulhrsw multiplier (presumably 2896*8, i.e. a 2896/4096 scale - confirm)
+    mova                    m1, [rsp+gprsize*2+16* 9]    ;first operand of out6/-out9, stored by .main
+    mova                    m2, [rsp+gprsize*2+16*14]    ;t11, spilled by .main
+    paddsw                  m0, m1, m2                   ;sum, scaled below into out6
+    psubsw                  m1, m2                       ;difference, scaled below into -out9
+    pmulhrsw                m0, m7                       ;out6
+    pmulhrsw                m1, m7                       ;-out9
+    mova [rsp+gprsize*2+16* 9], m0                       ;out6 goes to slot 9; m1 keeps -out9
     psubsw                  m2, m3, m4
     paddsw                  m3, m4
-    mova  [rsp+gprsize*2+16*5], m0                       ;out2
-    pmulhrsw                m3, m7                       ;-out5
     pmulhrsw                m2, m7                       ;out10
-    mova  [rsp+gprsize*2+16*8], m3                       ;-out5
-    mova                    m0, [rsp+gprsize*2+16*11]    ;out8
-    mova                    m3, [rsp+gprsize*2+16*1 ]    ;-out11
+    pmulhrsw                m3, m7                       ;-out5
+    mova [rsp+gprsize*2+16* 8], m3                       ;-out5 goes to slot 8; m2 keeps out10
+    mova                    m3, [rsp+gprsize*2+16* 7]    ;first operand of out4/-out11, stored by .main
+    mova                    m4, [rsp+gprsize*2+16*12]    ;t7, spilled by .main
+    paddsw                  m0, m3, m4                   ;sum, scaled below into out4
+    psubsw                  m3, m4                       ;difference, scaled below into -out11
+    pmulhrsw                m0, m7                       ;out4
+    pmulhrsw                m3, m7                       ;-out11
+    mova [rsp+gprsize*2+16* 7], m0                       ;out4 goes to slot 7; m3 keeps -out11
+    mova                    m0, [rsp+gprsize*2+16*10]    ;first operand of -out7/out8, stored by .main
+    paddsw                  m4, m0, [rsp+gprsize*2+16*11] ;sum with t3a (slot 11, spilled by .main)
+    psubsw                  m0, [rsp+gprsize*2+16*11]    ;difference with t3a
+    pmulhrsw                m4, m7                       ;-out7
+    pmulhrsw                m0, m7                       ;out8
+    mova [rsp+gprsize*2+16*10], m4                       ;-out7 goes to slot 10; m0 keeps out8
     mova                    m4, [rsp+gprsize*2+16*2 ]    ;out12
     ret
 
@@ -3100,6 +3273,7 @@
     pmulhrsw                m7,     [coeffq+16*13]
 
     call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
 
     mova                    m7, [rsp+gprsize+16*0]
     SAVE_8ROWS     coeffq+16*0, 32
@@ -3184,7 +3358,7 @@
 
 
 %macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x16, 8, 16*12
+    INV_TXFM_FN          %1, %2, %3, 16x16, 8, 16*16
 %ifidn %1_%2, dct_dct
     movd                   m1, [o(pw_2896x8)]
     pmulhrsw               m0, m1, [coeffq]
@@ -3423,6 +3597,7 @@
 cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     ITX_16X16_ADST_LOAD_ODD_COEFS
     call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
 
     mov                     r3, tx2q
     lea                   tx2q, [o(m(iadst_16x16_internal).pass1_end)]
@@ -3441,6 +3616,7 @@
     SAVE_8ROWS     coeffq+16*1, 32
     ITX_16X16_ADST_LOAD_EVEN_COEFS
     call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
 
     lea                   tx2q, [o(m(iadst_16x16_internal).pass1_end2)]
     mova                    m7, [o(pw_8192)]
@@ -3496,6 +3672,7 @@
 cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     ITX_16X16_ADST_LOAD_ODD_COEFS
     call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
 
     mov                     r3, tx2q
     lea                   tx2q, [o(m(iflipadst_16x16_internal).pass1_end)]
@@ -3514,6 +3691,7 @@
     SAVE_8ROWS    coeffq+16*17, 32
     ITX_16X16_ADST_LOAD_EVEN_COEFS
     call m(iadst_16x8_internal).main
+    call m(iadst_16x8_internal).main_pass1_end
 
     mova                    m7, [rsp+gprsize+16*0]
     SAVE_8ROWS     coeffq+16*0, 32