shithub: dav1d

Download patch

ref: 103cd220dd187194eaa0d8846b72efc6b1d3e905
parent: a20b5757c766999bf3078c6c186f93aefce1d59e
author: Henrik Gramner <gramner@twoorioles.com>
date: Thu Oct 24 20:17:56 EDT 2019

x86: Fix overflows in inverse identity SSSE3 transforms

--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -73,6 +73,8 @@
 pw_4096:        times 8 dw  4096
 pw_16384:       times 8 dw  16384
 pw_m16384:      times 8 dw  -16384
+pw_1697x16:     times 8 dw  1697*16   ; = 27152; as a pmulhrsw factor: 27152/32768 ~= 0.8286 (~ 2*sqrt(2)-2)
+pw_1697x8:      times 8 dw  1697*8    ; = 13576; as a pmulhrsw factor: 13576/32768 ~= 0.4143 (~ sqrt(2)-1)
 pw_2896x8:      times 8 dw  2896*8
 pw_3344x8:      times 8 dw  3344*8
 pw_5793x4:      times 8 dw  5793*4
@@ -273,8 +275,8 @@
 %ifidn %1_%2, dct_identity
     mova                 m0, [o(pw_2896x8)]
     pmulhrsw             m0, [coeffq]
-    paddw                m0, m0
-    pmulhrsw             m0, [o(pw_5793x4)]
+    pmulhrsw             m1, m0, [o(pw_1697x8)]   ; m1 = round(m0 * 1697*8 / 2^15) ~= 0.4143 * m0
+    paddw                m0, m1                   ; m0 ~= 1.4143 * m0, without the 16-bit doubling of m0 the removed code did first
     punpcklwd            m0, m0
     punpckhdq            m1, m0, m0
     punpckldq            m0, m0
@@ -286,8 +288,8 @@
     punpckhwd            m1, m2
     punpcklwd            m0, m1
     punpcklqdq           m0, m0
-    paddw                m0, m0
-    pmulhrsw             m0, [o(pw_5793x4)]
+    pmulhrsw             m1, m0, [o(pw_1697x8)]   ; m1 = round(m0 * 1697*8 / 2^15) ~= 0.4143 * m0
+    paddw                m0, m1                   ; m0 ~= 1.4143 * m0 (m1 is rewritten by the mova after the next pmulhrsw)
     pmulhrsw             m0, [o(pw_2896x8)]
     mova                 m1, m0
     TAIL_CALL m(iadst_4x4_internal).end
@@ -434,12 +436,11 @@
 cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m0, [coeffq+16*0]
     mova                 m1, [coeffq+16*1]
-    mova                 m2, [o(pw_5793x4)]
-    paddw                m0, m0
-    paddw                m1, m1
-    pmulhrsw             m0, m2
-    pmulhrsw             m1, m2
-
+    mova                 m3, [o(pw_1697x8)]       ; m3 = 8 words of 1697*8
+    pmulhrsw             m2, m0, m3               ; m2 = round(m0 * 1697*8 / 2^15)
+    pmulhrsw             m3, m1                   ; m3 = round(m1 * 1697*8 / 2^15); overwrites the constant (last use)
+    paddw                m0, m2                   ; m0 ~= 1.4143 * m0
+    paddw                m1, m3                   ; m1 ~= 1.4143 * m1
     punpckhwd            m2, m0, m1
     punpcklwd            m0, m1
     punpckhwd            m1, m0, m2            ;high: in3 ;low :in2
@@ -447,11 +448,11 @@
     jmp                tx2q
 
 .pass2:
-    mova                 m2, [o(pw_5793x4)]
-    paddw                m0, m0
-    paddw                m1, m1
-    pmulhrsw             m0, m2
-    pmulhrsw             m1, m2
+    mova                 m3, [o(pw_1697x8)]       ; m3 = 8 words of 1697*8
+    pmulhrsw             m2, m3, m0               ; m2 = round(m0 * 1697*8 / 2^15)
+    pmulhrsw             m3, m1                   ; m3 = round(m1 * 1697*8 / 2^15); overwrites the constant (last use)
+    paddw                m0, m2                   ; m0 ~= 1.4143 * m0
+    paddw                m1, m3                   ; m1 ~= 1.4143 * m1
     jmp m(iadst_4x4_internal).end
 
 %macro IWHT4_1D_PACKED 0
@@ -609,8 +610,8 @@
     mova                 m2, [o(pw_2896x8)]
     punpckldq            m0, m1
     pmulhrsw             m0, m2
-    paddw                m0, m0
-    pmulhrsw             m0, [o(pw_5793x4)]
+    pmulhrsw             m1, m0, [o(pw_1697x8)]   ; m1 = round(m0 * 1697*8 / 2^15); m1 was already merged into m0 by punpckldq above
+    paddw                m0, m1                   ; m0 ~= 1.4143 * m0
     pmulhrsw             m0, m2
     pmulhrsw             m0, [o(pw_2048)]
     punpcklqdq           m0, m0
@@ -828,16 +829,15 @@
     pmulhrsw             m3,     [coeffq+16*3]
 
 .pass1:
-    mova                 m5, [o(pw_5793x4)]
-    paddw                m0, m0
-    paddw                m1, m1
-    paddw                m2, m2
-    paddw                m3, m3
-    pmulhrsw             m0, m5
-    pmulhrsw             m1, m5
-    pmulhrsw             m2, m5
-    pmulhrsw             m3, m5
-
+    mova                 m7, [o(pw_1697x8)]       ; m7 = 8 words of 1697*8
+    pmulhrsw             m4, m7, m0               ; m4 = round(m0 * 1697*8 / 2^15)
+    pmulhrsw             m5, m7, m1               ; m5 = round(m1 * 1697*8 / 2^15)
+    pmulhrsw             m6, m7, m2               ; m6 = round(m2 * 1697*8 / 2^15)
+    pmulhrsw             m7, m3                   ; m7 = round(m3 * 1697*8 / 2^15); overwrites the constant (last use)
+    paddw                m0, m4                   ; m0..m3 each become ~1.4143 * their input
+    paddw                m1, m5
+    paddw                m2, m6
+    paddw                m3, m7
     jmp m(iadst_4x8_internal).pass1_end
 
 .pass2:
@@ -880,8 +880,8 @@
     mova                 m0, [o(pw_2896x8)]
     pmulhrsw             m1, m0, [coeffq]
     pmulhrsw             m1, m0
-    paddw                m1, m1
-    pmulhrsw             m1, [o(pw_5793x4)]
+    pmulhrsw             m0, m1, [o(pw_1697x8)]   ; m0 = round(m1 * 1697*8 / 2^15); m0 held pw_2896x8 until here and is reused as scratch
+    paddw                m1, m0                   ; m1 ~= 1.4143 * m1
     pmulhrsw             m1, [o(pw_2048)]
     punpcklwd            m1, m1
     punpckhdq            m2, m1, m1
@@ -1180,15 +1180,15 @@
     jmp                tx2q
 
 .pass2:
-    mova                 m4, [o(pw_5793x4)]
-    paddw                m0, m0
-    paddw                m1, m1
-    paddw                m2, m2
-    paddw                m3, m3
-    pmulhrsw             m0, m4
-    pmulhrsw             m1, m4
-    pmulhrsw             m2, m4
-    pmulhrsw             m3, m4
+    mova                 m7, [o(pw_1697x8)]       ; m7 = 8 words of 1697*8
+    pmulhrsw             m4, m7, m0               ; m4 = round(m0 * 1697*8 / 2^15)
+    pmulhrsw             m5, m7, m1               ; m5 = round(m1 * 1697*8 / 2^15)
+    pmulhrsw             m6, m7, m2               ; m6 = round(m2 * 1697*8 / 2^15)
+    pmulhrsw             m7, m3                   ; m7 = round(m3 * 1697*8 / 2^15); overwrites the constant (last use)
+    paddw                m0, m4                   ; m0..m3 each become ~1.4143 * their input
+    paddw                m1, m5
+    paddw                m2, m6
+    paddw                m3, m7
     jmp m(iadst_8x4_internal).end
 
 %macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
@@ -1635,14 +1635,16 @@
     pmulhrsw             m0, [coeffq+16*0]
     pmulhrsw             m1, [coeffq+16*1]
     mova                 m2, [o(pw_16384)]
-    mova                 m3, [o(pw_5793x4)]
+    mova                 m3, [o(pw_1697x16)]      ; m3 = 8 words of 1697*16
     mova                 m4, [o(pw_2048)]
     pmulhrsw             m0, m2
     pmulhrsw             m1, m2
-    psllw                m0, 2
-    psllw                m1, 2
-    pmulhrsw             m0, m3
-    pmulhrsw             m1, m3
+    pmulhrsw             m2, m3, m0               ; m2 = round(m0 * 1697*16 / 2^15) ~= 0.8286 * m0 (m2's pw_16384 is no longer needed)
+    pmulhrsw             m3, m1                   ; m3 = round(m1 * 1697*16 / 2^15); overwrites the constant (last use)
+    paddw                m0, m0                   ; m0 = 2 * m0
+    paddw                m1, m1                   ; m1 = 2 * m1
+    paddw                m0, m2                   ; m0 ~= 2.8286 * original m0 (replaces the removed "<< 2" then "* 5793*4 / 2^15")
+    paddw                m1, m3                   ; m1 ~= 2.8286 * original m1
     pmulhrsw             m0, m4
     pmulhrsw             m4, m1
     punpckhwd            m2, m0, m0
@@ -1664,12 +1666,11 @@
     punpcklwd             m0, [coeffq+32*1]
     movd                  m1, [coeffq+32*2]
     punpcklwd             m1, [coeffq+32*3]
-    mova                  m2, [o(pw_5793x4)]
     mova                  m3, [o(pw_16384)]
-    mova                  m4, [o(pw_2896x8)]
     punpckldq             m0, m1
-    paddw                 m0, m0
-    pmulhrsw              m0, m2
+    pmulhrsw              m1, m0, [o(pw_1697x8)]  ; m1 = round(m0 * 1697*8 / 2^15); m1 is free after the punpckldq above
+    mova                  m4, [o(pw_2896x8)]      ; same constant load as the removed line, moved after the multiply
+    paddw                 m0, m1                  ; m0 ~= 1.4143 * m0
     pmulhrsw              m0, m3
     psrlw                 m3, 3                ; pw_2048
     pmulhrsw              m0, m4
@@ -1885,17 +1886,27 @@
 INV_TXFM_4X16_FN identity, flipadst
 INV_TXFM_4X16_FN identity, identity
 
+%macro IDTX16 3 ; src/dst, tmp, pw_1697x16
+    pmulhrsw             m%2, m%3, m%1            ; tmp = round(src * constant / 2^15); callers may pass the constant register as tmp, which destroys the constant
+    paddw                m%1, m%1                 ; dst = 2 * src
+    paddw                m%1, m%2                 ; dst = 2 * src + tmp, ~2.8286 * src when the constant is pw_1697x16
+%endmacro
+
 cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     lea                   r3, [o(m(iidentity_4x8_internal).pass1)]
     jmp   m(idct_4x16_internal).pass1
 
 .pass2:
-    mova                  m7, [o(pw_5793x4)]
-    REPX    {psllw    x, 2 }, m0, m1, m2, m3, m4, m5, m6
-    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
-    psllw                 m7, [coeffq+16*7], 2
-    pmulhrsw              m7, [o(pw_5793x4)]
-    mova       [coeffq+16*7], m7
+    mova                  m7, [o(pw_1697x16)]     ; m7 = constant operand for IDTX16
+    mova       [coeffq+16*6], m6                  ; spill m6 to the coefficient buffer so it can serve as the IDTX16 temporary
+    REPX    {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5    ; IDTX16 on m0-m5 (presumably REPX substitutes x with each listed argument)
+    mova                  m6, [coeffq+16*7]       ; load the row kept in memory at coeffq+16*7
+    IDTX16                 6, 7, 7                ; scale it; tmp aliases m7, so m7 no longer holds the constant afterwards
+    mova       [coeffq+16*7], m6                  ; store the scaled row back
+    mova                  m6, [coeffq+16*6]       ; reload the m6 value spilled above
+    pmulhrsw              m7, m6, [o(pw_1697x16)] ; open-coded IDTX16 with the constant read from memory
+    paddw                 m6, m6
+    paddw                 m6, m7                  ; m6 = 2 * m6 + round(m6 * 1697*16 / 2^15)
 
     mova                  m7, [o(pw_2048)]
     REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
@@ -1913,8 +1924,8 @@
     mova                 m0, [o(pw_16384)]
     pmulhrsw             m3, m0
     psrlw                m0, 3                ; pw_2048
-    paddw                m3, m3
-    pmulhrsw             m3, [o(pw_5793x4)]
+    pmulhrsw             m1, m3, [o(pw_1697x8)]   ; m1 = round(m3 * 1697*8 / 2^15)
+    paddw                m3, m1                   ; m3 ~= 1.4143 * m3
     pmulhrsw             m3, m0
     punpcklwd            m3, m3
     pshufd               m0, m3, q0000
@@ -1927,28 +1938,28 @@
     mov                dstq, tx2q
     TAIL_CALL m(iadst_8x4_internal).end2
 %elifidn %1_%2, identity_dct
+    mova                 m4, [o(pw_1697x16)]      ; m4 now holds a constant, so the loop below no longer uses m4 as a temporary
     mova                 m5, [o(pw_16384)]
-    mova                 m6, [o(pw_5793x4)]
-    mova                 m7, [o(pw_2896x8)]
+    mova                 m6, [o(pw_2896x8)]
     mov                 r3d, 2
+    psrlw                m7, m5, 3 ; pw_2048 (16384 >> 3), computed once here instead of inside .main_loop
 .main_loop:
     mova                 m0, [coeffq+16*0]
     mova                 m1, [coeffq+16*1]
-    mova                 m2, [coeffq+16*2]
-    mova                 m3, [coeffq+16*3]
-    punpckhwd            m4, m0, m1
+    punpckhwd            m2, m0, m1               ; interleave rows 0/1 first, using m2 as the temporary
     punpcklwd            m0, m1
-    punpckhwd            m1, m2, m3
-    punpcklwd            m2, m3
-    punpcklwd            m0, m4
-    punpcklwd            m2, m1
-    punpcklqdq           m0, m2
-    psllw                m0, 2
+    punpcklwd            m0, m2
+    mova                 m1, [coeffq+16*2]        ; rows 2/3 are loaded only now, reusing m1/m2
+    mova                 m2, [coeffq+16*3]
+    punpckhwd            m3, m1, m2
+    punpcklwd            m1, m2
+    punpcklwd            m1, m3
+    punpcklqdq           m0, m1                   ; m0 = low qword of the rows 0/1 result : low qword of the rows 2/3 result
+    pmulhrsw             m1, m4, m0               ; m1 = round(m0 * 1697*16 / 2^15) ~= 0.8286 * m0
+    pmulhrsw             m1, m5                   ; m1 = round(m1 / 2)  (16384 / 2^15)
+    paddw                m0, m1                   ; m0 ~= 1.4143 * m0; replaces the removed "<< 2, * 5793*4, * 16384" chain
     pmulhrsw             m0, m6
-    pmulhrsw             m0, m5
-    psrlw                m1, m5, 3               ; pw_2048
     pmulhrsw             m0, m7
-    pmulhrsw             m0, m1
 .end:
     pxor                 m3, m3
     mova      [coeffq+16*0], m3
@@ -2412,22 +2423,56 @@
 INV_TXFM_16X4_FN identity, identity
 
 cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
-    LOAD_7ROWS        coeffq, 16
-    mova                  m7, [o(pw_5793x4)]
-    REPX    {psllw    x, 2 }, m0, m1, m2, m3, m4, m5, m6
-    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    mova                  m1, [coeffq+16*6]       ; rows 6, 5 and 7 are scaled first and written back to memory
+    mova                  m0, [coeffq+16*5]
+    mova                  m2, [coeffq+16*7]
+    mova                  m6, [o(pw_1697x16)]     ; m6 = 8 words of 1697*16
+    mova                  m7, [o(pw_16384)]       ; m7 = 8 words of 16384 (pmulhrsw by it is a rounded halving)
+    pmulhrsw              m4, m6, m1              ; t = round(row * 1697*16 / 2^15)
+    pmulhrsw              m3, m6, m0
+    pmulhrsw              m5, m6, m2
+    pmulhrsw              m4, m7                  ; t = round(t / 2)
+    pmulhrsw              m3, m7
+    pmulhrsw              m5, m7
+    paddw                 m1, m4                  ; row 6 += t  (~1.4143 * row)
+    paddw                 m0, m3                  ; row 5 += t
+    paddw                 m5, m2                  ; scaled row 7 ends up in m5 (sum written to the temporary)
+    mova                  m2, [coeffq+16*2]
+    mova                  m3, [coeffq+16*3]
+    mova                  m4, [coeffq+16*4]
+    mova       [coeffq+16*6], m1                  ; park scaled rows 6, 5, 7 in the coefficient buffer
+    mova       [coeffq+16*5], m0
+    mova       [coeffq+16*7], m5
+    pmulhrsw              m0, m6, m2              ; same three-step scaling for rows 2, 3, 4
+    pmulhrsw              m1, m6, m3
+    pmulhrsw              m5, m6, m4
+    pmulhrsw              m0, m7
+    pmulhrsw              m1, m7
+    pmulhrsw              m5, m7
+    paddw                 m2, m0
+    paddw                 m3, m1
+    paddw                 m4, m5
+    mova                  m0, [coeffq+16*0]
+    mova                  m1, [coeffq+16*1]
+    pmulhrsw              m5, m6, m0              ; rows 0 and 1
+    pmulhrsw              m6, m1                  ; overwrites the pw_1697x16 constant (last use)
+    pmulhrsw              m5, m7
+    pmulhrsw              m6, m7
+    paddw                 m0, m5
+    paddw                 m1, m6
+    mova                  m6, [coeffq+16*6]       ; reload scaled rows 6 and 5 parked above
+    mova                  m5, [coeffq+16*5]
     punpckhwd             m7, m0, m2                 ;packed out1,  out5
     punpcklwd             m0, m2                     ;packed out0,  out4
     punpckhwd             m2, m1, m3                 ;packed out3,  out7
     punpcklwd             m1, m3                     ;packed out2,  out6
     mova       [coeffq+16*6], m7
-    psllw                 m7, [coeffq+16*7], 2
-    pmulhrsw              m7, [o(pw_5793x4)]
+    mova                  m7, [coeffq+16*7]       ; row 7 was already scaled and stored above, so it is only reloaded here
     punpckhwd             m3, m4, m6                 ;packed out9,  out13
     punpcklwd             m4, m6                     ;packed out8,  out12
     punpckhwd             m6, m5, m7                 ;packed out11, out15
     punpcklwd             m5, m7                     ;packed out10, out14
-    jmp   m(idct_16x4_internal).pass1_end2
+    jmp   m(idct_16x4_internal).pass1_end3        ; target changed from .pass1_end2; NOTE(review): presumably skips a rounding step now done above - confirm against idct_16x4_internal
 
 .pass2:
     lea                 tx2q, [o(m(iidentity_8x4_internal).pass2)]
@@ -2475,8 +2520,9 @@
     pmulhrsw             m7, m0
     pmulhrsw             m7, m1
     psrlw                m1, 3          ; pw_2048
-    psllw                m7, 2
-    pmulhrsw             m7, [o(pw_5793x4)]
+    pmulhrsw             m0, m7, [o(pw_1697x16)]  ; m0 = round(m7 * 1697*16 / 2^15); m0 is rewritten by the punpcklwd below
+    paddw                m7, m7                   ; m7 = 2 * m7
+    paddw                m7, m0                   ; m7 ~= 2.8286 * original m7
     pmulhrsw             m7, m1
     punpcklwd            m0, m7, m7
     punpckhwd            m7, m7
@@ -2720,16 +2766,21 @@
     lea                  tx2q, [o(m(iidentity_8x16_internal).end1)]
 
 .end:
-    REPX     {psllw    x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
-    pmulhrsw               m7, [o(pw_5793x4)]
-    pmulhrsw               m7, [o(pw_2048)]
     mova   [rsp+gprsize+16*0], m7
-    mova                   m7, [o(pw_5793x4)]
-    REPX     {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    mova   [rsp+gprsize+16*1], m6                 ; spill m6: it is the IDTX16 temporary for the next REPX
+    mova                   m7, [o(pw_1697x16)]    ; m7 = constant operand for IDTX16 (m7's data was spilled to slot 0 above)
+    REPX     {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5   ; IDTX16 on m0-m5 (presumably REPX substitutes x with each listed argument)
+    mova                   m6, [rsp+gprsize+16*1] ; reload the spilled m6
+    mova   [rsp+gprsize+16*2], m5                 ; park the already-scaled m5 so m5 can serve as temporary
+    IDTX16                  6, 5, 7               ; scale m6, tmp = m5
+    mova                   m5, [rsp+gprsize+16*0] ; m5 = the value spilled from m7 at the top
+    IDTX16                  5, 7, 7               ; scale it; tmp aliases m7, destroying the constant (last use)
     mova                   m7, [o(pw_2048)]
     REPX     {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pmulhrsw               m7, [rsp+gprsize+16*2] ; m7 = round(parked m5 * 2048 / 2^15)
+    mova   [rsp+gprsize+16*0], m5                 ; slot 0 <- result for the original m7
     mova   [rsp+gprsize+16*1], m6
-    mova   [rsp+gprsize+16*2], m5
+    mova   [rsp+gprsize+16*2], m7                 ; slot 2 <- result for the original m5
     jmp  m(idct_8x8_internal).end3
 
 .end1:
@@ -2787,10 +2838,10 @@
     mov                dstq, tx2q
     TAIL_CALL m(iadst_8x4_internal).end2
 %elifidn %1_%2, identity_dct
-    mova                 m5, [o(pw_16384)]
-    mova                 m6, [o(pw_5793x4)]
-    mova                 m7, [o(pw_2896x8)]
-    pxor                 m4, m4
+    mova                 m4, [o(pw_2896x8)]       ; m4 is a constant now; it is no longer the zero register
+    mova                 m5, [o(pw_1697x16)]
+    mova                 m6, [o(pw_16384)]
+    psrlw                m7, m6, 3 ; pw_2048 (16384 >> 3), computed once before .main_loop
     mov                 r3d, 2
 .main_loop:
     mova                 m0, [coeffq+16*0]
@@ -2797,22 +2848,22 @@
     punpcklwd            m0, [coeffq+16*1]
     mova                 m1, [coeffq+16*2]
     punpcklwd            m1, [coeffq+16*3]
-    mova                 m2, [coeffq+16*4]
-    punpcklwd            m2, [coeffq+16*5]
-    mova                 m3, [coeffq+16*6]
-    punpcklwd            m3, [coeffq+16*7]
     punpckldq            m0, m1
-    punpckldq            m2, m3
-    punpcklqdq           m0, m2
+    mova                 m1, [coeffq+16*4]        ; rows 4-7 are gathered after rows 0-3 were merged, reusing m1/m2
+    punpcklwd            m1, [coeffq+16*5]
+    mova                 m2, [coeffq+16*6]
+    punpcklwd            m2, [coeffq+16*7]
+    punpckldq            m1, m2
+    punpcklqdq           m0, m1
+    pmulhrsw             m0, m4                   ; m0 = round(m0 * 2896*8 / 2^15)  (m4 = pw_2896x8, loaded before .main_loop)
+    pmulhrsw             m1, m5, m0               ; m1 = round(m0 * 1697*16 / 2^15)
+    pmulhrsw             m1, m6                   ; m1 = round(m1 / 2)  (16384 / 2^15)
+    paddw                m0, m1                   ; m0 ~= 1.4143 * m0
+    pmulhrsw             m0, m4                   ; second multiply by 2896*8 / 2^15
     pmulhrsw             m0, m7
-    psllw                m0, 2
-    pmulhrsw             m0, m6
-    pmulhrsw             m0, m5
-    psrlw                m1, m5, 3               ; pw_2048
-    pmulhrsw             m0, m7
-    pmulhrsw             m0, m1
 .end:
-    REPX  {mova [coeffq+16*x], m4},  0,  1,  2,  3,  4,  5,  6,  7
+    pxor                 m1, m1                   ; m1 = 0, zeroed here because m4 now holds a constant
+    REPX {mova [coeffq+16*x], m1}, 0, 1, 2, 3, 4, 5, 6, 7
     add              coeffq, 16*8
     lea                tx2q, [dstq+8]
     WRITE_8X4             0, 0, 0, 0, 1, 2, 3
@@ -3292,40 +3343,66 @@
 INV_TXFM_16X8_FN identity, identity
 
 cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
-    LOAD_8ROWS    coeffq+16*8, 16, 1
-
+    add                coeffq, 16*16              ; bias coeffq; .pass1 subtracts 8*16 again before touching memory
+    mova                   m4, [coeffq-16*7]      ; = original coeffq + 16*9
+    mova                   m5, [coeffq-16*5]      ; = original coeffq + 16*11
+    mova                   m6, [coeffq-16*3]      ; = original coeffq + 16*13
+    mova                   m7, [coeffq-16*1]      ; = original coeffq + 16*15
     mov                    r3, tx2q
     lea                  tx2q, [o(m(iidentity_16x8_internal).pass1_end)]
 
 .pass1:
-    REPX     {psllw    x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
-    pmulhrsw               m7, [o(pw_5793x4)]
-    mova   [rsp+gprsize+16*0], m7
-
-    mova                   m7, [o(pw_5793x4)]
-    REPX     {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
-
-    jmp   m(idct_8x8_internal).pass1_end
-
-.pass1_end:
-    mova       [coeffq+16*9 ], m4
-    mova       [coeffq+16*11], m5
-    mova       [coeffq+16*13], m6
-    mova       [coeffq+16*15], m7
-    mova                   m4, [o(pw_2896x8)]
-    pmulhrsw               m5, m4, [coeffq+16*5]
-    pmulhrsw               m6, m4, [coeffq+16*6]
-    pmulhrsw               m7, m4, [coeffq+16*7]
-    mova       [coeffq+16*5 ], m2
-    mova       [coeffq+16*7 ], m3
-    pmulhrsw               m2, m4, [coeffq+16*2]
-    pmulhrsw               m3, m4, [coeffq+16*3]
-    mova       [coeffq+16*3 ], m1
-    pmulhrsw               m1, m4, [coeffq+16*1]
-    mova       [coeffq+16*1 ], m0
-    pmulhrsw               m0, m4, [coeffq+16*0]
-    pmulhrsw               m4, [coeffq+16*4]
+    mova                   m0, [o(pw_2896x8)]     ; m0 = 8 words of 2896*8
+    mova                   m2, [o(pw_1697x16)]    ; m2 = 8 words of 1697*16
+    mova                   m3, [o(pw_16384)]      ; m3 = 8 words of 16384 (pmulhrsw by it is a rounded halving)
+    sub                coeffq, 8*16               ; the even rows are read from coeffq+16*{0,2,4,6} after this
+    REPX     {pmulhrsw x, m0}, m4, m5, m6, m7     ; pre-scale the four rows already in registers by 2896*8 / 2^15
+    pmulhrsw               m1, m2, m4             ; pattern per row: t = round(row * 1697*16 / 2^15)
+    pmulhrsw               m1, m3                 ;                  t = round(t / 2)
+    paddw                  m1, m4 ; 1
+    pmulhrsw               m4, m2, m5
+    pmulhrsw               m4, m3
+    paddw                  m4, m5 ; 3
+    pmulhrsw               m5, m2, m6
+    pmulhrsw               m5, m3
+    paddw                  m5, m6 ; 5
+    pmulhrsw               m6, m2, m7
+    pmulhrsw               m6, m3
+    paddw                  m7, m6 ; 7
+    pmulhrsw               m6, m0, [coeffq+16*6]  ; load row at +16*6 with the 2896*8 / 2^15 pre-scale applied
+    mova   [rsp+gprsize+16*0], m4                 ; park result "3" on the stack; it is reloaded into m3 at the end
+    pmulhrsw               m4, m2, m6
+    pmulhrsw               m4, m3
+    paddw                  m6, m4 ; 6
+    pmulhrsw               m4, m0, [coeffq+16*4]
+    mova   [rsp+gprsize+16*1], m6                 ; park result "6" in stack slot 1
+    pmulhrsw               m6, m2, m4
+    pmulhrsw               m6, m3
+    paddw                  m4, m6 ; 4
+    pmulhrsw               m6, m0, [coeffq+16*2]
+    pmulhrsw               m0,     [coeffq+16*0]  ; overwrites the pw_2896x8 constant (last use)
+    pmulhrsw               m2, m6                 ; overwrites the pw_1697x16 constant
+    pmulhrsw               m2, m3
+    paddw                  m2, m6 ; 2
+    pmulhrsw               m6, m0, [o(pw_1697x16)] ; constant taken from memory because m2 was just overwritten
+    pmulhrsw               m6, m3
+    mova                   m3, [rsp+gprsize+16*0] ; m3 = result "3" parked above (pw_16384 no longer needed)
+    paddw                  m0, m6                 ; result "0"
+    jmp   m(idct_8x8_internal).pass1_end3
 
+.pass1_end:
+    mova        [coeffq+16*1], m4                 ; store m4-m7 to the odd 16-byte slots at coeffq+16*{1,3,5,7}
+    mova        [coeffq+16*3], m5
+    mova        [coeffq+16*5], m6
+    mova        [coeffq+16*7], m7
+    mova                   m4, [coeffq-16*7]      ; fetch the next four inputs from coeffq-16*{7,5,3,1} ...
+    mova                   m5, [coeffq-16*5]
+    mova                   m6, [coeffq-16*3]
+    mova                   m7, [coeffq-16*1]
+    mova        [coeffq-16*7], m0                 ; ... then overwrite those same slots with m0-m3
+    mova        [coeffq-16*5], m1
+    mova        [coeffq-16*3], m2
+    mova        [coeffq-16*1], m3
     mov                  tx2q, r3
     jmp .pass1
 
@@ -3399,7 +3476,7 @@
     jg .loop
     RET
 %elifidn %1_%2, identity_dct
-    mova                   m4, [o(pw_5793x4)]
+    mova                   m4, [o(pw_1697x16)]    ; m4 = 8 words of 1697*16, used by the multiply-and-add in .main
     mova                   m5, [o(pw_8192)]
     mova                   m6, [o(pw_2896x8)]
     psrlw                  m7, m5, 2                 ;pw_2048
@@ -3410,23 +3487,24 @@
 .main:
     movd                   m0, [coeffq+32*0]
     punpcklwd              m0, [coeffq+32*1]
-    movd                   m2, [coeffq+32*2]
-    punpcklwd              m2, [coeffq+32*3]
+    movd                   m1, [coeffq+32*2]      ; m1 is used as the temporary instead of m2
+    punpcklwd              m1, [coeffq+32*3]
     add                coeffq, 32*4
+    punpckldq              m0, m1                 ; merge immediately so m1 can be reused for the next group
     movd                   m1, [coeffq+32*0]
     punpcklwd              m1, [coeffq+32*1]
-    movd                   m3, [coeffq+32*2]
-    punpcklwd              m3, [coeffq+32*3]
+    movd                   m2, [coeffq+32*2]
+    punpcklwd              m2, [coeffq+32*3]
     xor                  eobd, eobd
     mov         [coeffq-32*4], eobd
     mov         [coeffq-32*3], eobd
     mov         [coeffq-32*2], eobd
     mov         [coeffq-32*1], eobd
-    punpckldq              m0, m2
-    punpckldq              m1, m3
+    punpckldq              m1, m2
     punpcklqdq             m0, m1
-    psllw                  m0, 2
-    pmulhrsw               m0, m4
+    pmulhrsw               m1, m4, m0             ; m1 = round(m0 * 1697*16 / 2^15) ~= 0.8286 * m0
+    paddw                  m0, m0                 ; m0 = 2 * m0
+    paddw                  m0, m1                 ; m0 ~= 2.8286 * original m0
     pmulhrsw               m0, m5
     pmulhrsw               m0, m6
     pmulhrsw               m0, m7
@@ -3740,36 +3818,42 @@
 INV_TXFM_16X16_FN identity, identity
 
 cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
-    LOAD_8ROWS    coeffq+16*17, 32
+    add                 coeffq, 16*17             ; move coeffq itself; .pass1 now loads its rows relative to coeffq
     mov                     r3, tx2q
     lea                   tx2q, [o(m(iidentity_16x16_internal).pass1_end)]
 
 .pass1:
-    REPX      {psllw    x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
-    pmulhrsw                m7, [o(pw_5793x4)]
-    mova    [rsp+gprsize+16*0], m7
-
-    mova                    m7, [o(pw_5793x4)]
-    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
-
+    mova                    m7, [o(pw_1697x16)]   ; m7 = constant operand for IDTX16
+    mova                    m6, [coeffq+32*7]     ; the row at +32*7 is loaded into m6 and scaled first
+    mova                    m0, [coeffq+32*0]
+    mova                    m1, [coeffq+32*1]
+    mova                    m2, [coeffq+32*2]
+    mova                    m3, [coeffq+32*3]
+    mova                    m4, [coeffq+32*4]
+    REPX      {IDTX16 x, 5, 7}, 6, 0, 1, 2, 3, 4  ; IDTX16 on m6, m0-m4 with tmp = m5 (presumably REPX substitutes x with each argument)
+    mova                    m5, [coeffq+32*5]
+    mova    [rsp+gprsize+16*0], m6                ; scaled +32*7 row goes to stack slot 0 (the slot the removed code filled from m7)
+    IDTX16                   5, 6, 7              ; scale the +32*5 row, tmp = m6
+    mova                    m6, [coeffq+32*6]
+    IDTX16                   6, 7, 7              ; scale the +32*6 row; tmp aliases m7, destroying the constant (last use)
     mova                    m7, [o(pw_8192)]
     jmp   m(idct_8x8_internal).pass1_end1
 
 .pass1_end:
-    SAVE_8ROWS    coeffq+16*17, 32
-    LOAD_8ROWS    coeffq+16* 1, 32
+    SAVE_8ROWS          coeffq, 32                ; coeffq is at entry value + 16*17 here
+    sub                 coeffq, 16                ; -> entry value + 16*16 (the removed code visited offset 16*1 at this step)
     lea                   tx2q, [o(m(iidentity_16x16_internal).pass1_end1)]
     jmp .pass1
 
 .pass1_end1:
-    SAVE_8ROWS    coeffq+16* 1, 32
-    LOAD_8ROWS    coeffq+16*16, 32
+    SAVE_8ROWS          coeffq, 32                ; coeffq is at entry value + 16*16 here
+    sub                 coeffq, 15*16             ; -> entry value + 16*1
     lea                   tx2q, [o(m(iidentity_16x16_internal).pass1_end2)]
     jmp .pass1
 
 .pass1_end2:
-    SAVE_8ROWS    coeffq+16*16, 32
-    LOAD_8ROWS    coeffq+16* 0, 32
+    SAVE_8ROWS          coeffq, 32                ; coeffq is at entry value + 16*1 here
+    sub                 coeffq, 16                ; -> back to the entry value of coeffq
     mov                   tx2q, r3
     jmp .pass1
 
@@ -3778,16 +3862,22 @@
     lea                   tx2q, [o(m(iidentity_16x16_internal).end1)]
 
 .end:
-    REPX      {psllw    x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
-    pmulhrsw                m7, [o(pw_5793x4)]
-    pmulhrsw                m7, [o(pw_2048)]
     mova    [rsp+gprsize+16*0], m7
-    mova                    m7, [o(pw_5793x4)]
-    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
-    mova                    m7, [o(pw_2048)]
-    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
-    mova    [rsp+gprsize+16*1], m6
+    mova    [rsp+gprsize+16*1], m4                ; spill m4: it is the IDTX16 temporary for the next REPX
+    mova                    m7, [o(pw_1697x16)]   ; m7 = constant operand for IDTX16 (m7's data was spilled to slot 0 above)
+    REPX      {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3  ; IDTX16 on m5, m6, m0-m3 (presumably REPX substitutes x with each argument)
+    mova                    m4, [o(pw_2048)]      ; m4 = 8 words of 2048
+    pmulhrsw                m5, m4                ; finish m5 and m6 early so both can be stored and reused
+    pmulhrsw                m6, m4
     mova    [rsp+gprsize+16*2], m5
+    mova                    m5, [rsp+gprsize+16*1] ; m5 = the m4 value spilled above
+    mova    [rsp+gprsize+16*1], m6                ; slot 1 <- finished m6
+    IDTX16                   5, 6, 7              ; scale the original m4 (now in m5), tmp = m6
+    mova                    m6, [rsp+gprsize+16*0] ; m6 = the m7 value spilled at the top
+    IDTX16                   6, 7, 7              ; scale it; tmp aliases m7, destroying the constant (last use)
+    REPX      {pmulhrsw x, m4}, m0, m1, m2, m3, m6
+    pmulhrsw                m4, m5                ; m4 = round(scaled original m4 * 2048 / 2^15); consumes the pw_2048 register
+    mova    [rsp+gprsize+16*0], m6                ; slot 0 <- result for the original m7
     jmp   m(idct_8x8_internal).end3
 
 .end1:
@@ -4991,15 +5081,33 @@
 
 .loop:
     LOAD_8ROWS          coeffq, 32, 1
-    REPX         {psllw  x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX          {psllw x, 2}, m0, m1, m2, m3, m4, m5, m6, m7   ; shift by 2 instead of 3; the multiply-and-add below supplies ~1.4143x
     mova            [rsp+16*1], m6
     lea                   tx2q, [o(m(idct_32x16_internal).end)]
     call  m(idct_8x8_internal).pass1_end3
-    pmulhrsw                m7, [o(pw_5793x4)]
+    mova            [rsp+16*2], m5                ; spill m5 and m6 to free a constant and a temporary register
+    mova            [rsp+16*1], m6
+    mova                    m5, [o(pw_1697x8)]    ; m5 = 8 words of 1697*8
+    pmulhrsw                m6, m5, m7            ; pattern: t = round(x * 1697*8 / 2^15); x += t  (x ~= 1.4143 * x)
+    paddw                   m7, m6
+    pmulhrsw                m6, m5, m0
+    paddw                   m0, m6
+    pmulhrsw                m6, m5, m1
+    paddw                   m1, m6
+    pmulhrsw                m6, m5, m2
+    paddw                   m2, m6
+    pmulhrsw                m6, m5, m3
+    paddw                   m3, m6
+    pmulhrsw                m6, m5, m4
     pmulhrsw                m7, [o(pw_2048)]
+    paddw                   m4, m6
+    mova                    m6, [rsp+16*1]        ; reload the spilled m6
     mova            [rsp+16*0], m7
-    mova                    m7, [o(pw_5793x4)]
-    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pmulhrsw                m7, m5, m6            ; m7 is free after the store above and is used as the temporary
+    paddw                   m6, m7
+    mova                    m7, [rsp+16*2]        ; m7 = the spilled m5
+    pmulhrsw                m5, m7                ; m5 = round(spilled m5 * 1697*8 / 2^15); overwrites the constant (last use)
+    paddw                   m5, m7                ; m5 ~= 1.4143 * spilled m5
     mova                    m7, [o(pw_2048)]
     REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
     mova            [rsp+16*2], m5
@@ -5008,7 +5116,7 @@
     lea                   dstq, [dstq+strideq*2]
 
     pxor                    m7, m7
-    REPX   {mova [coeffq+32*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
+    REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7   ; whitespace-only change; stores the m7 zeroed by the pxor above to coeffq+32*{0..7}
 
 .loop_end:
     add                 coeffq, 16