shithub: dav1d

ref: 114e8f0ee53ba34d22452dc4bdc0e9ec263189c8
parent: e0b88bd2b2c97a2695edcc498485e1cb3003e7f1
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Apr 20 19:54:47 EDT 2020

x86: Eliminate redundant 3-operand register syntax in itx

Purely a cosmetic change.
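
x86inc's AVX instruction overloads synthesize the first source operand
from the destination register when it is omitted, so the 2-operand
shorthand assembles to the exact same instruction as the explicit
3-operand form. A minimal illustration (not part of the patch itself):

    vpblendd m0, m0, m1, 0xcc ; explicit 3-operand form
    vpblendd m0, m1, 0xcc     ; shorthand; x86inc supplies m0 as src1

The one non-mechanical case is a vpblendd whose source order is also
swapped, compensated by inverting the blend mask (0xcc -> 0x33) so
the same dwords are selected.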

--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -175,16 +175,16 @@
     vpbroadcastd        m%3, [o(pw_%8_%9)]
     vpbroadcastd        m%4, [o(pw_m%9_%8)]
     vpbroadcastd       xm%2, [o(pw_%6_%7)]
-    vpblendd            m%2, m%2, m%3, 0xf0
+    vpblendd            m%2, m%3, 0xf0
     vpbroadcastd       xm%3, [o(pw_m%7_%6)]
 %else
     vpbroadcastd        m%3, [o(pw_m%9_%8)]
     vpbroadcastd        m%4, [o(pw_%8_%9)]
     vpbroadcastd       xm%2, [o(pw_m%7_%6)]
-    vpblendd            m%2, m%2, m%3, 0xf0
+    vpblendd            m%2, m%3, 0xf0
     vpbroadcastd       xm%3, [o(pw_%6_%7)]
 %endif
-    vpblendd            m%3, m%3, m%4, 0xf0
+    vpblendd            m%3, m%4, 0xf0
     ITX_MUL2X_PACK       %1, %4, _, %5, %2, %3, (4|%10)
 %endmacro
 
@@ -355,7 +355,7 @@
     punpckhdq            m1, m0, m3
     punpckldq            m0, m3
     IWHT4_1D_PACKED
-    vpblendd             m0, m0, m2, 0x03
+    vpblendd             m0, m2, 0x03
     ITX4_END              3, 0, 2, 1, 0
 
 %macro INV_TXFM_FN 3 ; type1, type2, size
@@ -441,7 +441,7 @@
     IDCT4_1D_PACKED
     mova                 m2, [o(deint_shuf)]
     shufps               m3, m0, m1, q1331
-    shufps               m0, m0, m1, q0220
+    shufps               m0, m1, q0220
     pshufb               m0, m2
     pshufb               m1, m3, m2
     jmp                tx2q
@@ -667,9 +667,9 @@
     paddsw               m4, m5            ; out6 -out1
     vpbroadcastd         m5, [o(pw_2896x8)]
     vpblendd             m3, m0, m4, 0x33  ; out6 -out7
-    vpblendd             m0, m0, m4, 0xcc  ; out0 -out1
+    vpblendd             m0, m4, 0xcc      ; out0 -out1
     shufps               m4, m2, m1, q1032 ; t3 t7
-    vpblendd             m1, m2, m1, 0xcc  ; t2 t6
+    vpblendd             m1, m2, 0x33      ; t2 t6
     psubsw               m2, m1, m4        ; t2-t3 t6-t7
     paddsw               m1, m4            ; t2+t3 t6+t7
     pmulhrsw             m2, m5            ; out4 -out5
@@ -693,7 +693,7 @@
     IDCT4_1D_PACKED
     vbroadcasti128       m2, [o(deint_shuf)]
     shufps               m3, m0, m1, q1331
-    shufps               m0, m0, m1, q0220
+    shufps               m0, m1, q0220
     pshufb               m0, m2
     pshufb               m1, m3, m2
     jmp                tx2q
@@ -702,8 +702,8 @@
     vextracti128        xm3, m1, 1
     call .main
     vpbroadcastd         m4, [o(pw_2048)]
-    vinserti128          m0, m0, xm2, 1
-    vinserti128          m1, m1, xm3, 1
+    vinserti128          m0, xm2, 1
+    vinserti128          m1, xm3, 1
     pshufd               m1, m1, q1032
     jmp m(iadst_4x8_internal).end2
 ALIGN function_align
@@ -735,12 +735,12 @@
     pshufd              xm5, xm1, q1032
     call .main_pass2
     vpbroadcastd         m4, [o(pw_2048)]
-    vinserti128          m0, m0, xm2, 1
-    vinserti128          m1, m1, xm3, 1
+    vinserti128          m0, xm2, 1
+    vinserti128          m1, xm3, 1
     pxor                 m5, m5
     psubw                m5, m4
 .end:
-    vpblendd             m4, m4, m5, 0xcc
+    vpblendd             m4, m5, 0xcc
 .end2:
     pmulhrsw             m0, m4
     pmulhrsw             m1, m4
@@ -786,8 +786,8 @@
     pshufd              xm5, xm1, q1032
     call m(iadst_4x8_internal).main_pass2
     vpbroadcastd         m5, [o(pw_2048)]
-    vinserti128          m3, m3, xm1, 1
-    vinserti128          m2, m2, xm0, 1
+    vinserti128          m3, xm1, 1
+    vinserti128          m2, xm0, 1
     pxor                 m4, m4
     psubw                m4, m5
     pshufd               m0, m3, q1032
@@ -935,11 +935,11 @@
     vextracti128        xm6, m2, 1
     vextracti128        xm7, m3, 1
     call .main
-    vinserti128          m0, m0, xm4, 1
-    vinserti128          m1, m1, xm5, 1
+    vinserti128          m0, xm4, 1
+    vinserti128          m1, xm5, 1
     vpbroadcastd         m5, [o(pw_2048)]
-    vinserti128          m2, m2, xm6, 1
-    vinserti128          m3, m3, xm7, 1
+    vinserti128          m2, xm6, 1
+    vinserti128          m3, xm7, 1
     pshufd               m1, m1, q1032
     pshufd               m3, m3, q1032
     jmp m(iadst_4x16_internal).end2
@@ -980,9 +980,9 @@
     vpbroadcastd         m5, [o(pw_2048)]
     pshufd               m1, m1, q1032
     vpblendd             m4, m1, m0, 0x33
-    vpblendd             m0, m0, m2, 0x33
-    vpblendd             m2, m2, m3, 0x33
-    vpblendd             m3, m3, m1, 0x33
+    vpblendd             m0, m2, 0x33
+    vpblendd             m2, m3, 0x33
+    vpblendd             m3, m1, 0x33
     vpermq               m0, m0, q2031
     vpermq               m1, m2, q1302
     vpermq               m2, m3, q3120
@@ -989,7 +989,7 @@
     vpermq               m3, m4, q0213
     psubw                m6, m7, m5
 .end:
-    vpblendd             m5, m5, m6, 0xcc
+    vpblendd             m5, m6, 0xcc
 .end2:
     REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
     WIN64_RESTORE_XMM
@@ -1009,9 +1009,9 @@
 ALIGN function_align
 .main:
     vpblendd             m4, m1, m0, 0xcc
-    vpblendd             m1, m1, m0, 0x33
+    vpblendd             m1, m0, 0x33
     vpblendd             m5, m2, m3, 0xcc
-    vpblendd             m2, m2, m3, 0x33
+    vpblendd             m2, m3, 0x33
     vperm2i128           m3, m5, m2, 0x31
     vinserti128          m0, m1, xm4, 1 ; in0  in3  in2  in1
     vperm2i128           m4, m1, m4, 0x31
@@ -1043,7 +1043,7 @@
     psubsw               m1, m2, m3 ; t13a t12a t15a t14a
     paddsw               m2, m3     ; t9a  t8a  t11a t10a
     psubw                m3, m7, m6 ; pw_3784_m1567
-    vpblendd             m6, m6, m3, 0xf0
+    vpblendd             m6, m3, 0xf0
     ITX_MUL2X_PACK        4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
     ITX_MUL2X_PACK        1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
     vbroadcasti128       m5, [o(deint_shuf)]
@@ -1050,9 +1050,9 @@
     pshufb               m0, m5
     pshufb               m2, m5
     vperm2i128           m3, m0, m2, 0x31  ; t3   t2   t11a t10a
-    vinserti128          m0, m0, xm2, 1    ; t1   t0   t9a  t8a
+    vinserti128          m0, xm2, 1        ; t1   t0   t9a  t8a
     vperm2i128           m2, m4, m1, 0x31  ; t7a  t6a  t15  t14
-    vinserti128          m4, m4, xm1, 1    ; t4a  t5a  t12  t13
+    vinserti128          m4, xm1, 1        ; t4a  t5a  t12  t13
     pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
     psubsw               m1, m0, m3        ; t3a t2a t11 t10
     paddsw               m0, m3     ; -out15  out0   out14 -out1
@@ -1059,7 +1059,7 @@
     paddsw               m3, m4, m2 ; -out3   out12  out2  -out13
     psubsw               m4, m2            ; t6 t7 t14a t15a
     shufps               m2, m1, m4, q1032 ; t2a t6  t10 t14a
-    vpblendd             m4, m4, m1, 0x33  ; t3a t7  t11 t15a
+    vpblendd             m4, m1, 0x33      ; t3a t7  t11 t15a
     ret
 ALIGN function_align
 .main_pass1_end:
@@ -1109,9 +1109,9 @@
     vpbroadcastd         m6, [o(pw_2048)]
     pshufd               m1, m1, q1032
     vpblendd             m4, m0, m2, 0x33
-    vpblendd             m0, m0, m1, 0xcc
-    vpblendd             m1, m1, m3, 0xcc
-    vpblendd             m2, m2, m3, 0x33
+    vpblendd             m0, m1, 0xcc
+    vpblendd             m1, m3, 0xcc
+    vpblendd             m2, m3, 0x33
     vpermq               m0, m0, q3120
     vpermq               m1, m1, q0213
     vpermq               m2, m2, q2031
@@ -1226,7 +1226,7 @@
     vinserti128          m3, m1, xm3, 1
     vinserti128          m1, m0, xm2, 1
     shufps               m0, m1, m3, q0220
-    shufps               m1, m1, m3, q1331
+    shufps               m1, m3, q1331
     pshufb               m0, m4
     pshufb               m1, m4
     jmp                tx2q
@@ -1250,8 +1250,8 @@
     pmulhrsw            xm4, xm0
     pmulhrsw            xm5, xm0
     call m(iadst_4x8_internal).main_pass1
-    vinserti128        m0, m0, xm2, 1
-    vinserti128        m1, m1, xm3, 1
+    vinserti128        m0, xm2, 1
+    vinserti128        m1, xm3, 1
     punpckhwd          m2, m0, m1
     punpcklwd          m0, m1
     pxor               m3, m3
@@ -1295,8 +1295,8 @@
     pmulhrsw            xm4, xm0
     pmulhrsw            xm5, xm0
     call m(iadst_4x8_internal).main_pass1
-    vinserti128          m3, m3, xm1, 1
-    vinserti128          m2, m2, xm0, 1
+    vinserti128          m3, xm1, 1
+    vinserti128          m2, xm0, 1
     punpckhwd            m1, m3, m2
     punpcklwd            m3, m2
     pxor                 m0, m0
@@ -1317,10 +1317,10 @@
 INV_TXFM_8X4_FN identity, identity
 
 cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
-    mova                xm2,     [cq+16*0]
-    mova                xm0,     [cq+16*1]
-    vinserti128          m2, m2, [cq+16*2], 1
-    vinserti128          m0, m0, [cq+16*3], 1
+    mova                xm2, [cq+16*0]
+    mova                xm0, [cq+16*1]
+    vinserti128          m2, [cq+16*2], 1
+    vinserti128          m0, [cq+16*3], 1
     vpbroadcastd         m3, [o(pw_2896x8)]
     punpcklwd            m1, m2, m0
     punpckhwd            m2, m0
@@ -1520,14 +1520,14 @@
 INV_TXFM_8X8_FN identity, identity
 
 cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
-    mova                xm3,     [cq+16*0]
-    mova                xm2,     [cq+16*1]
-    vinserti128          m3, m3, [cq+16*4], 1
-    vinserti128          m2, m2, [cq+16*5], 1
-    mova                xm4,     [cq+16*2]
-    mova                xm0,     [cq+16*3]
-    vinserti128          m4, m4, [cq+16*6], 1
-    vinserti128          m0, m0, [cq+16*7], 1
+    mova                xm3, [cq+16*0]
+    mova                xm2, [cq+16*1]
+    vinserti128          m3, [cq+16*4], 1
+    vinserti128          m2, [cq+16*5], 1
+    mova                xm4, [cq+16*2]
+    mova                xm0, [cq+16*3]
+    vinserti128          m4, [cq+16*6], 1
+    vinserti128          m0, [cq+16*7], 1
     punpcklwd            m1, m3, m2
     punpckhwd            m3, m2
     punpcklwd            m2, m4, m0
@@ -1583,13 +1583,13 @@
     vpbroadcastd        m10, [o(pw_16384)]
 .pass1_end:
     vperm2i128           m9, m3, m7, 0x31
-    vinserti128          m3, m3, xm7, 1
+    vinserti128          m3, xm7, 1
     vperm2i128           m8, m2, m6, 0x31
-    vinserti128          m2, m2, xm6, 1
+    vinserti128          m2, xm6, 1
     vperm2i128           m6, m1, m5, 0x31
-    vinserti128          m1, m1, xm5, 1
+    vinserti128          m1, xm5, 1
     vperm2i128           m5, m0, m4, 0x31
-    vinserti128          m0, m0, xm4, 1
+    vinserti128          m0, xm4, 1
     punpckhwd            m4, m2, m3
     punpcklwd            m2, m3
     punpckhwd            m3, m0, m1
@@ -1840,24 +1840,24 @@
 %endmacro
 
 cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
-    mova                xm3,     [cq+16*0]
-    mova                xm2,     [cq+16*2]
+    mova                xm3, [cq+16*0]
+    mova                xm2, [cq+16*2]
     add                  cq, 16*8
-    vinserti128          m3, m3, [cq+16*0], 1
-    vinserti128          m2, m2, [cq+16*2], 1
+    vinserti128          m3, [cq+16*0], 1
+    vinserti128          m2, [cq+16*2], 1
     vpbroadcastd         m9, [o(pw_2896x8)]
-    mova                xm4,     [cq-16*4]
-    mova                xm5,     [cq-16*2]
-    vinserti128          m4, m4, [cq+16*4], 1
-    vinserti128          m5, m5, [cq+16*6], 1
-    mova                xm7,     [cq-16*7]
-    mova                xm6,     [cq-16*5]
-    vinserti128          m7, m7, [cq+16*1], 1
-    vinserti128          m6, m6, [cq+16*3], 1
-    mova                xm8,     [cq-16*3]
-    mova                xm0,     [cq-16*1]
-    vinserti128          m8, m8, [cq+16*5], 1
-    vinserti128          m0, m0, [cq+16*7], 1
+    mova                xm4, [cq-16*4]
+    mova                xm5, [cq-16*2]
+    vinserti128          m4, [cq+16*4], 1
+    vinserti128          m5, [cq+16*6], 1
+    mova                xm7, [cq-16*7]
+    mova                xm6, [cq-16*5]
+    vinserti128          m7, [cq+16*1], 1
+    vinserti128          m6, [cq+16*3], 1
+    mova                xm8, [cq-16*3]
+    mova                xm0, [cq-16*1]
+    vinserti128          m8, [cq+16*5], 1
+    vinserti128          m0, [cq+16*7], 1
     punpcklwd            m1, m3, m2
     punpckhwd            m3, m2
     punpcklwd            m2, m4, m5
@@ -1918,7 +1918,7 @@
     pxor                 m3, m3
 .dconly_loop:
     mova                xm1, [dstq]
-    vinserti128          m1, m1, [dstq+strideq], 1
+    vinserti128          m1, [dstq+strideq], 1
     punpckhbw            m2, m1, m3
     punpcklbw            m1, m3
     paddw                m2, m0
@@ -2116,14 +2116,14 @@
 INV_TXFM_16X4_FN identity, identity
 
 cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
-    mova                xm2,     [cq+16*0]
-    mova                xm4,     [cq+16*1]
-    vinserti128          m2, m2, [cq+16*4], 1
-    vinserti128          m4, m4, [cq+16*5], 1
-    mova                xm0,     [cq+16*2]
-    mova                xm1,     [cq+16*3]
-    vinserti128          m0, m0, [cq+16*6], 1
-    vinserti128          m1, m1, [cq+16*7], 1
+    mova                xm2, [cq+16*0]
+    mova                xm4, [cq+16*1]
+    vinserti128          m2, [cq+16*4], 1
+    vinserti128          m4, [cq+16*5], 1
+    mova                xm0, [cq+16*2]
+    mova                xm1, [cq+16*3]
+    vinserti128          m0, [cq+16*6], 1
+    vinserti128          m1, [cq+16*7], 1
     vpbroadcastd         m7, [o(pw_1697x16)]
     vpbroadcastd         m8, [o(pw_16384)]
     punpcklwd            m3, m2, m4
@@ -2224,13 +2224,13 @@
     punpckldq            m8, m9, m5
     punpckhdq            m9, m5
     vperm2i128           m4, m0, m6, 0x31
-    vinserti128          m0, m0, xm6, 1
+    vinserti128          m0, xm6, 1
     vperm2i128           m5, m1, m7, 0x31
-    vinserti128          m1, m1, xm7, 1
+    vinserti128          m1, xm7, 1
     vperm2i128           m6, m2, m8, 0x31
-    vinserti128          m2, m2, xm8, 1
+    vinserti128          m2, xm8, 1
     vperm2i128           m7, m3, m9, 0x31
-    vinserti128          m3, m3, xm9, 1
+    vinserti128          m3, xm9, 1
     jmp                tx2q
 .pass2:
     call .main
@@ -2387,13 +2387,13 @@
     punpckldq            m5, m8, m2
     punpckhdq            m8, m2
     vinserti128          m2, m6, xm5, 1
-    vperm2i128           m6, m6, m5, 0x31
+    vperm2i128           m6, m5, 0x31
     vperm2i128           m5, m1, m4, 0x31
-    vinserti128          m1, m1, xm4, 1
+    vinserti128          m1, xm4, 1
     vperm2i128           m4, m0, m3, 0x31
-    vinserti128          m0, m0, xm3, 1
+    vinserti128          m0, xm3, 1
     vinserti128          m3, m7, xm8, 1
-    vperm2i128           m7, m7, m8, 0x31
+    vperm2i128           m7, m8, 0x31
     jmp                tx2q
 .pass2:
     call m(iadst_16x8_internal).main
@@ -2419,24 +2419,24 @@
 INV_TXFM_16X8_FN identity, identity
 
 cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
-    mova                xm7,     [cq+16*0]
-    mova                xm2,     [cq+16*1]
+    mova                xm7, [cq+16*0]
+    mova                xm2, [cq+16*1]
     add                  cq, 16*8
     vpbroadcastd         m3, [o(pw_2896x8)]
-    vinserti128          m7, m7, [cq+16*0], 1
-    vinserti128          m2, m2, [cq+16*1], 1
-    mova                xm6,     [cq-16*6]
-    mova                xm4,     [cq-16*5]
-    vinserti128          m6, m6, [cq+16*2], 1
-    vinserti128          m4, m4, [cq+16*3], 1
-    mova                xm8,     [cq-16*4]
-    mova                xm5,     [cq-16*3]
-    vinserti128          m8, m8, [cq+16*4], 1
-    vinserti128          m5, m5, [cq+16*5], 1
-    mova                xm0,     [cq-16*2]
-    mova                xm1,     [cq-16*1]
-    vinserti128          m0, m0, [cq+16*6], 1
-    vinserti128          m1, m1, [cq+16*7], 1
+    vinserti128          m7, [cq+16*0], 1
+    vinserti128          m2, [cq+16*1], 1
+    mova                xm6, [cq-16*6]
+    mova                xm4, [cq-16*5]
+    vinserti128          m6, [cq+16*2], 1
+    vinserti128          m4, [cq+16*3], 1
+    mova                xm8, [cq-16*4]
+    mova                xm5, [cq-16*3]
+    vinserti128          m8, [cq+16*4], 1
+    vinserti128          m5, [cq+16*5], 1
+    mova                xm0, [cq-16*2]
+    mova                xm1, [cq-16*1]
+    vinserti128          m0, [cq+16*6], 1
+    vinserti128          m1, [cq+16*7], 1
     vpbroadcastd        m10, [o(pw_1697x16)]
     vpbroadcastd        m11, [o(pw_16384)]
     REPX   {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
@@ -2524,19 +2524,19 @@
     REPX   {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
     pmulhrsw             m1, [rsp+32*1]
     vperm2i128           m8, m1, m9, 0x31
-    vinserti128          m1, m1, xm9, 1
+    vinserti128          m1, xm9, 1
     vperm2i128           m9, m2, m10, 0x31
-    vinserti128          m2, m2, xm10, 1
+    vinserti128          m2, xm10, 1
     vperm2i128          m10, m3, m11, 0x31
-    vinserti128          m3, m3, xm11, 1
+    vinserti128          m3, xm11, 1
     vperm2i128          m11, m4, m12, 0x31
-    vinserti128          m4, m4, xm12, 1
+    vinserti128          m4, xm12, 1
     vperm2i128          m12, m5, m13, 0x31
-    vinserti128          m5, m5, xm13, 1
+    vinserti128          m5, xm13, 1
     vperm2i128          m13, m6, m14, 0x31
-    vinserti128          m6, m6, xm14, 1
+    vinserti128          m6, xm14, 1
     vperm2i128          m14, m7, m15, 0x31
-    vinserti128          m7, m7, xm15, 1
+    vinserti128          m7, xm15, 1
     mova                m15, [rsp+32*2]
 .pass1_end3:
     punpcklwd            m0, m9, m10
@@ -3036,13 +3036,13 @@
     LOAD_8ROWS      cq+32*1, 32*2
     call m(idct_16x8_internal).main
     vperm2i128          m11, m0, m4, 0x31
-    vinserti128          m0, m0, xm4, 1
+    vinserti128          m0, xm4, 1
     vperm2i128           m4, m1, m5, 0x31
-    vinserti128          m1, m1, xm5, 1
+    vinserti128          m1, xm5, 1
     vperm2i128           m5, m2, m6, 0x31
-    vinserti128          m2, m2, xm6, 1
+    vinserti128          m2, xm6, 1
     vperm2i128           m6, m3, m7, 0x31
-    vinserti128          m3, m3, xm7, 1
+    vinserti128          m3, xm7, 1
     pxor                 m7, m7
     REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15
     punpckhwd            m7, m0, m1
@@ -3076,13 +3076,13 @@
     LOAD_8ROWS      cq+32*0, 32*2
     call m(idct_16x8_internal).main
     vperm2i128           m8, m0, m4, 0x31
-    vinserti128          m0, m0, xm4, 1
+    vinserti128          m0, xm4, 1
     vperm2i128           m4, m1, m5, 0x31
-    vinserti128          m1, m1, xm5, 1
+    vinserti128          m1, xm5, 1
     vperm2i128           m5, m2, m6, 0x31
-    vinserti128          m2, m2, xm6, 1
+    vinserti128          m2, xm6, 1
     vperm2i128           m6, m3, m7, 0x31
-    vinserti128          m3, m3, xm7, 1
+    vinserti128          m3, xm7, 1
     vpbroadcastd         m9, [o(pw_8192)]
     pxor                 m7, m7
     REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14
@@ -3285,7 +3285,7 @@
 %macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
     vbroadcasti128      m%1, [cq+16*%3]
     vbroadcasti128      m%2, [cq+16*%4]
-    shufpd              m%1, m%1, m%2, 0x0c
+    shufpd              m%1, m%2, 0x0c
 %endmacro
 
 cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob
@@ -3387,13 +3387,13 @@
     pmulhrsw            m12, [rsp+32*0]
     mova         [rsp+32*0], m8
     vperm2i128           m4, m0, m6, 0x31
-    vinserti128          m0, m0, xm6, 1
+    vinserti128          m0, xm6, 1
     vperm2i128           m5, m1, m7, 0x31
-    vinserti128          m1, m1, xm7, 1
+    vinserti128          m1, xm7, 1
     vperm2i128           m6, m2, m9, 0x31
-    vinserti128          m2, m2, xm9, 1
+    vinserti128          m2, xm9, 1
     vperm2i128           m7, m3, m10, 0x31
-    vinserti128          m3, m3, xm10, 1
+    vinserti128          m3, xm10, 1
     call m(idct_16x8_internal).main
     vpbroadcastd         m8, [o(pw_2048)]
     REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
@@ -3432,13 +3432,13 @@
     punpckldq            m9, m12, m5
     punpckhdq           m12, m5
     vperm2i128           m4, m0, m6, 0x31
-    vinserti128          m0, m0, xm6, 1
+    vinserti128          m0, xm6, 1
     vperm2i128           m5, m1, m7, 0x31
-    vinserti128          m1, m1, xm7, 1
+    vinserti128          m1, xm7, 1
     vperm2i128           m6, m2, m9, 0x31
-    vinserti128          m2, m2, xm9, 1
+    vinserti128          m2, xm9, 1
     vperm2i128           m7, m3, m12, 0x31
-    vinserti128          m3, m3, xm12, 1
+    vinserti128          m3, xm12, 1
     call m(idct_16x8_internal).main2
     vpbroadcastd         m8, [o(pw_2048)]
     REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
@@ -3457,26 +3457,26 @@
     lea                  r4, [strideq*3]
     sub                eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
 .loop:
-    mova                xm0,     [cq+16* 0]
-    mova                xm1,     [cq+16* 4]
-    vinserti128          m0, m0, [cq+16* 1], 1
-    vinserti128          m1, m1, [cq+16* 5], 1
+    mova                xm0, [cq+16* 0]
+    mova                xm1, [cq+16* 4]
+    vinserti128          m0, [cq+16* 1], 1
+    vinserti128          m1, [cq+16* 5], 1
     pxor                 m8, m8
     mova          [cq+32*0], m8
     mova          [cq+32*2], m8
     add                  cq, 16*16
-    mova                xm2,     [cq-16* 8]
-    mova                xm3,     [cq-16* 4]
-    vinserti128          m2, m2, [cq-16* 7], 1
-    vinserti128          m3, m3, [cq-16* 3], 1
-    mova                xm4,     [cq+16* 0]
-    mova                xm5,     [cq+16* 4]
-    vinserti128          m4, m4, [cq+16* 1], 1
-    vinserti128          m5, m5, [cq+16* 5], 1
-    mova                xm6,     [cq+16* 8]
-    mova                xm7,     [cq+16*12]
-    vinserti128          m6, m6, [cq+16* 9], 1
-    vinserti128          m7, m7, [cq+16*13], 1
+    mova                xm2, [cq-16* 8]
+    mova                xm3, [cq-16* 4]
+    vinserti128          m2, [cq-16* 7], 1
+    vinserti128          m3, [cq-16* 3], 1
+    mova                xm4, [cq+16* 0]
+    mova                xm5, [cq+16* 4]
+    vinserti128          m4, [cq+16* 1], 1
+    vinserti128          m5, [cq+16* 5], 1
+    mova                xm6, [cq+16* 8]
+    mova                xm7, [cq+16*12]
+    vinserti128          m6, [cq+16* 9], 1
+    vinserti128          m7, [cq+16*13], 1
     REPX {mova [cq+32*x], m8}, -4, -2,  0,  2,  4,  6
     REPX  {paddsw    x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
     call .transpose8x8
@@ -3529,22 +3529,22 @@
     lea                  r5, [dstq+strideq*4]
     sub                eobd, 107
 .loop:
-    mova                xm0,     [cq-16*8]
-    mova                xm1,     [cq-16*7]
-    vinserti128          m0, m0, [cq+16*0], 1
-    vinserti128          m1, m1, [cq+16*1], 1
-    mova                xm2,     [cq-16*6]
-    mova                xm3,     [cq-16*5]
-    vinserti128          m2, m2, [cq+16*2], 1
-    vinserti128          m3, m3, [cq+16*3], 1
-    mova                xm4,     [cq-16*4]
-    mova                xm5,     [cq-16*3]
-    vinserti128          m4, m4, [cq+16*4], 1
-    vinserti128          m5, m5, [cq+16*5], 1
-    mova                xm6,     [cq-16*2]
-    mova                xm7,     [cq-16*1]
-    vinserti128          m6, m6, [cq+16*6], 1
-    vinserti128          m7, m7, [cq+16*7], 1
+    mova                xm0, [cq-16*8]
+    mova                xm1, [cq-16*7]
+    vinserti128          m0, [cq+16*0], 1
+    vinserti128          m1, [cq+16*1], 1
+    mova                xm2, [cq-16*6]
+    mova                xm3, [cq-16*5]
+    vinserti128          m2, [cq+16*2], 1
+    vinserti128          m3, [cq+16*3], 1
+    mova                xm4, [cq-16*4]
+    mova                xm5, [cq-16*3]
+    vinserti128          m4, [cq+16*4], 1
+    vinserti128          m5, [cq+16*5], 1
+    mova                xm6, [cq-16*2]
+    mova                xm7, [cq-16*1]
+    vinserti128          m6, [cq+16*6], 1
+    vinserti128          m7, [cq+16*7], 1
     pxor                 m8, m8
     REPX {mova [cq+32*x], m8}, -4, -3, -2, -1,  0,  1,  2,  3
     call m(inv_txfm_add_identity_identity_8x32).transpose8x8
@@ -3716,28 +3716,28 @@
     vextracti128 [r2+32*3+16], m14, 1
     vinserti128          m8, m1, xm9, 1
     vperm2i128          m12, m1, m9, 0x31
-    mova                xm0,     [tmp1q-32*4]
-    mova                xm1,     [tmp1q-32*3]
-    vinserti128          m0, m0, [tmp1q+32*0], 1
-    vinserti128          m1, m1, [tmp1q+32*1], 1
+    mova                xm0, [tmp1q-32*4]
+    mova                xm1, [tmp1q-32*3]
+    vinserti128          m0, [tmp1q+32*0], 1
+    vinserti128          m1, [tmp1q+32*1], 1
     vinserti128         m10, m5, xm13, 1
     vperm2i128          m14, m5, m13, 0x31
-    mova                xm4,     [tmp1q-32*4+16]
-    mova                xm5,     [tmp1q-32*3+16]
-    vinserti128          m4, m4, [tmp1q+32*0+16], 1
-    vinserti128          m5, m5, [tmp1q+32*1+16], 1
+    mova                xm4, [tmp1q-32*4+16]
+    mova                xm5, [tmp1q-32*3+16]
+    vinserti128          m4, [tmp1q+32*0+16], 1
+    vinserti128          m5, [tmp1q+32*1+16], 1
     vinserti128          m9, m3, xm11, 1
     vperm2i128          m13, m3, m11, 0x31
-    mova                xm2,     [tmp1q-32*2]
-    mova                xm3,     [tmp1q-32*1]
-    vinserti128          m2, m2, [tmp1q+32*2], 1
-    vinserti128          m3, m3, [tmp1q+32*3], 1
+    mova                xm2, [tmp1q-32*2]
+    mova                xm3, [tmp1q-32*1]
+    vinserti128          m2, [tmp1q+32*2], 1
+    vinserti128          m3, [tmp1q+32*3], 1
     vinserti128         m11, m7, xm15, 1
     vperm2i128          m15, m7, m15, 0x31
-    mova                xm6,     [tmp1q-32*2+16]
-    mova                xm7,     [tmp1q-32*1+16]
-    vinserti128          m6, m6, [tmp1q+32*2+16], 1
-    vinserti128          m7, m7, [tmp1q+32*3+16], 1
+    mova                xm6, [tmp1q-32*2+16]
+    mova                xm7, [tmp1q-32*1+16]
+    vinserti128          m6, [tmp1q+32*2+16], 1
+    vinserti128          m7, [tmp1q+32*3+16], 1
     call .main_oddhalf
     LOAD_8ROWS_H    r2-32*4, 32
 .idct16:
@@ -3985,7 +3985,7 @@
     mova         [tmp1q+32*(11-%2)], xm%2
     vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
     vperm2i128          m%2, m%1, m%4, 0x31
-    vinserti128         m%1, m%1, xm%4, 1
+    vinserti128         m%1, xm%4, 1
 %endmacro
 
 cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob
@@ -4103,22 +4103,22 @@
     mov                 rax, cq
     paddw               m11, m12, m12 ; pw_16384
 .loop:
-    mova                xm0,     [cq+64* 0]
-    mova                xm1,     [cq+64* 1]
-    vinserti128          m0, m0, [cq+64* 8], 1
-    vinserti128          m1, m1, [cq+64* 9], 1
-    mova                xm2,     [cq+64* 2]
-    mova                xm3,     [cq+64* 3]
-    vinserti128          m2, m2, [cq+64*10], 1
-    vinserti128          m3, m3, [cq+64*11], 1
-    mova                xm4,     [cq+64* 4]
-    mova                xm5,     [cq+64* 5]
-    vinserti128          m4, m4, [cq+64*12], 1
-    vinserti128          m5, m5, [cq+64*13], 1
-    mova                xm6,     [cq+64* 6]
-    mova                xm7,     [cq+64* 7]
-    vinserti128          m6, m6, [cq+64*14], 1
-    vinserti128          m7, m7, [cq+64*15], 1
+    mova                xm0, [cq+64* 0]
+    mova                xm1, [cq+64* 1]
+    vinserti128          m0, [cq+64* 8], 1
+    vinserti128          m1, [cq+64* 9], 1
+    mova                xm2, [cq+64* 2]
+    mova                xm3, [cq+64* 3]
+    vinserti128          m2, [cq+64*10], 1
+    vinserti128          m3, [cq+64*11], 1
+    mova                xm4, [cq+64* 4]
+    mova                xm5, [cq+64* 5]
+    vinserti128          m4, [cq+64*12], 1
+    vinserti128          m5, [cq+64*13], 1
+    mova                xm6, [cq+64* 6]
+    mova                xm7, [cq+64* 7]
+    vinserti128          m6, [cq+64*14], 1
+    vinserti128          m7, [cq+64*15], 1
     REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
     REPX  {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7
     call m(inv_txfm_add_identity_identity_8x32).transpose8x8
@@ -4171,22 +4171,22 @@
     mov                  r5, dstq
     mov                 rax, cq
 .loop:
-    mova                xm0,     [cq+32* 0]
-    mova                xm1,     [cq+32* 1]
-    vinserti128          m0, m0, [cq+32* 8], 1
-    vinserti128          m1, m1, [cq+32* 9], 1
-    mova                xm2,     [cq+32* 2]
-    mova                xm3,     [cq+32* 3]
-    vinserti128          m2, m2, [cq+32*10], 1
-    vinserti128          m3, m3, [cq+32*11], 1
-    mova                xm4,     [cq+32* 4]
-    mova                xm5,     [cq+32* 5]
-    vinserti128          m4, m4, [cq+32*12], 1
-    vinserti128          m5, m5, [cq+32*13], 1
-    mova                xm6,     [cq+32* 6]
-    mova                xm7,     [cq+32* 7]
-    vinserti128          m6, m6, [cq+32*14], 1
-    vinserti128          m7, m7, [cq+32*15], 1
+    mova                xm0, [cq+32* 0]
+    mova                xm1, [cq+32* 1]
+    vinserti128          m0, [cq+32* 8], 1
+    vinserti128          m1, [cq+32* 9], 1
+    mova                xm2, [cq+32* 2]
+    mova                xm3, [cq+32* 3]
+    vinserti128          m2, [cq+32*10], 1
+    vinserti128          m3, [cq+32*11], 1
+    mova                xm4, [cq+32* 4]
+    mova                xm5, [cq+32* 5]
+    vinserti128          m4, [cq+32*12], 1
+    vinserti128          m5, [cq+32*13], 1
+    mova                xm6, [cq+32* 6]
+    mova                xm7, [cq+32* 7]
+    vinserti128          m6, [cq+32*14], 1
+    vinserti128          m7, [cq+32*15], 1
     REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
     REPX  {paddsw   x, x  }, m0, m1, m2, m3, m4, m5, m6, m7
     call m(inv_txfm_add_identity_identity_8x32).transpose8x8
@@ -4374,22 +4374,22 @@
     mov                  r5, dstq
     lea                 rax, [cq+32]
 .loop:
-    mova                xm0,     [cq+64* 0]
-    mova                xm1,     [cq+64* 1]
-    vinserti128          m0, m0, [cq+64* 8], 1
-    vinserti128          m1, m1, [cq+64* 9], 1
-    mova                xm2,     [cq+64* 2]
-    mova                xm3,     [cq+64* 3]
-    vinserti128          m2, m2, [cq+64*10], 1
-    vinserti128          m3, m3, [cq+64*11], 1
-    mova                xm4,     [cq+64* 4]
-    mova                xm5,     [cq+64* 5]
-    vinserti128          m4, m4, [cq+64*12], 1
-    vinserti128          m5, m5, [cq+64*13], 1
-    mova                xm6,     [cq+64* 6]
-    mova                xm7,     [cq+64* 7]
-    vinserti128          m6, m6, [cq+64*14], 1
-    vinserti128          m7, m7, [cq+64*15], 1
+    mova                xm0, [cq+64* 0]
+    mova                xm1, [cq+64* 1]
+    vinserti128          m0, [cq+64* 8], 1
+    vinserti128          m1, [cq+64* 9], 1
+    mova                xm2, [cq+64* 2]
+    mova                xm3, [cq+64* 3]
+    vinserti128          m2, [cq+64*10], 1
+    vinserti128          m3, [cq+64*11], 1
+    mova                xm4, [cq+64* 4]
+    mova                xm5, [cq+64* 5]
+    vinserti128          m4, [cq+64*12], 1
+    vinserti128          m5, [cq+64*13], 1
+    mova                xm6, [cq+64* 6]
+    mova                xm7, [cq+64* 7]
+    vinserti128          m6, [cq+64*14], 1
+    vinserti128          m7, [cq+64*15], 1
     call m(inv_txfm_add_identity_identity_8x32).transpose8x8
     REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
     WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
@@ -4532,27 +4532,27 @@
     add                eobd, 0x80000000
     jnc .pass1_loop
     lea                  r2, [rsp+32*23]
-    mova                xm0,     [r2-32*4+ 0]
-    mova                xm1,     [r2-32*2+ 0]
-    vinserti128          m0, m0, [r2+32*0+ 0], 1
-    vinserti128          m1, m1, [r2+32*2+ 0], 1
-    mova                xm2,     [r2-32*4+16]
-    mova                xm3,     [r2-32*2+16]
-    vinserti128          m2, m2, [r2+32*0+16], 1
-    vinserti128          m3, m3, [r2+32*2+16], 1
+    mova                xm0, [r2-32*4+ 0]
+    mova                xm1, [r2-32*2+ 0]
+    vinserti128          m0, [r2+32*0+ 0], 1
+    vinserti128          m1, [r2+32*2+ 0], 1
+    mova                xm2, [r2-32*4+16]
+    mova                xm3, [r2-32*2+16]
+    vinserti128          m2, [r2+32*0+16], 1
+    vinserti128          m3, [r2+32*2+16], 1
     pxor                 m4, m4
     REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
     test                r7d, r7d
     jl .fast
     lea                  r3, [r2+32*8]
-    mova                xm4,     [r3-32*4+ 0]
-    mova                xm5,     [r3-32*2+ 0]
-    vinserti128          m4, m4, [r3+32*0+ 0], 1
-    vinserti128          m5, m5, [r3+32*2+ 0], 1
-    mova                xm6,     [r3-32*4+16]
-    mova                xm7,     [r3-32*2+16]
-    vinserti128          m6, m6, [r3+32*0+16], 1
-    vinserti128          m7, m7, [r3+32*2+16], 1
+    mova                xm4, [r3-32*4+ 0]
+    mova                xm5, [r3-32*2+ 0]
+    vinserti128          m4, [r3+32*0+ 0], 1
+    vinserti128          m5, [r3+32*2+ 0], 1
+    mova                xm6, [r3-32*4+16]
+    mova                xm7, [r3-32*2+16]
+    vinserti128          m6, [r3+32*0+16], 1
+    vinserti128          m7, [r3+32*2+16], 1
 .fast:
     mova              [rsp], m8
     lea               tmp1q, [rsp+32*7]
@@ -4575,26 +4575,26 @@
     mova       [tmp1q+32*1], m13
     mova       [tmp1q+32*2], m14
     mova       [tmp1q+32*3], m15
-    mova                xm0,     [r2-32*3+ 0]
-    mova                xm1,     [r2-32*1+ 0]
-    vinserti128          m0, m0, [r2+32*1+ 0], 1
-    vinserti128          m1, m1, [r2+32*3+ 0], 1
-    mova                xm2,     [r2-32*3+16]
-    mova                xm3,     [r2-32*1+16]
-    vinserti128          m2, m2, [r2+32*1+16], 1
-    vinserti128          m3, m3, [r2+32*3+16], 1
+    mova                xm0, [r2-32*3+ 0]
+    mova                xm1, [r2-32*1+ 0]
+    vinserti128          m0, [r2+32*1+ 0], 1
+    vinserti128          m1, [r2+32*3+ 0], 1
+    mova                xm2, [r2-32*3+16]
+    mova                xm3, [r2-32*1+16]
+    vinserti128          m2, [r2+32*1+16], 1
+    vinserti128          m3, [r2+32*3+16], 1
     pxor                 m4, m4
     REPX       {mova x, m4}, m5, m6, m7
     test                r7d, r7d
     jl .fast2
-    mova                xm4,     [r3-32*3+ 0]
-    mova                xm5,     [r3-32*1+ 0]
-    vinserti128          m4, m4, [r3+32*1+ 0], 1
-    vinserti128          m5, m5, [r3+32*3+ 0], 1
-    mova                xm6,     [r3-32*3+16]
-    mova                xm7,     [r3-32*1+16]
-    vinserti128          m6, m6, [r3+32*1+16], 1
-    vinserti128          m7, m7, [r3+32*3+16], 1
+    mova                xm4, [r3-32*3+ 0]
+    mova                xm5, [r3-32*1+ 0]
+    vinserti128          m4, [r3+32*1+ 0], 1
+    vinserti128          m5, [r3+32*3+ 0], 1
+    mova                xm6, [r3-32*3+16]
+    mova                xm7, [r3-32*1+16]
+    vinserti128          m6, [r3+32*1+16], 1
+    vinserti128          m7, [r3+32*3+16], 1
 .fast2:
     add               tmp1q, 32*8
     lea               tmp2q, [tmp1q+32*8]
@@ -4603,27 +4603,27 @@
     vpbroadcastd        m15, [o(pd_2048)]
     add               tmp1q, 32*16
     add               tmp2q, 32*32
-    mova                xm0,     [r2-32*4+ 0]
-    mova                xm3,     [r2-32*1+16]
-    vinserti128          m0, m0, [r2+32*0+ 0], 1
-    vinserti128          m3, m3, [r2+32*3+16], 1
-    mova                xm4,     [r2-32*4+16]
-    mova                xm7,     [r2-32*1+ 0]
-    vinserti128          m4, m4, [r2+32*0+16], 1
-    vinserti128          m7, m7, [r2+32*3+ 0], 1
+    mova                xm0, [r2-32*4+ 0]
+    mova                xm3, [r2-32*1+16]
+    vinserti128          m0, [r2+32*0+ 0], 1
+    vinserti128          m3, [r2+32*3+16], 1
+    mova                xm4, [r2-32*4+16]
+    mova                xm7, [r2-32*1+ 0]
+    vinserti128          m4, [r2+32*0+16], 1
+    vinserti128          m7, [r2+32*3+ 0], 1
     pxor                 m1, m1
     REPX       {mova x, m1}, m2, m5, m6
     test                r7d, r7d
     jl .fast3
     add                  r3, 32*24
-    mova                xm1,     [r3-32*1+16]
-    mova                xm2,     [r3-32*4+ 0]
-    vinserti128          m1, m1, [r3+32*3+16], 1
-    vinserti128          m2, m2, [r3+32*0+ 0], 1
-    mova                xm5,     [r3-32*1+ 0]
-    mova                xm6,     [r3-32*4+16]
-    vinserti128          m5, m5, [r3+32*3+ 0], 1
-    vinserti128          m6, m6, [r3+32*0+16], 1
+    mova                xm1, [r3-32*1+16]
+    mova                xm2, [r3-32*4+ 0]
+    vinserti128          m1, [r3+32*3+16], 1
+    vinserti128          m2, [r3+32*0+ 0], 1
+    mova                xm5, [r3-32*1+ 0]
+    mova                xm6, [r3-32*4+16]
+    vinserti128          m5, [r3+32*3+ 0], 1
+    vinserti128          m6, [r3+32*0+16], 1
 .fast3:
     add                 rax, o_idct64_offset
     call m(inv_txfm_add_dct_dct_16x64).main_part1
@@ -4630,26 +4630,26 @@
     add                 rax, 8
     add               tmp1q, 32*8
     sub               tmp2q, 32*8
-    mova                xm0,     [r2-32*2+ 0]
-    mova                xm3,     [r2-32*3+16]
-    vinserti128          m0, m0, [r2+32*2+ 0], 1
-    vinserti128          m3, m3, [r2+32*1+16], 1
-    mova                xm4,     [r2-32*2+16]
-    mova                xm7,     [r2-32*3+ 0]
-    vinserti128          m4, m4, [r2+32*2+16], 1
-    vinserti128          m7, m7, [r2+32*1+ 0], 1
+    mova                xm0, [r2-32*2+ 0]
+    mova                xm3, [r2-32*3+16]
+    vinserti128          m0, [r2+32*2+ 0], 1
+    vinserti128          m3, [r2+32*1+16], 1
+    mova                xm4, [r2-32*2+16]
+    mova                xm7, [r2-32*3+ 0]
+    vinserti128          m4, [r2+32*2+16], 1
+    vinserti128          m7, [r2+32*1+ 0], 1
     pxor                 m1, m1
     REPX       {mova x, m1}, m2, m5, m6
     test                r7d, r7d
     jl .fast4
-    mova                xm1,     [r3-32*3+16]
-    mova                xm2,     [r3-32*2+ 0]
-    vinserti128          m1, m1, [r3+32*1+16], 1
-    vinserti128          m2, m2, [r3+32*2+ 0], 1
-    mova                xm5,     [r3-32*3+ 0]
-    mova                xm6,     [r3-32*2+16]
-    vinserti128          m5, m5, [r3+32*1+ 0], 1
-    vinserti128          m6, m6, [r3+32*2+16], 1
+    mova                xm1, [r3-32*3+16]
+    mova                xm2, [r3-32*2+ 0]
+    vinserti128          m1, [r3+32*1+16], 1
+    vinserti128          m2, [r3+32*2+ 0], 1
+    mova                xm5, [r3-32*3+ 0]
+    mova                xm6, [r3-32*2+16]
+    vinserti128          m5, [r3+32*1+ 0], 1
+    vinserti128          m6, [r3+32*2+16], 1
 .fast4:
     call m(inv_txfm_add_dct_dct_16x64).main_part1
     call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2
@@ -4933,38 +4933,38 @@
     mov               tmp2d, 4
 .pass2_loop:
     lea                  r3, [tmp1q-32*8]
-    mova                xm0,      [r3   -32*4]
-    mova                xm1,      [r3   -32*3]
-    vinserti128          m0, m0,  [tmp1q-32*4], 1
-    vinserti128          m1, m1,  [tmp1q-32*3], 1
-    mova                xm2,      [r3   -32*2]
-    mova                xm3,      [r3   -32*1]
-    vinserti128          m2, m2,  [tmp1q-32*2], 1
-    vinserti128          m3, m3,  [tmp1q-32*1], 1
-    mova                xm4,      [r3   +32*0]
-    mova                xm5,      [r3   +32*1]
-    vinserti128          m4, m4,  [tmp1q+32*0], 1
-    vinserti128          m5, m5,  [tmp1q+32*1], 1
-    mova                xm6,      [r3   +32*2]
-    mova                xm7,      [r3   +32*3]
-    vinserti128          m6, m6,  [tmp1q+32*2], 1
-    vinserti128          m7, m7,  [tmp1q+32*3], 1
-    mova                xm8,      [r3   -32*4+16]
-    mova                xm9,      [r3   -32*3+16]
-    vinserti128          m8, m8,  [tmp1q-32*4+16], 1
-    vinserti128          m9, m9,  [tmp1q-32*3+16], 1
-    mova               xm10,      [r3   -32*2+16]
-    mova               xm11,      [r3   -32*1+16]
-    vinserti128         m10, m10, [tmp1q-32*2+16], 1
-    vinserti128         m11, m11, [tmp1q-32*1+16], 1
-    mova               xm12,      [r3   +32*0+16]
-    mova               xm13,      [r3   +32*1+16]
-    vinserti128         m12, m12, [tmp1q+32*0+16], 1
-    vinserti128         m13, m13, [tmp1q+32*1+16], 1
-    mova               xm14,      [r3   +32*2+16]
-    mova               xm15,      [r3   +32*3+16]
-    vinserti128         m14, m14, [tmp1q+32*2+16], 1
-    vinserti128         m15, m15, [tmp1q+32*3+16], 1
+    mova                xm0, [r3   -32*4]
+    mova                xm1, [r3   -32*3]
+    vinserti128          m0, [tmp1q-32*4], 1
+    vinserti128          m1, [tmp1q-32*3], 1
+    mova                xm2, [r3   -32*2]
+    mova                xm3, [r3   -32*1]
+    vinserti128          m2, [tmp1q-32*2], 1
+    vinserti128          m3, [tmp1q-32*1], 1
+    mova                xm4, [r3   +32*0]
+    mova                xm5, [r3   +32*1]
+    vinserti128          m4, [tmp1q+32*0], 1
+    vinserti128          m5, [tmp1q+32*1], 1
+    mova                xm6, [r3   +32*2]
+    mova                xm7, [r3   +32*3]
+    vinserti128          m6, [tmp1q+32*2], 1
+    vinserti128          m7, [tmp1q+32*3], 1
+    mova                xm8, [r3   -32*4+16]
+    mova                xm9, [r3   -32*3+16]
+    vinserti128          m8, [tmp1q-32*4+16], 1
+    vinserti128          m9, [tmp1q-32*3+16], 1
+    mova               xm10, [r3   -32*2+16]
+    mova               xm11, [r3   -32*1+16]
+    vinserti128         m10, [tmp1q-32*2+16], 1
+    vinserti128         m11, [tmp1q-32*1+16], 1
+    mova               xm12, [r3   +32*0+16]
+    mova               xm13, [r3   +32*1+16]
+    vinserti128         m12, [tmp1q+32*0+16], 1
+    vinserti128         m13, [tmp1q+32*1+16], 1
+    mova               xm14, [r3   +32*2+16]
+    mova               xm15, [r3   +32*3+16]
+    vinserti128         m14, [tmp1q+32*2+16], 1
+    vinserti128         m15, [tmp1q+32*3+16], 1
     mova         [rsp+32*0], m6
     mova         [rsp+32*1], m7
     vpbroadcastd         m7, [o(pw_8192)]
@@ -5320,48 +5320,48 @@
     mov               tmp3d, 4
 .loop:
     lea               tmp2q, [tmp1q+32*8]
-    mova                xm0,      [tmp1q-32*4]
-    mova                xm1,      [tmp1q-32*3]
-    vinserti128          m0, m0,  [tmp2q-32*4], 1
-    vinserti128          m1, m1,  [tmp2q-32*3], 1
-    mova                xm2,      [tmp1q-32*2]
-    mova                xm3,      [tmp1q-32*1]
-    vinserti128          m2, m2,  [tmp2q-32*2], 1
-    vinserti128          m3, m3,  [tmp2q-32*1], 1
-    mova                xm4,      [tmp1q+32*0]
-    mova                xm5,      [tmp1q+32*1]
-    vinserti128          m4, m4,  [tmp2q+32*0], 1
-    vinserti128          m5, m5,  [tmp2q+32*1], 1
-    mova                xm6,      [tmp1q+32*2]
-    mova                xm7,      [tmp1q+32*3]
-    vinserti128          m6, m6,  [tmp2q+32*2], 1
-    vinserti128          m7, m7,  [tmp2q+32*3], 1
+    mova                xm0, [tmp1q-32*4]
+    mova                xm1, [tmp1q-32*3]
+    vinserti128          m0, [tmp2q-32*4], 1
+    vinserti128          m1, [tmp2q-32*3], 1
+    mova                xm2, [tmp1q-32*2]
+    mova                xm3, [tmp1q-32*1]
+    vinserti128          m2, [tmp2q-32*2], 1
+    vinserti128          m3, [tmp2q-32*1], 1
+    mova                xm4, [tmp1q+32*0]
+    mova                xm5, [tmp1q+32*1]
+    vinserti128          m4, [tmp2q+32*0], 1
+    vinserti128          m5, [tmp2q+32*1], 1
+    mova                xm6, [tmp1q+32*2]
+    mova                xm7, [tmp1q+32*3]
+    vinserti128          m6, [tmp2q+32*2], 1
+    vinserti128          m7, [tmp2q+32*3], 1
     REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
     call m(inv_txfm_add_identity_identity_8x32).transpose8x8
-    mova                xm8,      [tmp1q-32*4+16]
-    mova                xm9,      [tmp1q-32*3+16]
-    vinserti128          m8, m8,  [tmp2q-32*4+16], 1
-    vinserti128          m9, m9,  [tmp2q-32*3+16], 1
+    mova                xm8, [tmp1q-32*4+16]
+    mova                xm9, [tmp1q-32*3+16]
+    vinserti128          m8, [tmp2q-32*4+16], 1
+    vinserti128          m9, [tmp2q-32*3+16], 1
     mova       [tmp1q-32*4], m0
     mova       [tmp2q-32*4], m1
     mova       [tmp1q-32*3], m2
     mova       [tmp2q-32*3], m3
-    mova                xm2,     [tmp1q-32*2+16]
-    mova                xm3,     [tmp1q-32*1+16]
-    vinserti128          m2, m2, [tmp2q-32*2+16], 1
-    vinserti128          m3, m3, [tmp2q-32*1+16], 1
+    mova                xm2, [tmp1q-32*2+16]
+    mova                xm3, [tmp1q-32*1+16]
+    vinserti128          m2, [tmp2q-32*2+16], 1
+    vinserti128          m3, [tmp2q-32*1+16], 1
     mova       [tmp1q-32*2], m4
     mova       [tmp2q-32*2], m5
     mova       [tmp1q-32*1], m6
     mova       [tmp2q-32*1], m7
-    mova                xm4,     [tmp1q+32*0+16]
-    mova                xm5,     [tmp1q+32*1+16]
-    vinserti128          m4, m4, [tmp2q+32*0+16], 1
-    vinserti128          m5, m5, [tmp2q+32*1+16], 1
-    mova                xm6,     [tmp1q+32*2+16]
-    mova                xm7,     [tmp1q+32*3+16]
-    vinserti128          m6, m6, [tmp2q+32*2+16], 1
-    vinserti128          m7, m7, [tmp2q+32*3+16], 1
+    mova                xm4, [tmp1q+32*0+16]
+    mova                xm5, [tmp1q+32*1+16]
+    vinserti128          m4, [tmp2q+32*0+16], 1
+    vinserti128          m5, [tmp2q+32*1+16], 1
+    mova                xm6, [tmp1q+32*2+16]
+    mova                xm7, [tmp1q+32*3+16]
+    vinserti128          m6, [tmp2q+32*2+16], 1
+    vinserti128          m7, [tmp2q+32*3+16], 1
     pmulhrsw             m0, m8, m10
     pmulhrsw             m1, m9, m10
     REPX  {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7