shithub: dav1d

Download patch

ref: 3b33c52d739da0e8a1e7214c84c49f3298a68ad0
parent: 6c81623eab4154e1387681695d51a52a08ec4eec
author: Henrik Gramner <gramner@twoorioles.com>
date: Thu Oct 24 10:37:46 EDT 2019

x86: Add minor ipred_z AVX2 optimizations

--- a/src/x86/ipred.asm
+++ b/src/x86/ipred.asm
@@ -1485,7 +1485,7 @@
     pmaddubsw            m0, m1
     pcmpgtw              m1, m9, m6 ; base < max_base_x
     pmulhrsw             m0, m3
-    paddsw               m6, m10    ; xpos += dx
+    paddw                m6, m10    ; xpos += dx
     lea                  r5, [dstq+strideq*2]
     vpblendvb            m0, m7, m0, m1
     packuswb             m0, m0
@@ -1494,9 +1494,9 @@
     pextrd [r5  +strideq*1], xm0, 1
     movd   [dstq+strideq*0], xm1
     pextrd [dstq+strideq*1], xm1, 1
-    lea                dstq, [dstq+strideq*4]
     sub                  hd, 4
     jz .w4_end
+    lea                dstq, [dstq+strideq*4]
     cmp                 r3d, maxbased
     jb .w4_loop
     packuswb            xm7, xm7
@@ -1662,7 +1662,7 @@
     pshufb               m0, m8
     pmaddubsw            m0, m1
     pcmpgtw              m1, m9, m2
-    paddsw               m2, m6
+    paddw                m2, m6
     pmulhrsw             m0, m3
     vpblendvb            m0, m7, m0, m1
     vextracti128        xm1, m0, 1
@@ -1669,9 +1669,9 @@
     packuswb            xm0, xm1
     movq   [dstq+strideq*0], xm0
     movhps [dstq+strideq*1], xm0
-    lea                dstq, [dstq+strideq*2]
     sub                  hd, 2
     jz .w8_end
+    lea                dstq, [dstq+strideq*2]
     cmp                 r3d, maxbased
     jb .w8_loop
     packuswb            xm7, xm7
@@ -1788,13 +1788,13 @@
     pcmpgtw              m1, m9, m6
     pcmpgtw              m2, m10, m6
     packsswb             m1, m2
-    paddsw               m6, m11
+    paddw                m6, m11
     vpblendvb            m0, m7, m0, m1
     mova         [dstq+strideq*0], xm0
     vextracti128 [dstq+strideq*1], m0, 1
-    lea                dstq, [dstq+strideq*2]
     sub                  hd, 2
     jz .w16_end
+    lea                dstq, [dstq+strideq*2]
     cmp                 r3d, maxbased
     jb .w16_loop
 .w16_end_loop:
@@ -1903,20 +1903,20 @@
     movd                xm9, maxbased
     vbroadcasti128       m8, [z_filter_s+2]
     vpbroadcastw         m9, xm9
-    mov                 r3d, dxd
+    mov                 r5d, dxd
     psubw                m9, [z_base_inc]
     mova                m11, m6
     psubw               m10, m9, m3 ; 64*8
 .w32_loop:
-    mov                 r5d, r3d
-    shr                 r5d, 6
+    mov                 r3d, r5d
+    shr                 r3d, 6
     pand                 m1, m4, m6
     psubw                m2, m5, m1
     psllw                m1, 8
     por                  m2, m1
-    movu                 m0, [tlq+r5+0]
-    movu                 m1, [tlq+r5+8]
-    add                 r3d, dxd
+    movu                 m0, [tlq+r3+0]
+    movu                 m1, [tlq+r3+8]
+    add                 r5d, dxd
     pshufb               m0, m8
     pshufb               m1, m8
     pmaddubsw            m0, m2
@@ -1927,13 +1927,13 @@
     pcmpgtw              m1, m9, m6
     pcmpgtw              m2, m10, m6
     packsswb             m1, m2
-    paddsw               m6, m11
+    paddw                m6, m11
     vpblendvb            m0, m7, m0, m1
     mova             [dstq], m0
-    add                dstq, strideq
     dec                  hd
     jz .w32_end
-    cmp                 r3d, maxbased
+    add                dstq, strideq
+    cmp                 r5d, maxbased
     jb .w32_loop
     test                 hb, 1
     jz .w32_end_loop
@@ -2074,25 +2074,23 @@
     mova         [tlq+32*2], m0
     mova         [tlq+32*3], m1
 .w64_main:
-    movd                xm6, dxd
+    movd               xm12, dxd
     vpbroadcastb         m7, [tlq+maxbaseq]
+    lea                 r3d, [dxq-64]
     shl            maxbased, 6
-    vpbroadcastw         m6, xm6
-    movd               xm10, maxbased
+    vpbroadcastw        m12, xm12
+    sub                 r3d, maxbased
     vbroadcasti128       m8, [z_filter_s+2]
-    mov                 r3d, dxd
-    vpbroadcastw        m10, xm10
-    psllw                m0, m3, 2   ; 64*32
-    psubw               m10, [z_base_inc]
-    mova                m14, m6
-    psubw               m11, m10, m3 ; 64*8
-    psubw               m12, m10, m0
-    psubw               m13, m11, m0
+    movd                xm6, r3d
+    mov                 r5d, dxd
+    mova                m10, [pb_1to32]
+    vpbroadcastd        m11, [pb_32]
+    vpbroadcastw         m6, xm6
 .w64_loop:
-    mov                 r5d, r3d
-    shr                 r5d, 6
-    movu                 m0, [tlq+r5+ 0]
-    movu                 m1, [tlq+r5+ 8]
+    mov                 r3d, r5d
+    shr                 r3d, 6
+    movu                 m0, [tlq+r3+ 0]
+    movu                 m1, [tlq+r3+ 8]
     pand                 m2, m4, m6
     psubw                m9, m5, m2
     psllw                m2, 8
@@ -2101,34 +2099,32 @@
     pshufb               m1, m8
     pmaddubsw            m0, m9
     pmaddubsw            m1, m9
+    psraw                m2, m6, 6
     pmulhrsw             m0, m3
     pmulhrsw             m1, m3
+    packsswb             m2, m2
+    paddb                m2, m10
     packuswb             m0, m1
-    pcmpgtw              m1, m10, m6
-    pcmpgtw              m2, m11, m6
-    packsswb             m1, m2
-    vpblendvb            m2, m7, m0, m1
-    movu                 m0, [tlq+r5+32]
-    movu                 m1, [tlq+r5+40]
-    add                 r3d, dxd
-    mova          [dstq+ 0], m2
+    vpblendvb            m0, m7, m0, m2
+    mova          [dstq+ 0], m0
+    movu                 m0, [tlq+r3+32]
+    movu                 m1, [tlq+r3+40]
+    add                 r5d, dxd
     pshufb               m0, m8
     pshufb               m1, m8
     pmaddubsw            m0, m9
     pmaddubsw            m1, m9
-    pcmpgtw              m9, m12, m6
-    pcmpgtw              m2, m13, m6
+    paddb                m2, m11
     pmulhrsw             m0, m3
     pmulhrsw             m1, m3
-    paddsw               m6, m14
-    packsswb             m9, m2
+    paddw                m6, m12
     packuswb             m0, m1
-    vpblendvb            m0, m7, m0, m9
+    vpblendvb            m0, m7, m0, m2
     mova          [dstq+32], m0
-    add                dstq, strideq
     dec                  hd
     jz .w64_end
-    cmp                 r3d, maxbased
+    add                dstq, strideq
+    cmp                 r5d, maxbased
     jb .w64_loop
 .w64_end_loop:
     mova          [dstq+ 0], m7
@@ -2384,7 +2380,7 @@
     vpblendvb            m0, m1, m2
 .w4_toponly:
     pmulhrsw             m0, m13
-    paddsw               m6, m7        ; xpos += dx
+    paddw                m6, m7        ; xpos += dx
     add                  r5, dyq
     packuswb             m0, m0
     vextracti128        xm1, m0, 1
@@ -2392,9 +2388,9 @@
     pextrd [dstq+r9       ], xm0, 1
     movd   [dstq+strideq*0], xm1
     pextrd [dstq+strideq*1], xm1, 1
-    lea                dstq, [dstq+strideq*4]
     sub                  hd, 4
     jz .w4_end
+    lea                dstq, [dstq+strideq*4]
     cmp                 r2d, r8d
     jge .w4_loop
 .w4_leftonly_loop:
@@ -2604,7 +2600,7 @@
 .w8_toponly:
     pmulhrsw             m0, m13
     pmulhrsw             m1, m13
-    paddsw               m6, m4, m7     ; xpos += dx
+    paddw                m6, m4, m7     ; xpos += dx
     add                  r5, dyq
     packuswb             m0, m1
     vextracti128        xm1, m0, 1
@@ -2612,9 +2608,9 @@
     movhps [dstq+strideq*2], xm0
     movq   [dstq+strideq*1], xm1
     movhps [dstq+r9       ], xm1
-    lea                dstq, [dstq+strideq*4]
     sub                  hd, 4
     jz .w8_end
+    lea                dstq, [dstq+strideq*4]
     cmp                 r2d, r8d
     jge .w8_loop
 .w8_leftonly_loop:
@@ -2841,15 +2837,15 @@
 .w16_toponly:
     pmulhrsw             m0, m13
     pmulhrsw             m1, m13
-    paddsw               m6, m5, m7   ; xpos += dx
+    paddw                m6, m5, m7   ; xpos += dx
     sub                  r5, 2
     packuswb             m0, m1
     vpermq               m0, m0, q3120
     mova         [dstq+strideq*0], xm0
     vextracti128 [dstq+strideq*1], m0, 1
-    lea                dstq, [dstq+strideq*2]
     sub                  hd, 2
     jz .w16_end
+    lea                dstq, [dstq+strideq*2]
     cmp                 r2d, (63-16)<<6
     jge .w16_loop
 .w16_leftonly_loop:
@@ -3135,9 +3131,9 @@
     vpbroadcastb         m7, [r4]
     lea                  r4, [dyq+63] ; ypos
     movd                xm9, maxbased
-    sub            maxbased, 63
+    not            maxbased
     vbroadcasti128       m8, [z3_shuf_w4]
-    neg            maxbaseq
+    add            maxbased, 64
     vpbroadcastw         m9, xm9
     psrlw                m7, 8  ; top[max_base_y]
     paddw               m10, m6, m6
@@ -3170,7 +3166,7 @@
     pmaddubsw            m0, m1
     pcmpgtw              m1, m9, m6 ; base < max_base_y
     pmulhrsw             m0, m3
-    paddsw               m6, m10    ; ypos += dy
+    paddw                m6, m10    ; ypos += dy
     vpblendvb            m0, m7, m0, m1
     vextracti128        xm1, m0, 1
     packuswb            xm1, xm0
@@ -3179,9 +3175,9 @@
     pextrd [dstq+strideq*1], xm1, 1
     pextrd [dstq+strideq*2], xm1, 2
     pextrd [dstq+r7       ], xm1, 3
-    add                dstq, 4
     sub                  wd, 4
     jz .h4_end
+    add                dstq, 4
     cmp                 r4d, maxbased
     jg .h4_loop
     packuswb            xm7, xm7
@@ -3344,9 +3340,9 @@
     vpbroadcastb         m7, [r4]
     lea                  r4, [dyq+63]
     movd                xm9, maxbased
-    sub            maxbased, 63
+    not            maxbased
     vbroadcasti128       m8, [z3_shuf]
-    neg            maxbaseq
+    add            maxbased, 64
     vpbroadcastw         m9, xm9
     psrlw                m7, 8
     psubw                m9, m0
@@ -3367,7 +3363,7 @@
     pshufb               m0, m8
     pmaddubsw            m0, m1
     pcmpgtw              m1, m9, m2
-    paddsw               m2, m6
+    paddw                m2, m6
     pmulhrsw             m0, m3
     vpblendvb            m0, m7, m0, m1
     vextracti128        xm1, m0, 1
@@ -3516,9 +3512,9 @@
     vpbroadcastb         m7, [r4]
     lea                  r4, [dyq+63]
     movd                xm9, maxbased
-    sub            maxbased, 63
+    not            maxbased
     vbroadcasti128       m8, [z3_shuf]
-    neg            maxbaseq
+    add            maxbased, 64
     vpbroadcastw         m9, xm9
     psubw                m9, m0
     paddw               m11, m6, m6
@@ -3548,7 +3544,7 @@
     pcmpgtw              m1, m9, m6
     pcmpgtw              m2, m10, m6
     packsswb             m1, m2
-    paddsw               m6, m11
+    paddw                m6, m11
     vpblendvb            m0, m7, m0, m1
     vpermq               m0, m0, q3120
     mova              [rsp], m0
@@ -3742,9 +3738,9 @@
     vpbroadcastb         m7, [r4]
     lea                  r4, [dyq+63]
     movd                xm9, maxbased
-    sub            maxbased, 63
+    not            maxbased
     vbroadcasti128       m8, [z3_shuf]
-    neg            maxbaseq
+    add            maxbased, 64
     vpbroadcastw         m9, xm9
     psubw                m9, [z_base_inc]
     mova                m11, m6
@@ -3772,7 +3768,7 @@
     pcmpgtw              m1, m9, m6
     pcmpgtw              m2, m10, m6
     packsswb             m1, m2
-    paddsw               m6, m11
+    paddw                m6, m11
     vpblendvb            m0, m7, m0, m1
     mova              [rsp], m0
     dec                  wd
@@ -3996,33 +3992,26 @@
     mova           [tlq-63], m0
     mova           [tlq-31], m1
 .h64_main:
-    movd                xm6, dyd
-    mov                  r4, tlq
-    sub                 tlq, 24
-    neg                 dyq
-    vpbroadcastw         m6, xm6
-    sub                  r4, maxbaseq
+    movd               xm12, dyd
+    neg            maxbaseq
+    vbroadcasti128       m8, [z3_shuf]
+    vpbroadcastb         m7, [tlq+maxbaseq]
     shl            maxbased, 6
-    vpbroadcastb         m7, [r4]
+    vpbroadcastw        m12, xm12
+    lea                 r5d, [dyq+maxbaseq-64]
+    neg                 dyq
+    or             maxbased, 63
     lea                  r4, [dyq+63]
-    movd               xm10, maxbased
-    sub            maxbased, 63
-    vbroadcasti128       m8, [z3_shuf]
-    neg            maxbaseq
-    mova                xm1, [z_base_inc+16]
-    vinserti128          m1, [z_base_inc], 1
-    vpbroadcastw        m10, xm10
-    psllw                m0, m3, 2   ; 64*32
-    psubw               m10, m1
-    mova                m14, m6
-    psubw               m11, m10, m3 ; 64*8
-    psubw               m12, m10, m0
-    psubw               m13, m11, m0
+    movd                xm6, r5d
+    mova               xm10, [pb_1to32+16]
+    vinserti128         m10, [pb_1to32], 1
+    vpbroadcastd        m11, [pb_32]
+    vpbroadcastw         m6, xm6
 .h64_loop:
     mov                  r5, r4
     sar                  r5, 6
-    movu                 m0, [tlq+r5-0]
-    movu                 m1, [tlq+r5-8]
+    movu                 m0, [tlq+r5-24]
+    movu                 m1, [tlq+r5-32]
     pand                 m2, m4, m6
     psubw                m9, m5, m2
     psllw                m2, 8
@@ -4031,30 +4020,28 @@
     pshufb               m1, m8
     pmaddubsw            m0, m9
     pmaddubsw            m1, m9
+    psraw                m2, m6, 6
+    sub                 rsp, 64
     pmulhrsw             m0, m3
     pmulhrsw             m1, m3
+    packsswb             m2, m2
+    paddb                m2, m10
     packuswb             m0, m1
-    pcmpgtw              m1, m10, m6
-    pcmpgtw              m2, m11, m6
-    packsswb             m1, m2
-    vpblendvb            m2, m7, m0, m1
-    movu                 m0, [tlq+r5-32]
-    movu                 m1, [tlq+r5-40]
+    vpblendvb            m0, m7, m0, m2
+    mova           [rsp+32], m0
+    movu                 m0, [tlq+r5-56]
+    movu                 m1, [tlq+r5-64]
     add                  r4, dyq
-    sub                 rsp, 64
-    mova           [rsp+32], m2
     pshufb               m0, m8
     pshufb               m1, m8
     pmaddubsw            m0, m9
     pmaddubsw            m1, m9
-    pcmpgtw              m9, m12, m6
-    pcmpgtw              m2, m13, m6
+    paddb                m2, m11
     pmulhrsw             m0, m3
     pmulhrsw             m1, m3
-    paddsw               m6, m14
-    packsswb             m9, m2
+    paddw                m6, m12
     packuswb             m0, m1
-    vpblendvb            m0, m7, m0, m9
+    vpblendvb            m0, m7, m0, m2
     mova              [rsp], m0
     dec                  wd
     jz .h64_transpose