shithub: dav1d

ref: 080200166645d1c2026be2fac8392e5ab21e557a
parent: 585ac4624890355f6df219cf081d055d367347c2
author: Henrik Gramner <gramner@twoorioles.com>
date: Wed Mar 20 15:16:13 EDT 2019

x86: Add minor CDEF AVX2 optimizations
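
Besides switching the edge-flag checks to byte-sized tests (test edgeb, imm
encodes more compactly than test edged, imm, since TEST has no imm8 form for
32-bit operands), the patch streamlines the pri/sec shift clamping so the add
itself supplies the flags for cmovl, broadcasts the strengths straight from
registers on UNIX64 (and from memory elsewhere), and drops a shuffle from the
direction-cost reduction.

A rough scalar sketch of the reworked shift computation (illustrative C, not
part of the patch; the function name is made up):

    /* shift = max(0, damping - floor(log2(strength))).
     * lzcnt(strength) == 31 - floor(log2(strength)) for strength > 0,
     * and 32 for strength == 0, which still clamps to a shift >= 0. */
    static inline int cdef_shift(unsigned strength, int damping)
    {
        int lz = strength ? __builtin_clz(strength) : 32; /* lzcnt semantics */
        int shift = lz + (damping - 31);  /* the add sets the sign for cmovl */
        return shift < 0 ? 0 : shift;     /* cmovl-style clamp to zero */
    }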

--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -135,7 +135,7 @@
     lea          dst4q, [dstq+strideq*4]
 %endif
     lea       stride3q, [strideq*3]
-    test         edged, 2                   ; have_right
+    test         edgeb, 2                   ; have_right
     jz .no_right
     pmovzxbw        m1, [dstq+strideq*0]
     pmovzxbw        m2, [dstq+strideq*1]
@@ -217,13 +217,13 @@
 
     ; top
     DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge
-    test         edged, 4                    ; have_top
+    test         edgeb, 4                    ; have_top
     jz .no_top
     mov          top1q, [top2q+0*gprsize]
     mov          top2q, [top2q+1*gprsize]
-    test         edged, 1                    ; have_left
+    test         edgeb, 1                    ; have_left
     jz .top_no_left
-    test         edged, 2                    ; have_right
+    test         edgeb, 2                    ; have_right
     jz .top_no_right
     pmovzxbw        m1, [top1q-(%1/2)]
     pmovzxbw        m2, [top2q-(%1/2)]
@@ -239,7 +239,7 @@
     movd [px-1*%3+%1*2], xm14
     jmp .top_done
 .top_no_left:
-    test         edged, 2                   ; have_right
+    test         edgeb, 2                   ; have_right
     jz .top_no_left_right
     pmovzxbw        m1, [top1q]
     pmovzxbw        m2, [top2q]
@@ -272,7 +272,7 @@
 .top_done:
 
     ; left
-    test         edged, 1                   ; have_left
+    test         edgeb, 1                   ; have_left
     jz .no_left
     pmovzxbw       xm1, [leftq+ 0]
 %if %2 == 8
@@ -304,12 +304,12 @@
 
     ; bottom
     DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge
-    test         edged, 8                   ; have_bottom
+    test         edgeb, 8                   ; have_bottom
     jz .no_bottom
     lea          dst8q, [dstq+%2*strideq]
-    test         edged, 1                   ; have_left
+    test         edgeb, 1                   ; have_left
     jz .bottom_no_left
-    test         edged, 2                   ; have_right
+    test         edgeb, 2                   ; have_right
     jz .bottom_no_right
     pmovzxbw        m1, [dst8q-(%1/2)]
     pmovzxbw        m2, [dst8q+strideq-(%1/2)]
@@ -328,7 +328,7 @@
     movd  [px+(%2+1)*%3+%1*2], xm14
     jmp .bottom_done
 .bottom_no_left:
-    test          edged, 2                  ; have_right
+    test          edgeb, 2                  ; have_right
     jz .bottom_no_left_right
     pmovzxbw        m1, [dst8q]
     pmovzxbw        m2, [dst8q+strideq]
@@ -362,50 +362,49 @@
 
     ; actual filter
     INIT_YMM avx2
-    DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp
+    DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero
 %undef edged
     ; register to shuffle values into after packing
     vbroadcasti128 m12, [shufb_lohi]
 
     movifnidn     prid, prim
-    movifnidn     secd, secm
     mov       dampingd, r7m
-
-    mov        pridmpd, prid
-    mov        secdmpd, secd
-    or         pridmpd, 1
-    or         secdmpd, 1
-    lzcnt      pridmpd, pridmpd
-    lzcnt      secdmpd, secdmpd
-    lea        pridmpd, [pridmpd+dampingd-31]
-    lea        secdmpd, [secdmpd+dampingd-31]
-    xor       dampingd, dampingd
-    test       pridmpd, pridmpd
-    cmovl      pridmpd, dampingd
-    test       secdmpd, secdmpd
-    cmovl      secdmpd, dampingd
+    lzcnt      pridmpd, prid
+%if UNIX64
+    movd           xm0, prid
+    movd           xm1, secdmpd
+%endif
+    lzcnt      secdmpd, secdmpm
+    sub       dampingd, 31
+    xor          zerod, zerod
+    add        pridmpd, dampingd
+    cmovl      pridmpd, zerod
+    add        secdmpd, dampingd
+    cmovl      secdmpd, zerod
     mov        [rsp+0], pridmpq                 ; pri_shift
     mov        [rsp+8], secdmpq                 ; sec_shift
 
-    DEFINE_ARGS dst, stride, pridmp, table, pri, sec, stride3, secdmp
+    DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
     lea         tableq, [tap_table]
     vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
     vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
 
     ; pri/sec_taps[k] [4 total]
-    DEFINE_ARGS dst, stride, dummy, table, pri, sec, stride3
-    movd           xm0, prid
-    movd           xm1, secd
+    DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3
+%if UNIX64
     vpbroadcastb    m0, xm0                     ; pri_strength
     vpbroadcastb    m1, xm1                     ; sec_strength
+%else
+    vpbroadcastb    m0, prim
+    vpbroadcastb    m1, secm
+%endif
     and           prid, 1
     lea           priq, [tableq+priq*2+8]       ; pri_taps
     lea           secq, [tableq+12]             ; sec_taps
 
     ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
-    DEFINE_ARGS dst, stride, dir, tap, pri, sec, stride3
     mov           dird, r6m
-    lea           dirq, [tapq+dirq*2+14]
+    lea           dirq, [tableq+dirq*2+14]
 %if %1*%2*2/mmsize > 1
  %if %1 == 4
     DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
@@ -614,9 +613,9 @@
     paddw          m11, m13                 ; partial_sum_alt[3/2] right
     vbroadcasti128 m13, [div_table+32]
     paddw           m4, m5                  ; partial_sum_alt[3/2] left
-    pshuflw        m11, m11, q3012
-    punpckhwd       m6, m4, m11
-    punpcklwd       m4, m11
+    pshuflw         m5, m11, q3012
+    punpckhwd       m6, m11, m4
+    punpcklwd       m4, m5
     pmaddwd         m6, m6
     pmaddwd         m4, m4
     pmulld          m6, m12
@@ -642,14 +641,14 @@
     paddw           m6, m7
     paddw           m1, m3                  ; partial_sum_alt[0/1] right
     paddw           m5, m6                  ; partial_sum_alt[0/1] left
-    pshuflw         m1, m1, q3012
-    punpckhwd       m6, m5, m1
-    punpcklwd       m5, m1
-    pmaddwd         m6, m6
+    pshuflw         m0, m1, q3012
+    punpckhwd       m1, m5
+    punpcklwd       m5, m0
+    pmaddwd         m1, m1
     pmaddwd         m5, m5
-    pmulld          m6, m12
+    pmulld          m1, m12
     pmulld          m5, m13
-    paddd           m5, m6                  ; cost1[a-d] | cost3[a-d]
+    paddd           m5, m1                  ; cost1[a-d] | cost3[a-d]
 
     mova           xm0, [pd_47130256+ 16]
     mova            m1, [pd_47130256]
@@ -661,11 +660,10 @@
 
     ; now find the best cost
     pmaxsd         xm2, xm0, xm1
-    pshufd         xm3, xm2, q3232
+    pshufd         xm3, xm2, q1032
     pmaxsd         xm2, xm3
-    pshufd         xm3, xm2, q1111
-    pmaxsd         xm2, xm3
-    pshufd         xm2, xm2, q0000 ; best cost
+    pshufd         xm3, xm2, q2301
+    pmaxsd         xm2, xm3 ; best cost
 
     ; find the idx using minpos
     ; make everything other than the best cost negative via subtraction
@@ -676,7 +674,7 @@
     phminposuw     xm3, xm3
 
     ; convert idx to 32-bits
-    psrldq         xm3, 2
+    psrld          xm3, 16
     movd           eax, xm3
 
     ; get idx^4 complement