shithub: dav1d

Download patch

ref: 3dda2dd62e80516476bbd5575b972e002bba9066
parent: 3cf4d32e74e38b99036c21b7d2d0fb2108223221
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Feb 11 12:22:01 EST 2019

x86: Optimize MC w_avg

--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -2978,13 +2978,12 @@
 %macro W_AVG 1 ; src_offset
     ; (a * weight + b * (16 - weight) + 128) >> 8
     ; = ((a - b) * weight + (b << 4) + 128) >> 8
-    ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
-    mova                 m0,     [tmp2q+(%1+0)*mmsize]
-    psubw                m2, m0, [tmp1q+(%1+0)*mmsize]
-    mova                 m1,     [tmp2q+(%1+1)*mmsize]
-    psubw                m3, m1, [tmp1q+(%1+1)*mmsize]
-    paddw                m2, m2 ; compensate for the weight only being half
-    paddw                m3, m3 ; of what it should be
+    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
+    mova                 m0,     [tmp1q+(%1+0)*mmsize]
+    psubw                m2, m0, [tmp2q+(%1+0)*mmsize]
+    mova                 m1,     [tmp1q+(%1+1)*mmsize]
+    psubw                m3, m1, [tmp2q+(%1+1)*mmsize]
     pmulhw               m2, m4
     pmulhw               m3, m4
     paddw                m0, m2
@@ -3000,13 +2999,19 @@
     lea                  r6, [w_avg_avx2_table]
     tzcnt                wd, wm
     movifnidn            hd, hm
-    vpbroadcastw         m0, r6m ; weight
+    vpbroadcastw         m4, r6m ; weight
     movsxd               wq, dword [r6+wq*4]
-    pxor                 m4, m4
-    psllw                m0, 11 ; can't shift by 12, sign bit must be preserved
-    psubw                m4, m0
     vpbroadcastd         m5, [pw_2048+r6-w_avg_avx2_table]
+    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
     add                  wq, r6
+    cmp           dword r6m, 7
+    jg .weight_gt7
+    mov                  r6, tmp1q
+    pxor                 m0, m0
+    mov               tmp1q, tmp2q
+    psubw                m4, m0, m4 ; -weight
+    mov               tmp2q, r6
+.weight_gt7:
     BIDIR_FN          W_AVG
 
 %macro MASK 1 ; src_offset
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -828,16 +828,17 @@
 %macro W_AVG 1 ; src_offset
     ; (a * weight + b * (16 - weight) + 128) >> 8
     ; = ((a - b) * weight + (b << 4) + 128) >> 8
-    ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
-    mova                 m0,     [tmp2q+(%1+0)*mmsize]
-    psubw                m2, m0, [tmp1q+(%1+0)*mmsize]
-    mova                 m1,     [tmp2q+(%1+1)*mmsize]
-    psubw                m3, m1, [tmp1q+(%1+1)*mmsize]
-    paddw                m2, m2 ; compensate for the weight only being half
-    paddw                m3, m3 ; of what it should be
-    pmulhw               m2, m4 ; (b-a) * (-weight << 12)
-    pmulhw               m3, m4 ; (b-a) * (-weight << 12)
-    paddw                m0, m2 ; ((b-a) * -weight) + b
+    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
+    mova                 m2, [tmp1q+(%1+0)*mmsize]
+    mova                 m0, m2
+    psubw                m2, [tmp2q+(%1+0)*mmsize]
+    mova                 m3, [tmp1q+(%1+1)*mmsize]
+    mova                 m1, m3
+    psubw                m3, [tmp2q+(%1+1)*mmsize]
+    pmulhw               m2, m4
+    pmulhw               m3, m4
+    paddw                m0, m2
     paddw                m1, m3
     pmulhrsw             m0, m5
     pmulhrsw             m1, m5
@@ -849,16 +850,22 @@
 cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
     LEA                  r6, w_avg_ssse3_table
     tzcnt                wd, wm
+    movd                 m4, r6m
     movifnidn            hd, hm
-    movd                 m0, r6m
-    pshuflw              m0, m0, q0000
-    punpcklqdq           m0, m0
+    pxor                 m0, m0
     movsxd               wq, dword [r6+wq*4]
-    pxor                 m4, m4
-    psllw                m0, 11 ; can't shift by 12, sign bit must be preserved
-    psubw                m4, m0
     mova                 m5, [pw_2048+r6-w_avg_ssse3_table]
+    pshufb               m4, m0
+    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
     add                  wq, r6
+    cmp           dword r6m, 7
+    jg .weight_gt7
+    mov                  r6, tmp1q
+    psubw                m0, m4
+    mov               tmp1q, tmp2q
+    mova                 m4, m0 ; -weight
+    mov               tmp2q, r6
+.weight_gt7:
     BIDIR_FN          W_AVG
 
 %macro MASK 1 ; src_offset