shithub: dav1d

Download patch

ref: bf7adb75676f834ef279d79e7001ec6375619a38
parent: 53e7b21e34d0536c55b0b8ba120c2180726190b4
author: Henrik Gramner <gramner@twoorioles.com>
date: Thu Jun 18 19:14:51 EDT 2020

x86: Branch before waiting on popcnt in ipred_z AVX2 functions

Some specific Haswell CPUs have a hardware bug where the popcnt
instruction doesn't set the zero flag correctly, which causes the wrong
branch to be taken.

popcnt also has a 3-cycle latency on Intel CPUs, so doing the branch
on the input value instead of the output reduces the amount of time
wasted going down the wrong code path in case of branch mispredictions.

--- a/src/x86/ipred.asm
+++ b/src/x86/ipred.asm
@@ -1412,7 +1412,6 @@
     mova                xm2, [r3+angleq*8] ; upper ymm half zero in both cases
     pcmpgtb              m1, m2
     pmovmskb            r5d, m1
-    popcnt              r5d, r5d ; sets ZF which can be used by caller
     ret
 .w4_no_upsample:
     %assign stack_offset org_stack_offset
@@ -1423,7 +1422,9 @@
     lea            maxbased, [hq+3]
     call .filter_strength
     mov            maxbased, 7
+    test                r5d, r5d
     jz .w4_main ; filter_strength == 0
+    popcnt              r5d, r5d
     vpbroadcastd         m7, [base+pb_8]
     vbroadcasti128       m2, [tlq-1]
     pminub               m1, m7, [base+z_filter_s]
@@ -1596,7 +1597,9 @@
     test             angled, 0x400
     jnz .w8_no_intra_edge_filter
     call .filter_strength
+    test                r5d, r5d
     jz .w8_main ; filter_strength == 0
+    popcnt              r5d, r5d
     movu                xm2, [tlq]
     pminub              xm1, xm0, [base+z_filter_s+14]
     vinserti128          m2, [tlq-1], 1
@@ -1698,7 +1701,9 @@
     test             angled, 0x400
     jnz .w16_no_intra_edge_filter
     call .filter_strength
+    test                r5d, r5d
     jz .w16_main ; filter_strength == 0
+    popcnt              r5d, r5d
     vpbroadcastd         m1, [base+pb_12]
     vbroadcasti128       m6, [base+z_filter_s+8]
     vinserti128          m2, m6, [base+z_filter_s], 0
@@ -2205,7 +2210,6 @@
     pand                 m0, m8, m7
     pcmpgtb              m0, m9
     pmovmskb            r3d, m0
-    popcnt              r3d, r3d
     ret
 ALIGN function_align
 .upsample_above: ; w4/w8
@@ -2255,7 +2259,9 @@
     lea                 r3d, [hq+3]
     sub              angled, 1112 ; angle - 90
     call .filter_strength
+    test                r3d, r3d
     jz .w4_no_filter_above
+    popcnt              r3d, r3d
     vpbroadcastd        xm2, [base+pb_4]
     pminub              xm2, [base+z_filter_s]
     vpbroadcastd        xm0, [base+z_filter_k-4+r3*4+12*0]
@@ -2290,9 +2296,10 @@
     pand                xm0, xm8 ; reuse from previous filter_strength call
     pcmpgtb             xm0, xm9
     pmovmskb            r3d, xm0
-    popcnt              r3d, r3d
 .w4_filter_left:
+    test                r3d, r3d
     jz .w4_main
+    popcnt              r3d, r3d
     mov                 r5d, 10
     cmp                  hd, 16
     movu                xm2, [rsp+49]
@@ -2443,7 +2450,9 @@
     lea                 r3d, [hq+7]
     sub              angled, 90 ; angle - 90
     call .filter_strength
+    test                r3d, r3d
     jz .w8_no_filter_above
+    popcnt              r3d, r3d
     vpbroadcastd        xm3, [base+pb_8]
     pminub              xm3, [base+z_filter_s+8]
     vpbroadcastd        xm0, [base+z_filter_k-4+r3*4+12*0]
@@ -2476,9 +2485,10 @@
     pand                 m0, m8
     pcmpgtb              m0, m9
     pmovmskb            r3d, m0
-    popcnt              r3d, r3d
 .w8_filter_left:
+    test                r3d, r3d
     jz .w8_main
+    popcnt              r3d, r3d
     vpbroadcastd         m7, [base+z_filter_k-4+r3*4+12*0]
     vpbroadcastd         m8, [base+z_filter_k-4+r3*4+12*1]
     vpbroadcastd         m9, [base+z_filter_k-4+r3*4+12*2]
@@ -2650,7 +2660,9 @@
     lea                 r3d, [hq+15]
     sub              angled, 90
     call .filter_strength
+    test                r3d, r3d
     jz .w16_no_filter_above
+    popcnt              r3d, r3d
     vbroadcasti128       m6, [tlq+1]
     mova                xm2, [base+z_filter_s]
     vinserti128          m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67   67 78 89 9a ab bc cd de
@@ -2683,8 +2695,9 @@
     pand                 m0, m8
     pcmpgtb              m0, m9
     pmovmskb            r3d, m0
-    popcnt              r3d, r3d
+    test                r3d, r3d
     jz .w16_main
+    popcnt              r3d, r3d
     vpbroadcastd         m7, [base+z_filter_k-4+r3*4+12*0]
     vpbroadcastd         m8, [base+z_filter_k-4+r3*4+12*1]
     vpbroadcastd         m9, [base+z_filter_k-4+r3*4+12*2]
@@ -3086,7 +3099,6 @@
     mova                xm2, [r4+angleq*8]
     pcmpgtb              m1, m2
     pmovmskb            r5d, m1
-    popcnt              r5d, r5d
     ret
 .h4_no_upsample:
     %assign stack_offset org_stack_offset
@@ -3097,7 +3109,9 @@
     lea            maxbased, [wq+3]
     call .filter_strength
     mov            maxbased, 7
+    test                r5d, r5d
     jz .h4_main ; filter_strength == 0
+    popcnt              r5d, r5d
     vpbroadcastd         m7, [base+pb_7]
     vbroadcasti128       m2, [tlq-14]
     pmaxub               m1, m7, [base+z_filter_s-4]
@@ -3288,7 +3302,9 @@
     test             angled, 0x400
     jnz .h8_no_intra_edge_filter
     call .filter_strength
+    test                r5d, r5d
     jz .h8_main ; filter_strength == 0
+    popcnt              r5d, r5d
     vpbroadcastd        xm6, [base+pb_15]
     pcmpeqb             xm1, xm1
     psubusb             xm6, xm0
@@ -3444,7 +3460,9 @@
     test             angled, 0x400
     jnz .h16_no_intra_edge_filter
     call .filter_strength
+    test                r5d, r5d
     jz .h16_main ; filter_strength == 0
+    popcnt              r5d, r5d
     vpbroadcastd        m11, [base+pb_27]
     vpbroadcastd         m1, [base+pb_1]
     vbroadcasti128       m6, [base+z_filter_s+12]