shithub: dav1d

Download patch

ref: f481d69b0ffac087504036375d505f4323d7ef5e
parent: 641ef4cc9fa2a12d53be2e75e9690e7b8ff4e605
author: Martin Storsjö <martin@martin.st>
date: Mon Mar 23 10:34:26 EDT 2020

arm64: ipred: Do shifts on only half the register width when possible

In these cases, we only need the value of the first element, since the
dup that follows each shift broadcasts lane 0 to the whole vector.

--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -1717,19 +1717,19 @@
 4:
         ld1r            {v0.2s},  [x2]
         uaddlv          h0,      v0.8b
-        urshr           v0.8h,   v0.8h,   #3
+        urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w4)
 8:
         ld1             {v0.8b},  [x2]
         uaddlv          h0,      v0.8b
-        urshr           v0.8h,   v0.8h,   #3
+        urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w8)
 16:
         ld1             {v0.16b}, [x2]
         uaddlv          h0,      v0.16b
-        urshr           v0.8h,   v0.8h,   #4
+        urshr           v0.4h,   v0.4h,   #4
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w16)
 32:
@@ -1737,7 +1737,7 @@
         uaddlv          h2,      v2.16b
         uaddlv          h3,      v3.16b
         add             v2.4h,   v2.4h,   v3.4h
-        urshr           v2.8h,   v2.8h,   #5
+        urshr           v2.4h,   v2.4h,   #5
         dup             v0.8h,   v2.h[0]
         b               L(ipred_cfl_splat_w16)
 
@@ -1772,7 +1772,7 @@
 L(ipred_cfl_left_h4):
         ld1r            {v0.2s},  [x2]
         uaddlv          h0,      v0.8b
-        urshr           v0.8h,   v0.8h,   #3
+        urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         br              x9
 
@@ -1779,7 +1779,7 @@
 L(ipred_cfl_left_h8):
         ld1             {v0.8b},  [x2]
         uaddlv          h0,      v0.8b
-        urshr           v0.8h,   v0.8h,   #3
+        urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         br              x9
 
@@ -1786,7 +1786,7 @@
 L(ipred_cfl_left_h16):
         ld1             {v0.16b}, [x2]
         uaddlv          h0,      v0.16b
-        urshr           v0.8h,   v0.8h,   #4
+        urshr           v0.4h,   v0.4h,   #4
         dup             v0.8h,   v0.h[0]
         br              x9
 
@@ -1795,7 +1795,7 @@
         uaddlv          h2,      v2.16b
         uaddlv          h3,      v3.16b
         add             v2.4h,   v2.4h,   v3.4h
-        urshr           v2.8h,   v2.8h,   #5
+        urshr           v2.4h,   v2.4h,   #5
         dup             v0.8h,   v2.h[0]
         br              x9