shithub: dav1d

Download patch

ref: 8e00403915e543e6d15e751903265494eb51a94e
parent: 5fe20ec7dd89c88453a61cd47d5d01e49d6cc6c2
author: Martin Storsjö <martin@martin.st>
date: Mon Jun 22 10:25:54 EDT 2020

arm64: ipred: Slightly improve instruction scheduling at the entry of smooth

--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -884,10 +884,10 @@
         lsl             x1,  x1,  #1
         br              x5
 40:
-        sub             x2,  x2,  #4
-        mov             x7,  #-4
         ld1r            {v6.2s}, [x8]             // top
         ld1r            {v7.2s}, [x10]            // weights_hor
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
         dup             v5.16b,  v6.b[3]          // right
         usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
         uxtl            v7.8h,   v7.8b            // weights_hor
@@ -922,10 +922,10 @@
         b.gt            4b
         ret
 80:
-        sub             x2,  x2,  #4
-        mov             x7,  #-4
         ld1             {v6.8b}, [x8]             // top
         ld1             {v7.8b}, [x10]            // weights_hor
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
         dup             v5.16b,  v6.b[7]          // right
         usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
         uxtl            v7.8h,   v7.8b            // weights_hor
--- a/src/arm/64/ipred16.S
+++ b/src/arm/64/ipred16.S
@@ -920,10 +920,10 @@
         lsl             x1,  x1,  #1
         br              x5
 40:
-        sub             x2,  x2,  #8
-        mov             x7,  #-8
         ld1r            {v6.2d}, [x8]             // top
         ld1r            {v7.2s}, [x10]            // weights_hor
+        sub             x2,  x2,  #8
+        mov             x7,  #-8
         dup             v5.8h,   v6.h[3]          // right
         sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
         uxtl            v7.8h,   v7.8b            // weights_hor
@@ -963,10 +963,10 @@
         b.gt            4b
         ret
 80:
-        sub             x2,  x2,  #8
-        mov             x7,  #-8
         ld1             {v6.8h}, [x8]             // top
         ld1             {v7.8b}, [x10]            // weights_hor
+        sub             x2,  x2,  #8
+        mov             x7,  #-8
         dup             v5.8h,   v6.h[7]          // right
         sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
         uxtl            v7.8h,   v7.8b            // weights_hor