shithub: dav1d

ref: 77b3b25c636f5e874bdba362bfa8e028e9620931
parent: f90ada0d08e99ccfb676e59b8e3c497e77879915
author: Martin Storsjö <martin@martin.st>
date: Thu Sep 24 07:07:02 EDT 2020

arm32: looprestoration: Fix missed vertical alignment

--- a/src/arm/32/looprestoration.S
+++ b/src/arm/32/looprestoration.S
@@ -40,8 +40,8 @@
         mov             r8,  r5
         vld1.16         {q0},  [r4]
         movw            r9,  #(1 << 14) - (1 << 2)
-        vdup.16         q14,  r9
-        vmov.s16        q15,  #2048
+        vdup.16         q14, r9
+        vmov.s16        q15, #2048
         // Calculate mid_stride
         add             r10, r5,  #7
         bic             r10, r10, #7
@@ -108,8 +108,8 @@
 0:
         // !LR_HAVE_LEFT, fill q1 with the leftmost byte
         // and shift q2 to have 3x the first byte at the front.
-        vdup.8          q1, d4[0]
-        vdup.8          q8, d18[0]
+        vdup.8          q1,  d4[0]
+        vdup.8          q8,  d18[0]
         // Move r2 back to account for the last 3 bytes we loaded before,
         // which we shifted out.
         sub             r2,  r2,  #3
@@ -127,7 +127,7 @@
         bne             4f
         // If we'll need to pad the right edge, load that byte to pad with
         // here since we can find it pretty easily from here.
-        sub             r9,  r5, #14
+        sub             r9,  r5,  #14
         ldrb            r11, [r2, r9]
         ldrb            r9,  [lr, r9]
         // Fill q12/q13 with the right padding pixel
@@ -338,11 +338,11 @@
         vdup.16         d25, d16[3]
         vpadd.s16       d6,  d6,  d6
         vtrn.16         d24, d25
-        vshl.s16        d24, d24,  #7
-        vsub.s16        d24, d24,  d28
-        vqadd.s16       d6,  d6,   d24
-        vshr.s16        d6,  d6,   #3
-        vadd.s16        d6,  d6,   d30
+        vshl.s16        d24, d24, #7
+        vsub.s16        d24, d24, d28
+        vqadd.s16       d6,  d6,  d24
+        vshr.s16        d6,  d6,  #3
+        vadd.s16        d6,  d6,  d30
         vst1.s16        {d6[0]}, [r0,  :16]!
         vst1.s16        {d6[1]}, [r12, :16]!
         subs            r5,  r5,  #1
@@ -422,22 +422,22 @@
         // Interleaving the mul/mla chains actually hurts performance
         // significantly on Cortex A53, thus keeping mul/mla tightly
         // chained like this.
-        vmull.s16       q2,  d16,  d0[0]
-        vmlal.s16       q2,  d18,  d0[1]
-        vmlal.s16       q2,  d20,  d0[2]
-        vmlal.s16       q2,  d22,  d0[3]
-        vmlal.s16       q2,  d24,  d1[0]
-        vmlal.s16       q2,  d26,  d1[1]
-        vmlal.s16       q2,  d28,  d1[2]
-        vmull.s16       q3,  d17,  d0[0]
-        vmlal.s16       q3,  d19,  d0[1]
-        vmlal.s16       q3,  d21,  d0[2]
-        vmlal.s16       q3,  d23,  d0[3]
-        vmlal.s16       q3,  d25,  d1[0]
-        vmlal.s16       q3,  d27,  d1[1]
-        vmlal.s16       q3,  d29,  d1[2]
-        vqrshrun.s32    d4,  q2,   #11
-        vqrshrun.s32    d5,  q3,   #11
+        vmull.s16       q2,  d16, d0[0]
+        vmlal.s16       q2,  d18, d0[1]
+        vmlal.s16       q2,  d20, d0[2]
+        vmlal.s16       q2,  d22, d0[3]
+        vmlal.s16       q2,  d24, d1[0]
+        vmlal.s16       q2,  d26, d1[1]
+        vmlal.s16       q2,  d28, d1[2]
+        vmull.s16       q3,  d17, d0[0]
+        vmlal.s16       q3,  d19, d0[1]
+        vmlal.s16       q3,  d21, d0[2]
+        vmlal.s16       q3,  d23, d0[3]
+        vmlal.s16       q3,  d25, d1[0]
+        vmlal.s16       q3,  d27, d1[1]
+        vmlal.s16       q3,  d29, d1[2]
+        vqrshrun.s32    d4,  q2,  #11
+        vqrshrun.s32    d5,  q3,  #11
         vqmovun.s16     d4,  q2
         vst1.8          {d4}, [r0], r1
 .if \compare
@@ -473,7 +473,7 @@
 52:     // 2 rows in total, q11 already loaded, load q12 with content data
         // and 2 rows of edge.
         vld1.16         {q14}, [r2, :128], r7
-        vmov            q15,  q14
+        vmov            q15, q14
         b               8f
 53:
         // 3 rows in total, q11 already loaded, load q12 and q13 with content
@@ -785,7 +785,7 @@
         bne             4f
         // If we'll need to pad the right edge, load that byte to pad with
         // here since we can find it pretty easily from here.
-        sub             lr,  r5, #(2 + 16 - 2 + 1)
+        sub             lr,  r5,  #(2 + 16 - 2 + 1)
         ldrb            r11, [r3,  lr]
         ldrb            lr,  [r12, lr]
         // Fill q14/q15 with the right padding pixel
@@ -1058,7 +1058,7 @@
         bne             4f
         // If we'll need to pad the right edge, load that byte to pad with
         // here since we can find it pretty easily from here.
-        sub             lr,  r5, #(2 + 16 - 3 + 1)
+        sub             lr,  r5,  #(2 + 16 - 3 + 1)
         ldrb            r11, [r3,  lr]
         ldrb            lr,  [r12, lr]
         // Fill q14/q15 with the right padding pixel
@@ -1100,7 +1100,7 @@
         vaddl_u16_n     q12, q13, d2,  d3,  d16, d17, \w
         vaddl_u16_n     q8,  q9,  d18, d19, d20, d21, \w
         vaddw_u16_n     q12, q13, d22, d23, \w
-        vadd_i32_n      q12, q13, q8,  q9, \w
+        vadd_i32_n      q12, q13, q8,  q9,  \w
         vext.8          q8,  q5,  q6,  #2
         vext.8          q9,  q5,  q6,  #4
         vext.8          q10, q5,  q6,  #6
@@ -1152,7 +1152,7 @@
 
 6:      // Pad the right edge and produce the last few pixels.
         // w < 7, w+1 pixels valid in q0/q4
-        sub             lr,   r5,  #1
+        sub             lr,  r5,  #1
         // lr = pixels valid - 2
         adr             r11, L(box5_variable_shift_tbl)
         ldr             lr,  [r11, lr, lsl #2]