shithub: dav1d

ref: 77b3b25c636f5e874bdba362bfa8e028e9620931
parent: f90ada0d08e99ccfb676e59b8e3c497e77879915
author: Martin Storsjö <martin@martin.st>
date: Thu Sep 24 07:07:02 EDT 2020

arm32: looprestoration: Fix missed vertical alignment

--- a/src/arm/32/looprestoration.S
+++ b/src/arm/32/looprestoration.S
@@ -40,8 +40,8 @@
         mov             r8,  r5
         vld1.16         {q0},  [r4]
         movw            r9,  #(1 << 14) - (1 << 2)
-        vdup.16         q14,  r9
-        vmov.s16        q15,  #2048
+        vdup.16         q14, r9
+        vmov.s16        q15, #2048
         // Calculate mid_stride
         add             r10, r5,  #7
         bic             r10, r10, #7
@@ -108,8 +108,8 @@
 0:
         // !LR_HAVE_LEFT, fill q1 with the leftmost byte
         // and shift q2 to have 3x the first byte at the front.
-        vdup.8          q1, d4[0]
-        vdup.8          q8, d18[0]
+        vdup.8          q1,  d4[0]
+        vdup.8          q8,  d18[0]
         // Move r2 back to account for the last 3 bytes we loaded before,
         // which we shifted out.
         sub             r2,  r2,  #3
@@ -127,7 +127,7 @@
         bne             4f
         // If we'll need to pad the right edge, load that byte to pad with
         // here since we can find it pretty easily from here.
-        sub             r9,  r5, #14
+        sub             r9,  r5,  #14
         ldrb            r11, [r2, r9]
         ldrb            r9,  [lr, r9]
         // Fill q12/q13 with the right padding pixel
@@ -338,11 +338,11 @@
         vdup.16         d25, d16[3]
         vpadd.s16       d6,  d6,  d6
         vtrn.16         d24, d25
-        vshl.s16        d24, d24,  #7
-        vsub.s16        d24, d24,  d28
-        vqadd.s16       d6,  d6,   d24
-        vshr.s16        d6,  d6,   #3
-        vadd.s16        d6,  d6,   d30
+        vshl.s16        d24, d24, #7
+        vsub.s16        d24, d24, d28
+        vqadd.s16       d6,  d6,  d24
+        vshr.s16        d6,  d6,  #3
+        vadd.s16        d6,  d6,  d30
         vst1.s16        {d6[0]}, [r0,  :16]!
         vst1.s16        {d6[1]}, [r12, :16]!
         subs            r5,  r5,  #1
@@ -422,22 +422,22 @@
         // Interleaving the mul/mla chains actually hurts performance
         // significantly on Cortex A53, thus keeping mul/mla tightly
         // chained like this.
-        vmull.s16       q2,  d16,  d0[0]
-        vmlal.s16       q2,  d18,  d0[1]
-        vmlal.s16       q2,  d20,  d0[2]
-        vmlal.s16       q2,  d22,  d0[3]
-        vmlal.s16       q2,  d24,  d1[0]
-        vmlal.s16       q2,  d26,  d1[1]
-        vmlal.s16       q2,  d28,  d1[2]
-        vmull.s16       q3,  d17,  d0[0]
-        vmlal.s16       q3,  d19,  d0[1]
-        vmlal.s16       q3,  d21,  d0[2]
-        vmlal.s16       q3,  d23,  d0[3]
-        vmlal.s16       q3,  d25,  d1[0]
-        vmlal.s16       q3,  d27,  d1[1]
-        vmlal.s16       q3,  d29,  d1[2]
-        vqrshrun.s32    d4,  q2,   #11
-        vqrshrun.s32    d5,  q3,   #11
+        vmull.s16       q2,  d16, d0[0]
+        vmlal.s16       q2,  d18, d0[1]
+        vmlal.s16       q2,  d20, d0[2]
+        vmlal.s16       q2,  d22, d0[3]
+        vmlal.s16       q2,  d24, d1[0]
+        vmlal.s16       q2,  d26, d1[1]
+        vmlal.s16       q2,  d28, d1[2]
+        vmull.s16       q3,  d17, d0[0]
+        vmlal.s16       q3,  d19, d0[1]
+        vmlal.s16       q3,  d21, d0[2]
+        vmlal.s16       q3,  d23, d0[3]
+        vmlal.s16       q3,  d25, d1[0]
+        vmlal.s16       q3,  d27, d1[1]
+        vmlal.s16       q3,  d29, d1[2]
+        vqrshrun.s32    d4,  q2,  #11
+        vqrshrun.s32    d5,  q3,  #11
         vqmovun.s16     d4,  q2
         vst1.8          {d4}, [r0], r1
 .if \compare
@@ -473,7 +473,7 @@
 52:     // 2 rows in total, q11 already loaded, load q12 with content data
         // and 2 rows of edge.
         vld1.16         {q14}, [r2, :128], r7
-        vmov            q15,  q14
+        vmov            q15, q14
         b               8f
 53:
         // 3 rows in total, q11 already loaded, load q12 and q13 with content
@@ -785,7 +785,7 @@
         bne             4f
         // If we'll need to pad the right edge, load that byte to pad with
         // here since we can find it pretty easily from here.
-        sub             lr,  r5, #(2 + 16 - 2 + 1)
+        sub             lr,  r5,  #(2 + 16 - 2 + 1)
         ldrb            r11, [r3,  lr]
         ldrb            lr,  [r12, lr]
         // Fill q14/q15 with the right padding pixel
@@ -1058,7 +1058,7 @@
         bne             4f
         // If we'll need to pad the right edge, load that byte to pad with
         // here since we can find it pretty easily from here.
-        sub             lr,  r5, #(2 + 16 - 3 + 1)
+        sub             lr,  r5,  #(2 + 16 - 3 + 1)
         ldrb            r11, [r3,  lr]
         ldrb            lr,  [r12, lr]
         // Fill q14/q15 with the right padding pixel
@@ -1100,7 +1100,7 @@
         vaddl_u16_n     q12, q13, d2,  d3,  d16, d17, \w
         vaddl_u16_n     q8,  q9,  d18, d19, d20, d21, \w
         vaddw_u16_n     q12, q13, d22, d23, \w
-        vadd_i32_n      q12, q13, q8,  q9, \w
+        vadd_i32_n      q12, q13, q8,  q9,  \w
         vext.8          q8,  q5,  q6,  #2
         vext.8          q9,  q5,  q6,  #4
         vext.8          q10, q5,  q6,  #6
@@ -1152,7 +1152,7 @@
 
 6:      // Pad the right edge and produce the last few pixels.
         // w < 7, w+1 pixels valid in q0/q4
-        sub             lr,   r5,  #1
+        sub             lr,  r5,  #1
         // lr = pixels valid - 2
         adr             r11, L(box5_variable_shift_tbl)
         ldr             lr,  [r11, lr, lsl #2]