shithub: dav1d

ref: 74d5cf57599f423c3d38d091b9a95ae245d89235
parent: 72db660742c4c31a1a39f470c14fc24fefce361a
author: Martin Storsjö <martin@martin.st>
date: Tue Jun 23 09:33:11 EDT 2020

arm32: ipred: Mark a few more loads as aligned

This speeds things up a bit on older cores.
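
For context (not part of the patch): in GNU assembler NEON syntax, a
load or store can assert the alignment of its address with a ":<bits>"
qualifier after the base register. A minimal sketch, assuming r2 holds
a pointer that really is aligned as promised:

        @ No alignment assumption; works for any address.
        vld1.8          {d0},  [r2]
        @ Asserts r2 is 64-bit aligned; older cores can issue the
        @ access faster, but it faults if r2 is actually misaligned.
        vld1.8          {d0},  [r2, :64]
        @ Wider transfers can promise wider alignment.
        vld1.8          {d0,  d1},  [r2, :128]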

Also, in ipred_v_w4, do a load that duplicates the input over the
whole register instead of loading just a single lane. This can be a
bit faster on Cortex A8.
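
For reference (again not part of the patch), a sketch of the two load
forms the message refers to:

        @ Single-lane load: writes only lane 0 of d0, so the result
        @ merges with the old register contents, creating a
        @ dependency on the register's previous value.
        vld1.32         {d0[0]},  [r2]
        @ All-lanes load: duplicates the 32-bit element across d0,
        @ with no dependency on what d0 held before.
        vld1.32         {d0[]},   [r2]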

Before:                         Cortex A7      A8      A9     A53    A72     A73
intra_pred_v_w4_8bpc_neon:           54.0    38.4    46.4    47.7   20.4    18.1
intra_pred_h_w4_8bpc_neon:           66.3    43.1    55.0    57.0   27.9    22.2
intra_pred_h_w8_8bpc_neon:           81.0    60.2    76.7    66.5   31.1    30.1
intra_pred_dc_left_w4_8bpc_neon:     91.0    49.0    72.8    77.7   35.4    38.5
intra_pred_dc_left_w8_8bpc_neon:    103.8    73.5    90.2    84.7   42.8    47.1
intra_pred_dc_left_w16_8bpc_neon:   156.1   101.8   186.1   119.4   77.7    92.6
intra_pred_dc_left_w32_8bpc_neon:   270.5   200.5   381.6   191.7  152.6   170.3
intra_pred_dc_left_w64_8bpc_neon:   560.7   439.1   877.0   375.4  333.5   343.6

After:
intra_pred_v_w4_8bpc_neon:           53.9    38.0    46.4    47.7   19.8    19.2
intra_pred_h_w4_8bpc_neon:           66.5    39.2    52.6    57.0   27.7    22.2
intra_pred_h_w8_8bpc_neon:           80.5    55.8    72.9    66.5   31.4    30.1
intra_pred_dc_left_w4_8bpc_neon:     91.0    48.2    71.8    77.7   34.9    38.6
intra_pred_dc_left_w8_8bpc_neon:    103.8    69.6    89.2    84.7   43.2    47.3
intra_pred_dc_left_w16_8bpc_neon:   182.3    99.9   184.9   118.8   77.7    85.8
intra_pred_dc_left_w32_8bpc_neon:   355.4   198.9   380.1   190.6  152.9   161.0
intra_pred_dc_left_w64_8bpc_neon:   517.5   437.4   876.9   375.7  333.3   347.7

--- a/src/arm/32/ipred.S
+++ b/src/arm/32/ipred.S
@@ -132,7 +132,7 @@
         .word 80f  - L(ipred_v_tbl) + CONFIG_THUMB
         .word 40f  - L(ipred_v_tbl) + CONFIG_THUMB
 40:
-        vld1.32         {d0[0]},  [r2]
+        vld1.32         {d0[]},   [r2]
 4:
         vst1.32         {d0[0]},  [r0,  :32], r1
         vst1.32         {d0[0]},  [r12, :32], r1
@@ -215,7 +215,7 @@
         .word 8f   - L(ipred_h_tbl) + CONFIG_THUMB
         .word 4f   - L(ipred_h_tbl) + CONFIG_THUMB
 4:
-        vld4.8          {d0[],  d1[],  d2[],  d3[]},  [r2],  lr
+        vld4.8          {d0[],  d1[],  d2[],  d3[]},  [r2, :32],  lr
         vst1.32         {d3[0]},  [r0,  :32], r1
         vst1.32         {d2[0]},  [r12, :32], r1
         subs            r4,  r4,  #4
@@ -224,7 +224,7 @@
         bgt             4b
         pop             {r4-r5, pc}
 8:
-        vld4.8          {d0[],  d1[],  d2[],  d3[]},  [r2],  lr
+        vld4.8          {d0[],  d1[],  d2[],  d3[]},  [r2, :32],  lr
         vst1.8          {d3},  [r0,  :64], r1
         vst1.8          {d2},  [r12, :64], r1
         subs            r4,  r4,  #4
@@ -453,7 +453,7 @@
         .word L(ipred_dc_left_w4)  - L(ipred_dc_left_tbl) + CONFIG_THUMB
 
 L(ipred_dc_left_h4):
-        vld1.32         {d0[]},  [r2]
+        vld1.32         {d0[]},  [r2, :32]
         vpaddl.u8       d0,  d0
         vpadd.u16       d0,  d0
         vrshrn.u16      d0,  q0,  #2
@@ -468,7 +468,7 @@
         bgt             L(ipred_dc_left_w4)
         pop             {r4-r5, pc}
 L(ipred_dc_left_h8):
-        vld1.8          {d0},  [r2]
+        vld1.8          {d0},  [r2, :64]
         vpaddl.u8       d0,  d0
         vpadd.u16       d0,  d0
         vpadd.u16       d0,  d0
@@ -484,7 +484,7 @@
         bgt             L(ipred_dc_left_w8)
         pop             {r4-r5, pc}
 L(ipred_dc_left_h16):
-        vld1.8          {d0,  d1},  [r2]
+        vld1.8          {d0,  d1},  [r2, :128]
         vaddl.u8        q0,  d0,  d1
         vadd.u16        d0,  d0,  d1
         vpadd.u16       d0,  d0
@@ -501,7 +501,7 @@
         bgt             L(ipred_dc_left_w16)
         pop             {r4-r5, pc}
 L(ipred_dc_left_h32):
-        vld1.8          {d0,  d1,  d2,  d3},  [r2]
+        vld1.8          {d0,  d1,  d2,  d3},  [r2, :128]
         vaddl.u8        q0,  d0,  d1
         vaddl.u8        q1,  d2,  d3
         vadd.u16        q0,  q0,  q1
@@ -522,8 +522,8 @@
         bgt             1b
         pop             {r4-r5, pc}
 L(ipred_dc_left_h64):
-        vld1.8          {d0,  d1,  d2,  d3},  [r2]!
-        vld1.8          {d4,  d5,  d6,  d7},  [r2]
+        vld1.8          {d0,  d1,  d2,  d3},  [r2, :128]!
+        vld1.8          {d4,  d5,  d6,  d7},  [r2, :128]
         vaddl.u8        q0,  d0,  d1
         vaddl.u8        q1,  d2,  d3
         vaddl.u8        q2,  d4,  d5
@@ -599,13 +599,13 @@
         .word L(ipred_dc_w4)  - L(ipred_dc_tbl) + CONFIG_THUMB
 
 L(ipred_dc_h4):
-        vld1.32         {d0[0]},  [r2]!
+        vld1.32         {d0[]},  [r2, :32]!
         vpaddl.u8       d0,  d0
         vpadd.u16       d0,  d0
         bx              r3
 L(ipred_dc_w4):
         add             r2,  r2,  #1
-        vld1.32         {d1[0]},  [r2]
+        vld1.32         {d1[]},  [r2]
         vmov.32         d1[1],  r6
         vadd.s16        d0,  d0,  d30
         vpaddl.u8       d1,  d1
@@ -634,7 +634,7 @@
         pop             {r4-r6, pc}
 
 L(ipred_dc_h8):
-        vld1.8          {d0},  [r2]!
+        vld1.8          {d0},  [r2, :64]!
         vpaddl.u8       d0,  d0
         vpadd.u16       d0,  d0
         vpadd.u16       d0,  d0
@@ -669,7 +669,7 @@
         pop             {r4-r6, pc}
 
 L(ipred_dc_h16):
-        vld1.8          {d0,  d1},  [r2]!
+        vld1.8          {d0,  d1},  [r2, :128]!
         vaddl.u8        q0,  d0,  d1
         vadd.u16        d0,  d0,  d1
         vpadd.u16       d0,  d0
@@ -706,7 +706,7 @@
         pop             {r4-r6, pc}
 
 L(ipred_dc_h32):
-        vld1.8          {d0,  d1,  d2,  d3},  [r2]!
+        vld1.8          {d0,  d1,  d2,  d3},  [r2, :128]!
         vaddl.u8        q0,  d0,  d1
         vaddl.u8        q1,  d2,  d3
         vadd.u16        q0,  q0,  q1
@@ -751,9 +751,9 @@
         pop             {r4-r6, pc}
 
 L(ipred_dc_h64):
-        vld1.8          {d0,  d1,  d2,  d3},  [r2]!
+        vld1.8          {d0,  d1,  d2,  d3},  [r2, :128]!
         vaddl.u8        q0,  d0,  d1
-        vld1.8          {d4,  d5,  d6,  d7},  [r2]!
+        vld1.8          {d4,  d5,  d6,  d7},  [r2, :128]!
         vaddl.u8        q1,  d2,  d3
         vaddl.u8        q2,  d4,  d5
         vaddl.u8        q3,  d6,  d7