ref: bf920fba5782a8b272b44792df0942c211ec5886
parent: f64fdae55128ff1c2204f578ee26b6d577862b26
author: Martin Storsjö <martin@martin.st>
date: Sun May 19 17:10:55 EDT 2019

arm: mc: Fix 8tap_v w8 with OBMC 3/4 heights

Also make sure that the w4 case can exit after processing 12 pixels,
where it is convenient.

This gives a small slowdown for in-order cores like A7, A8, A53, but
actually seems to give a small speedup for out-of-order cores like
A9, A72 and A73.

AArch64:
Before:                      Cortex A53     A72     A73
mc_8tap_regular_w8_v_8bpc_neon:   223.8   247.3   228.5
After:
mc_8tap_regular_w8_v_8bpc_neon:   232.5   243.9   223.4

AArch32:
Before:                       Cortex A7      A8      A9     A53     A72     A73
mc_8tap_regular_w8_v_8bpc_neon:   550.2   470.7   520.5   257.0   256.4   248.2
After:
mc_8tap_regular_w8_v_8bpc_neon:   554.3   474.2   511.6   267.5   252.6   246.8
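
To illustrate the fix: OBMC produces blocks at 3/4 of the usual height,
so the w8 vertical path can be asked for an 8x6 block, but the old loop
tail subtracted the height counter in a 4-row step with the exit check
only afterwards, so a 6-row block would store rows past its end. Below
is a rough, hypothetical C model of the patched control flow, not dav1d
code; process_rows() stands in for the real NEON
load/uxtl/mul_mla/shift_store sequence:

#include <stdio.h>

static void process_rows(int n) { printf("filter and store %d rows\n", n); }

static void filter_v_w8(int h) {     /* h = 6 for an 8x6 OBMC block */
    h -= 2; process_rows(2);         /* rows 0-1 */
    if (h <= 0) return;
    h -= 2; process_rows(2);         /* rows 2-3 */
    if (h <= 0) return;
    /* Before the patch the next step was "h -= 4; process_rows(4);",
     * which for the 2 remaining rows of an 8x6 block would store two
     * rows too many before its exit check. */
    h -= 2; process_rows(2);         /* rows 4-5: an 8x6 block exits here */
    if (h <= 0) return;
    h -= 2; process_rows(2);
    if (h <= 0) return;
    h -= 4; process_rows(4);         /* taller blocks continue in 4-row steps */
    /* ... remaining rows handled the same way ... */
}

int main(void) { filter_v_w8(6); return 0; }

The w4 change is the same principle: turning "b 48b" into "bgt 48b"
(and "b.gt 48b" on AArch64) makes the loop branch conditional on the
height counter, so the 4-wide case can fall through after processing
12 pixels instead of looping unconditionally.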

--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -1112,7 +1112,7 @@
         vmovl_u8        q3,  d6,  q4,  d8,  q8,  d16, q9, d18
         mul_mla_8_2     q12, q13, q12, q13, q14, q15, q1,  q2,  q3,  q4,  q8,  q9
         shift_store_4   \type, \d_strd, q12, d24, d25, q13, d26, d27
-        b               48b
+        bgt             48b
 0:
         vpop            {q4}
         pop             {r4-r11,pc}
@@ -1145,7 +1145,7 @@
 0:
         pop             {r4-r11,pc}
 
-880:    // 8x8, 8x16, 8x32 v
+880:    // 8x6, 8x8, 8x16, 8x32 v
 1680:   // 16x8, 16x16, ...
 320:    // 32x8, 32x16, ...
 640:
@@ -1178,12 +1178,17 @@
         mul_mla_8_1     q3,  q4,  q3,  q4,  q8,  q9,  q10, q11, q12, q13, q14
         shift_store_8   \type, \d_strd, q3,  d6,  q4,  d8
         ble             9f
-        subs            \h,  \h,  #4
-        load_reg        \sr2, \src, \s_strd, d30, d2,  d4,  d6
-        vmovl_u8        q15, d30, q1,  d2,  q2,  d4,  q3,  d6
+        subs            \h,  \h,  #2
+        load_reg        \sr2, \src, \s_strd, d30, d2
+        vmovl_u8        q15, d30, q1,  d2
         mul_mla_8_1     q8,  q9,  q8,  q9,  q10, q11, q12, q13, q14, q15, q1
+        shift_store_8   \type, \d_strd, q8,  d16, q9,  d18
+        ble             9f
+        subs            \h,  \h,  #2
+        load_reg        \sr2, \src, \s_strd, d4,  d6
+        vmovl_u8        q2,  d4,  q3,  d6
         mul_mla_8_1     q10, q11, q10, q11, q12, q13, q14, q15, q1,  q2,  q3
-        shift_store_8   \type, \d_strd, q8,  d16, q9,  d18, q10, d20, q11, d22
+        shift_store_8   \type, \d_strd, q10, d20, q11, d22
         ble             9f
         subs            \h,  \h,  #4
         load_reg        \sr2, \src, \s_strd, d8,  d16, d18, d20
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1119,7 +1119,7 @@
         uxtl_b          v18, v19, v20, v21
         mul_mla_8_2     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
         shift_store_4   \type, \d_strd, v1, v2
-        b               48b
+        b.gt            48b
 0:
         ret
 
@@ -1151,7 +1151,7 @@
 0:
         ret
 
-880:    // 8x8, 8x16, 8x32 v
+880:    // 8x6, 8x8, 8x16, 8x32 v
 1680:   // 16x8, 16x16, ...
 320:    // 32x8, 32x16, ...
 640:
@@ -1183,12 +1183,17 @@
         mul_mla_8_1     v3,  v4,  v18, v19, v20, v21, v22, v23, v24, v25, v26
         shift_store_8   \type, \d_strd, v3, v4
         b.le            9f
-        subs            \h,  \h,  #4
-        load_8b         \sr2, \src, \s_strd, v27, v16, v17, v18
-        uxtl_b          v27, v16, v17, v18
+        subs            \h,  \h,  #2
+        load_8b         \sr2, \src, \s_strd, v27, v16
+        uxtl_b          v27, v16
         mul_mla_8_1     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16
+        shift_store_8   \type, \d_strd, v1, v2
+        b.le            9f
+        subs            \h,  \h,  #2
+        load_8b         \sr2, \src, \s_strd, v17, v18
+        uxtl_b          v17, v18
         mul_mla_8_1     v3,  v4,  v22, v23, v24, v25, v26, v27, v16, v17, v18
-        shift_store_8   \type, \d_strd, v1, v2, v3, v4
+        shift_store_8   \type, \d_strd, v3, v4
         b.le            9f
         subs            \h,  \h,  #4
         load_8b         \sr2, \src, \s_strd, v19, v20, v21, v22