shithub: dav1d

ref: a4950bce9a467c1319420da8fa2e173ebce9aec5
parent: 490a1420f34765f6b1aa9610e23aea247bec2dcc
author: Martin Storsjö <martin@martin.st>
date: Sat Sep 28 20:43:54 EDT 2019

arm64: itx: Use smull+smlal instead of addl+mul

Even though smull+smlal does two multiplications instead of one,
the combination seems to be handled better by actual cores than the
addl+mul sequence it replaces.

Before:                                 Cortex A53      A72      A73
inv_txfm_add_8x8_adst_adst_1_8bpc_neon:      356.0    279.2    278.0
inv_txfm_add_16x16_adst_adst_2_8bpc_neon:   1785.0   1329.5   1308.8
After:
inv_txfm_add_8x8_adst_adst_1_8bpc_neon:      360.0    253.2    269.3
inv_txfm_add_16x16_adst_adst_2_8bpc_neon:   1793.1   1300.9   1254.0

(In these particular cases there seems to be a minor regression on
A53, which is probably due more to having to reorder some
instructions, since smull+smlal+smull2+smlal2 overwrites the second
output register sooner than addl+addl2 would have, but in general,
smull+smlal seems to be equally good or better than addl+mul on A53
as well.)
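
As a rough sketch of the rewrite (register numbers are only examples;
the .8h paths do the same thing with saddl2/mul and smull2/smlal2),
the removed saddl_sz+mul_4s_sz pattern and its smull_smlal
replacement correspond to:

        // Before: widen the sum, then multiply by the constant in place
        saddl           v18.4s, v2.4h,  v4.4h      // v18 = v2 + v4, widened to 32 bit
        mul             v18.4s, v18.4s, v0.s[0]    // v18 = (v2 + v4) * (2896 >> 4)

        // After: two widening multiply-accumulates with the same constant
        smull           v18.4s, v2.4h,  v0.h[0]    // v18 = v2 * (2896 >> 4), widened
        smlal           v18.4s, v4.4h,  v0.h[0]    // v18 += v4 * (2896 >> 4)

(The ssubl_sz cases map to smull_smlsl in the same way, with smlsl
subtracting the second product instead of adding it.)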

--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -148,27 +148,6 @@
 .endif
 .endm
 
-.macro saddl_sz d0, d1, s0, s1, sz
-        saddl           \d0\().4s,  \s0\().4h,  \s1\().4h
-.ifc \sz, .8h
-        saddl2          \d1\().4s,  \s0\().8h,  \s1\().8h
-.endif
-.endm
-
-.macro ssubl_sz d0, d1, s0, s1, sz
-        ssubl           \d0\().4s,  \s0\().4h,  \s1\().4h
-.ifc \sz, .8h
-        ssubl2          \d1\().4s,  \s0\().8h,  \s1\().8h
-.endif
-.endm
-
-.macro mul_4s_sz d0, d1, s0, s1, c, sz
-        mul             \d0\().4s,  \s0\().4s,  \c
-.ifc \sz, .8h
-        mul             \d1\().4s,  \s1\().4s,  \c
-.endif
-.endm
-
 .macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7
         sqrdmulh        \r0\sz,  \r0\sz,  \c
         sqrdmulh        \r1\sz,  \r1\sz,  \c
@@ -865,21 +844,15 @@
         sqsub           v5\sz,     v5\sz, v19\sz // t7
         sqneg           \o1\()\sz, \o1\()\sz     // out1
 
-        movi            v0.4s,  #2896>>4
+        movi            v0.4h,  #2896>>4
 
-        saddl_sz        v18, v19, v2,  v4,  \sz // -> out3 (v19 or v20)
-        ssubl_sz        v6,  v7,  v2,  v4,  \sz // -> out4 (v20 or v19)
-        ssubl_sz        v20, v21, v3,  v5,  \sz // -> out5 (v21 or v18)
-        saddl_sz        v4,  v5,  v3,  v5,  \sz // -> out2 (v18 or v21)
-
-        mul_4s_sz       v18, v19, v18, v19, v0.s[0], \sz
-        mul_4s_sz       v6,  v7,  v6,  v7,  v0.s[0], \sz
-        mul_4s_sz       v20, v21, v20, v21, v0.s[0], \sz
-        mul_4s_sz       v4,  v5,  v4,  v5,  v0.s[0], \sz
-
+        smull_smlal     v18, v19, v2,  v4,  v0.h[0], v0.h[0], \sz // -> out3 (v19 or v20)
+        smull_smlsl     v6,  v7,  v2,  v4,  v0.h[0], v0.h[0], \sz // -> out4 (v20 or v19)
+        smull_smlsl     v20, v21, v3,  v5,  v0.h[0], v0.h[0], \sz // -> out5 (v21 or v18)
         rshrn_sz        v2,  v18, v19, #8,  \sz // out3
+        smull_smlal     v18, v19, v3,  v5,  v0.h[0], v0.h[0], \sz // -> out2 (v18 or v21)
         rshrn_sz        v3,  v20, v21, #8,  \sz // out5
-        rshrn_sz        \o2, v4,  v5,  #8,  \sz // out2 (v18 or v21)
+        rshrn_sz        \o2, v18, v19, #8,  \sz // out2 (v18 or v21)
         rshrn_sz        \o4, v6,  v7,  #8,  \sz // out4 (v20 or v19)
 
         sqneg           \o3\()\sz, v2\sz     // out3
@@ -1310,32 +1283,22 @@
         sqsub           v23\sz,  v25\sz,  v23\sz // t7
         sqneg           \o3\sz,  \o3\sz          // out3
 
-        movi            v0.4s,  #2896>>4
+        movi            v0.4h,  #2896>>4
 
-        ssubl_sz        v24, v25, v2,  v21, \sz // -> out8 (v24 or v23)
-        saddl_sz        v4,  v5,  v2,  v21, \sz // -> out7 (v23 or v24)
-        saddl_sz        v6,  v7,  v26, v3,  \sz // -> out5 (v21 or v26)
-        ssubl_sz        v2,  v3,  v26, v3,  \sz // -> out10 (v26 or v21)
+        smull_smlsl     v24, v25, v2,  v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23)
+        smull_smlal     v4,  v5,  v2,  v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24)
+        smull_smlal     v6,  v7,  v26, v3,  v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26)
 
-        mul_4s_sz       v24, v25, v24, v25, v0.s[0], \sz
-        mul_4s_sz       v4,  v5,  v4,  v5,  v0.s[0], \sz
-        mul_4s_sz       v6,  v7,  v6,  v7,  v0.s[0], \sz
-        mul_4s_sz       v2,  v3,  v2,  v3,  v0.s[0], \sz
-
         rshrn_sz        v24, v24, v25, #8,  \sz // out8
         rshrn_sz        v4,  v4,  v5,  #8,  \sz // out7
         rshrn_sz        v5,  v6,  v7,  #8,  \sz // out5
-        rshrn_sz        v26, v2,  v3,  #8,  \sz // out10
+        smull_smlsl     v6,  v7,  v26, v3,  v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21)
+        smull_smlal     v2,  v3,  v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27)
+        rshrn_sz        v26, v6,  v7,  #8,  \sz // out10
 
-        saddl_sz        v2,  v3,  v22, v23, \sz // -> out4 (v20 or v27)
-        ssubl_sz        v6,  v7,  v22, v23, \sz // -> out11 (v27 or v20)
-        saddl_sz        v22, v23, v27, v20, \sz // -> out6 (v22 or v25)
-        ssubl_sz        v21, v25, v27, v20, \sz // -> out9 (v25 or v22)
-
-        mul_4s_sz       v2,  v3,  v2,  v3,  v0.s[0], \sz
-        mul_4s_sz       v6,  v7,  v6,  v7,  v0.s[0], \sz
-        mul_4s_sz       v22, v23, v22, v23, v0.s[0], \sz
-        mul_4s_sz       v21, v25, v21, v25, v0.s[0], \sz
+        smull_smlsl     v6,  v7,  v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20)
+        smull_smlal     v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25)
+        smull_smlsl     v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22)
 
         rshrn_sz        \o4, v2,  v3,  #8,  \sz // out4
         rshrn_sz        v6,  v6,  v7,  #8,  \sz // out11