shithub: dav1d

ref: e2702eaf5f13d5f93be75084a5bfecc77a67c001
parent: c0e1988b0118531fbe264a3e6143ca9cc2e311fc
author: Martin Storsjö <martin@martin.st>
date: Mon Sep 2 19:13:09 EDT 2019

arm64: itx: Do the final calculation of adst4/adst8/adst16 in 32 bit to avoid too narrow clipping

See issue #295; this fixes it for arm64.

Before:                                 Cortex A53      A72      A73
inv_txfm_add_4x4_adst_adst_1_8bpc_neon:      103.0     63.2     65.2
inv_txfm_add_4x8_adst_adst_1_8bpc_neon:      197.0    145.0    134.2
inv_txfm_add_8x8_adst_adst_1_8bpc_neon:      332.0    248.0    247.1
inv_txfm_add_16x16_adst_adst_2_8bpc_neon:   1676.8   1197.0   1186.8
After:                                  Cortex A53      A72      A73
inv_txfm_add_4x4_adst_adst_1_8bpc_neon:      103.0     76.4     67.0
inv_txfm_add_4x8_adst_adst_1_8bpc_neon:      205.0    155.0    143.8
inv_txfm_add_8x8_adst_adst_1_8bpc_neon:      358.0    269.0    276.2
inv_txfm_add_16x16_adst_adst_2_8bpc_neon:   1785.2   1347.8   1312.1

This is probably only needed for adst in the first pass, but as we
currently don't differentiate transforms between the first and
second pass, splitting the implementations would add code complexity
that isn't necessarily worth it (the speedup over the C code is
still 8-10x).
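
For context beyond the commit message: the old adst4 code formed the
sum in0 - in2 + in3 in 16 bits (plain sub/add, which wrap) and then
scaled it with the saturating 16-bit sqrdmulh, so intermediates
outside int16_t range were clipped too early. The patch widens first
and multiplies in 32 bits. A rough scalar C sketch of the out2 path
(hypothetical function names, not dav1d's actual C code; both
variants apply the same net scale of 3344/2^12):

#include <stdint.h>

/* Before: the 16-bit sum (sub/add v3.4h) can wrap; sqrdmulh by
 * 3344*8 computes sat(round(sum * 3344*8 / 2^15)). */
static int16_t adst4_out2_16bit(int16_t x0, int16_t x2, int16_t x3) {
    int16_t sum = (int16_t)(x0 - x2 + x3);            /* may wrap */
    int32_t r = (2 * (int32_t)sum * (3344 * 8) + (1 << 15)) >> 16;
    return (int16_t)(r > INT16_MAX ? INT16_MAX :
                     r < INT16_MIN ? INT16_MIN : r);
}

/* After: widen first (ssubl/saddw), multiply by 3344 in 32 bits
 * (mul by v0.s[2]), then round-shift-narrow (rshrn #12). */
static int16_t adst4_out2_32bit(int16_t x0, int16_t x2, int16_t x3) {
    int32_t sum = (int32_t)x0 - x2 + x3;              /* full range */
    return (int16_t)((sum * 3344 + (1 << 11)) >> 12);
}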

--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -98,7 +98,8 @@
 endconst
 
 const iadst4_coeffs, align=4
-        .short          1321, 3803, 2482, 3344, 3344*8
+        // .h[4-5] can be interpreted as .s[2]
+        .short          1321, 3803, 2482, 3344, 3344, 0
 endconst
 
 const iadst8_coeffs, align=4
@@ -147,6 +148,27 @@
 .endif
 .endm
 
+.macro saddl_sz d0, d1, s0, s1, sz
+        saddl           \d0\().4s,  \s0\().4h,  \s1\().4h
+.ifc \sz, .8h
+        saddl2          \d1\().4s,  \s0\().8h,  \s1\().8h
+.endif
+.endm
+
+.macro ssubl_sz d0, d1, s0, s1, sz
+        ssubl           \d0\().4s,  \s0\().4h,  \s1\().4h
+.ifc \sz, .8h
+        ssubl2          \d1\().4s,  \s0\().8h,  \s1\().8h
+.endif
+.endm
+
+.macro mul_4s_sz d0, d1, s0, s1, c, sz
+        mul             \d0\().4s,  \s0\().4s,  \c
+.ifc \sz, .8h
+        mul             \d1\().4s,  \s1\().4s,  \c
+.endif
+.endm
+
 .macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7
         sqrdmulh        \r0\sz,  \r0\sz,  \c
         sqrdmulh        \r1\sz,  \r1\sz,  \c
@@ -499,23 +521,24 @@
         movrel          x16, iadst4_coeffs
         ld1             {v0.8h}, [x16]
 
-        sub             v3.4h,   v16.4h,  v18.4h
+        ssubl           v3.4s,   v16.4h,  v18.4h
         smull           v4.4s,   v16.4h,  v0.h[0]
         smlal           v4.4s,   v18.4h,  v0.h[1]
         smlal           v4.4s,   v19.4h,  v0.h[2]
         smull           v7.4s,   v17.4h,  v0.h[3]
-        add             v3.4h,   v3.4h,   v19.4h
+        saddw           v3.4s,   v3.4s,   v19.4h
         smull           v5.4s,   v16.4h,  v0.h[2]
         smlsl           v5.4s,   v18.4h,  v0.h[0]
         smlsl           v5.4s,   v19.4h,  v0.h[1]
 
         add             \o3\().4s, v4.4s,     v5.4s
-        sqrdmulh        \o2\().4h, v3.4h,     v0.h[4]
+        mul             \o2\().4s, v3.4s,     v0.s[2]
         add             \o0\().4s, v4.4s,     v7.4s
         add             \o1\().4s, v5.4s,     v7.4s
         sub             \o3\().4s, \o3\().4s, v7.4s
 
         rshrn           \o0\().4h, \o0\().4s, #12
+        rshrn           \o2\().4h, \o2\().4s, #12
         rshrn           \o1\().4h, \o1\().4s, #12
         rshrn           \o3\().4h, \o3\().4s, #12
 .endm
@@ -534,7 +557,8 @@
         movrel          x16, iadst4_coeffs
         ld1             {v0.8h}, [x16]
 
-        sub             v3.8h,   v16.8h,  v18.8h
+        ssubl           v2.4s,   v16.4h,  v18.4h
+        ssubl2          v3.4s,   v16.8h,  v18.8h
         smull           v4.4s,   v16.4h,  v0.h[0]
         smlal           v4.4s,   v18.4h,  v0.h[1]
         smlal           v4.4s,   v19.4h,  v0.h[2]
@@ -541,7 +565,8 @@
         smull2          v5.4s,   v16.8h,  v0.h[0]
         smlal2          v5.4s,   v18.8h,  v0.h[1]
         smlal2          v5.4s,   v19.8h,  v0.h[2]
-        add             v3.8h,   v3.8h,   v19.8h
+        saddw           v2.4s,   v2.4s,   v19.4h
+        saddw2          v3.4s,   v3.4s,   v19.8h
         smull           v6.4s,   v16.4h,  v0.h[2]
         smlsl           v6.4s,   v18.4h,  v0.h[0]
         smlsl           v6.4s,   v19.4h,  v0.h[1]
@@ -549,7 +574,8 @@
         smlsl2          v7.4s,   v18.8h,  v0.h[0]
         smlsl2          v7.4s,   v19.8h,  v0.h[1]
 
-        sqrdmulh        v18.8h,  v3.8h,   v0.h[4]
+        mul             v18.4s,  v2.4s,   v0.s[2]
+        mul             v19.4s,  v3.4s,   v0.s[2]
 
         smull           v2.4s,   v17.4h,  v0.h[3]
         smull2          v3.4s,   v17.8h,  v0.h[3]
@@ -566,6 +592,9 @@
         sub             v4.4s,   v4.4s,   v2.4s // out3
         sub             v5.4s,   v5.4s,   v3.4s
 
+        rshrn           v18.4h,  v18.4s, #12
+        rshrn2          v18.8h,  v19.4s, #12
+
         rshrn           \o0\().4h, v16.4s, #12
         rshrn2          \o0\().8h, v17.4s, #12
 
@@ -836,16 +865,25 @@
         sqsub           v5\sz,     v5\sz, v19\sz // t7
         sqneg           \o1\()\sz, \o1\()\sz     // out1
 
-        add             v6\sz,   v2\sz,   v4\sz
-        sub             v7\sz,   v2\sz,   v4\sz
-        add             v4\sz,   v3\sz,   v5\sz
-        sub             v5\sz,   v3\sz,   v5\sz
-        sqrdmulh        \o3\sz,  v6\sz,   v1.h[1] // out3
-        sqrdmulh        \o4\sz,  v7\sz,   v1.h[1] // out4
-        sqrdmulh        \o2\sz,  v4\sz,   v1.h[1] // out2
-        sqrdmulh        \o5\sz,  v5\sz,   v1.h[1] // out5
-        neg             \o3\()\sz, \o3\()\sz     // out3
-        neg             \o5\()\sz, \o5\()\sz     // out5
+        movi            v0.4s,  #2896>>4
+
+        saddl_sz        v18, v19, v2,  v4,  \sz // -> out3 (v19 or v20)
+        ssubl_sz        v6,  v7,  v2,  v4,  \sz // -> out4 (v20 or v19)
+        ssubl_sz        v20, v21, v3,  v5,  \sz // -> out5 (v21 or v18)
+        saddl_sz        v4,  v5,  v3,  v5,  \sz // -> out2 (v18 or v21)
+
+        mul_4s_sz       v18, v19, v18, v19, v0.s[0], \sz
+        mul_4s_sz       v6,  v7,  v6,  v7,  v0.s[0], \sz
+        mul_4s_sz       v20, v21, v20, v21, v0.s[0], \sz
+        mul_4s_sz       v4,  v5,  v4,  v5,  v0.s[0], \sz
+
+        rshrn_sz        v2,  v18, v19, #8,  \sz // out3
+        rshrn_sz        v3,  v20, v21, #8,  \sz // out5
+        rshrn_sz        \o2, v4,  v5,  #8,  \sz // out2 (v18 or v21)
+        rshrn_sz        \o4, v6,  v7,  #8,  \sz // out4 (v20 or v19)
+
+        sqneg           \o3\()\sz, v2\sz     // out3
+        sqneg           \o5\()\sz, v3\sz     // out5
 .endm
 
 function inv_adst_8x8_neon
@@ -1272,28 +1310,47 @@
         sqsub           v23\sz,  v25\sz,  v23\sz // t7
         sqneg           \o3\sz,  \o3\sz          // out3
 
-        sqsub           v24\sz,  v2\sz,   v21\sz // -> out8
-        sqadd           v2\sz,   v2\sz,   v21\sz // -> out7
-        sqadd           v21\sz,  v26\sz,  v3\sz  // -> out5
-        sqsub           v26\sz,  v26\sz,  v3\sz  // -> out10
-        sqadd           v3\sz,   v27\sz,  v20\sz // -> out6
-        sqsub           v25\sz,  v27\sz,  v20\sz // -> out9
-        sqadd           v20\sz,  v22\sz,  v23\sz // -> out4
-        sqsub           v27\sz,  v22\sz,  v23\sz // -> out11
+        movi            v0.4s,  #2896>>4
 
-        sqrdmulh        v2\sz,   v2\sz,   v0.h[1] // out7
-        sqrdmulh        v4\sz,   v21\sz,  v0.h[1] // out5
-        sqrdmulh        v5\sz,   v25\sz,  v0.h[1] // out9
-        sqrdmulh        v6\sz,   v27\sz,  v0.h[1] // out11
-        sqrdmulh        \o6\sz,  v3\sz,   v0.h[1] // out6
-        sqrdmulh        \o8\sz,  v24\sz,  v0.h[1] // out8
-        sqrdmulh        \o10\sz, v26\sz,  v0.h[1] // out10
-        sqrdmulh        \o4\sz,  v20\sz,  v0.h[1] // out4
+        ssubl_sz        v24, v25, v2,  v21, \sz // -> out8 (v24 or v23)
+        saddl_sz        v4,  v5,  v2,  v21, \sz // -> out7 (v23 or v24)
+        saddl_sz        v6,  v7,  v26, v3,  \sz // -> out5 (v21 or v26)
+        ssubl_sz        v2,  v3,  v26, v3,  \sz // -> out10 (v26 or v21)
 
-        neg             \o7\sz,  v2\sz // out7
-        neg             \o5\sz,  v4\sz // out5
-        neg             \o9\sz,  v5\sz // out9
-        neg             \o11\sz, v6\sz // out11
+        mul_4s_sz       v24, v25, v24, v25, v0.s[0], \sz
+        mul_4s_sz       v4,  v5,  v4,  v5,  v0.s[0], \sz
+        mul_4s_sz       v6,  v7,  v6,  v7,  v0.s[0], \sz
+        mul_4s_sz       v2,  v3,  v2,  v3,  v0.s[0], \sz
+
+        rshrn_sz        v24, v24, v25, #8,  \sz // out8
+        rshrn_sz        v4,  v4,  v5,  #8,  \sz // out7
+        rshrn_sz        v5,  v6,  v7,  #8,  \sz // out5
+        rshrn_sz        v26, v2,  v3,  #8,  \sz // out10
+
+        saddl_sz        v2,  v3,  v22, v23, \sz // -> out4 (v20 or v27)
+        ssubl_sz        v6,  v7,  v22, v23, \sz // -> out11 (v27 or v20)
+        saddl_sz        v22, v23, v27, v20, \sz // -> out6 (v22 or v25)
+        ssubl_sz        v21, v25, v27, v20, \sz // -> out9 (v25 or v22)
+
+        mul_4s_sz       v2,  v3,  v2,  v3,  v0.s[0], \sz
+        mul_4s_sz       v6,  v7,  v6,  v7,  v0.s[0], \sz
+        mul_4s_sz       v22, v23, v22, v23, v0.s[0], \sz
+        mul_4s_sz       v21, v25, v21, v25, v0.s[0], \sz
+
+        rshrn_sz        \o4, v2,  v3,  #8,  \sz // out4
+        rshrn_sz        v6,  v6,  v7,  #8,  \sz // out11
+        rshrn_sz        v7,  v21, v25, #8,  \sz // out9
+        rshrn_sz        \o6, v22, v23, #8,  \sz // out6
+
+.ifc \o8, v23
+        mov             \o8\szb,  v24\szb
+        mov             \o10\szb, v26\szb
+.endif
+
+        sqneg           \o7\sz,  v4\sz // out7
+        sqneg           \o5\sz,  v5\sz // out5
+        sqneg           \o11\sz, v6\sz // out11
+        sqneg           \o9\sz,  v7\sz // out9
 .endm
 
 function inv_adst_8x16_neon
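
Two constant tricks in the patch deserve a note. The iadst4_coeffs
change stores 3344 followed by 0 so that the halfword pair .h[4-5]
reads back as the 32-bit lane .s[2] on a little-endian target. And
since NEON movi can only materialize an 8-bit immediate, the
adst8/adst16 paths load 2896>>4 = 181 and round-shift by #8 instead
of #12: 181/2^8 equals 2896/2^12 exactly, the same ~1/sqrt(2) scale
the removed sqrdmulh path applied. (The saddl_sz/ssubl_sz/mul_4s_sz
helpers simply emit the second, high-half instruction only when
operating on .8h vectors.) A standalone C check of both identities,
not part of dav1d, assuming a little-endian target as dav1d's NEON
paths do:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void) {
    /* .short 1321, 3803, 2482, 3344, 3344, 0:
     * .h[4-5] = {3344, 0} aliases .s[2] = 3344 (little-endian). */
    int16_t coeffs[6] = { 1321, 3803, 2482, 3344, 3344, 0 };
    int32_t s2;
    memcpy(&s2, &coeffs[4], sizeof(s2));
    assert(s2 == 3344);

    /* movi v0.4s, #2896>>4 loads 181; mul + rshrn #8 then matches
     * multiplying by 2896 and rounding-shifting by 12. */
    assert((2896 >> 4) == 181);
    for (int32_t x = -4096; x <= 4096; x++) {
        int32_t via_181  = (x * 181  + (1 << 7))  >> 8;
        int32_t via_2896 = (x * 2896 + (1 << 11)) >> 12;
        assert(via_181 == via_2896);
    }
    return 0;
}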