shithub: dav1d

ref: e36088e405054f7b90e3fc757f718003c2ac19f9
parent: 33e65d80de3e3e17c11d6bc6a8da25bcca099962
author: Martin Storsjö <martin@martin.st>
date: Wed Jan 1 18:12:52 EST 2020

arm64: itx: Specialcase transforms with identity in the first pass with downshift

Make sure to not clip to a 16 bit range before the downshift is done.
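
A minimal C sketch of the ordering issue (the helper names are
illustrative, not dav1d's actual code): the first pass result lives in
32 bit, and saturating it to 16 bit before the rounding downshift can
change the value that the downshift produces.

    #include <stdint.h>

    static int16_t clip16(int32_t v) {
        return v < INT16_MIN ? INT16_MIN :
               v > INT16_MAX ? INT16_MAX : (int16_t)v;
    }

    /* correct order: downshift the full-precision value, clamp last
     * (shift >= 1) */
    static int16_t downshift_then_clip(int32_t v, int shift) {
        return clip16((v + (1 << (shift - 1))) >> shift);
    }

    /* wrong order: clamping first caps the input at 32767, so e.g.
     * v = 40000, shift = 1 gives 16384 instead of 20000 */
    static int16_t clip_then_downshift(int32_t v, int shift) {
        return (int16_t)((clip16(v) + (1 << (shift - 1))) >> shift);
    }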

Add clipping to the 16 bit range in all other identity transforms, where
there is no downshift.
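
For the no-downshift case the fix amounts to making the identity
scaling itself saturate. A hedged C sketch of what e.g. the 8-point
identity (a multiply by 2, sqshl #1 in the assembly below) now does:

    #include <stdint.h>

    /* illustrative only: saturating identity8 scaling, mirroring the
     * shl -> sqshl / rshrn -> sqrshrn changes in the assembly */
    static int16_t identity8_scale_sat(int16_t x) {
        int32_t v = 2 * (int32_t)x;
        return v < INT16_MIN ? INT16_MIN :
               v > INT16_MAX ? INT16_MAX : (int16_t)v;
    }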

4x4, 8x4 and 4x8 don't have any downshift, so the existing code
structure works fine.

The identity transforms of size 32 are already specialcased with the
downshift folded in where possible. Clamping properly in them should
be enough, as any out-of-range values will be clamped to the pixel
range in the end anyway.
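
The final clamp referred to here is the reconstruction step that adds
the residual to the predictor; roughly, for 8 bpc (illustrative names,
not dav1d's):

    #include <stdint.h>

    static uint8_t add_residual_clamp(uint8_t pred, int16_t residual) {
        int v = (int)pred + residual;
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }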

Therefore we only need a specialcased identity in the first pass (to
keep intermediates in 32 bit until downshifting) for 8x8 and all the
size 16 variants.
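
A hedged sketch of what such a specialcased first pass computes: the
identity scaling and the intermediate downshift are both applied to a
32 bit value, and the saturation to 16 bit happens only at the end.
The multipliers follow the AV1 identity scalings; the helper itself is
illustrative, not dav1d's code.

    #include <stdint.h>

    /* mul_q12: 5793 for identity4, 2*4096 for identity8,
     * 2*5793 for identity16, 4*4096 for identity32; shift >= 1 */
    static int16_t identity_first_pass(int16_t coeff, int32_t mul_q12,
                                       int shift) {
        int32_t scaled  = ((int32_t)coeff * mul_q12 + 2048) >> 12;
        int32_t shifted = (scaled + (1 << (shift - 1))) >> shift;
        return shifted < INT16_MIN ? INT16_MIN :
               shifted > INT16_MAX ? INT16_MAX : (int16_t)shifted;
    }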

--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -148,6 +148,13 @@
 .endif
 .endm
 
+.macro sqrshrn_sz d0, s0, s1, shift, sz
+        sqrshrn         \d0\().4h, \s0\().4s, \shift
+.ifc \sz, .8h
+        sqrshrn2        \d0\().8h, \s1\().4s, \shift
+.endif
+.endm
+
 .macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7
         sqrdmulh        \r0\sz,  \r0\sz,  \c
         sqrdmulh        \r1\sz,  \r1\sz,  \c
@@ -165,24 +172,24 @@
         smull_sz        v2,  v3,  \r0, \c,  \sz
         smull_sz        v4,  v5,  \r1, \c,  \sz
         smull_sz        v6,  v7,  \r2, \c,  \sz
-        rshrn_sz        \r0, v2,  v3,  #12, \sz
+        sqrshrn_sz      \r0, v2,  v3,  #12, \sz
         smull_sz        v2,  v3,  \r3, \c,  \sz
-        rshrn_sz        \r1, v4,  v5,  #12, \sz
+        sqrshrn_sz      \r1, v4,  v5,  #12, \sz
 .ifnb \r4
         smull_sz        v4,  v5,  \r4, \c,  \sz
 .endif
-        rshrn_sz        \r2, v6,  v7,  #12, \sz
+        sqrshrn_sz      \r2, v6,  v7,  #12, \sz
 .ifnb \r4
         smull_sz        v6,  v7,  \r5, \c,  \sz
 .endif
-        rshrn_sz        \r3, v2,  v3,  #12, \sz
+        sqrshrn_sz      \r3, v2,  v3,  #12, \sz
 .ifnb \r4
         smull_sz        v2,  v3,  \r6, \c,  \sz
-        rshrn_sz        \r4, v4,  v5,  #12, \sz
+        sqrshrn_sz      \r4, v4,  v5,  #12, \sz
         smull_sz        v4,  v5,  \r7, \c,  \sz
-        rshrn_sz        \r5, v6,  v7,  #12, \sz
-        rshrn_sz        \r6, v2,  v3,  #12, \sz
-        rshrn_sz        \r7, v4,  v5,  #12, \sz
+        sqrshrn_sz      \r5, v6,  v7,  #12, \sz
+        sqrshrn_sz      \r6, v2,  v3,  #12, \sz
+        sqrshrn_sz      \r7, v4,  v5,  #12, \sz
 .endif
 .endm
 
@@ -605,10 +612,10 @@
         smull           v5.4s,   v17.4h,  v0.h[0]
         smull           v6.4s,   v18.4h,  v0.h[0]
         smull           v7.4s,   v19.4h,  v0.h[0]
-        rshrn           v16.4h,  v4.4s,   #12
-        rshrn           v17.4h,  v5.4s,   #12
-        rshrn           v18.4h,  v6.4s,   #12
-        rshrn           v19.4h,  v7.4s,   #12
+        sqrshrn         v16.4h,  v4.4s,   #12
+        sqrshrn         v17.4h,  v5.4s,   #12
+        sqrshrn         v18.4h,  v6.4s,   #12
+        sqrshrn         v19.4h,  v7.4s,   #12
         ret
 endfunc
 
@@ -619,21 +626,28 @@
         smull2          v3.4s,   v16.8h,  v0.h[0]
         smull           v4.4s,   v17.4h,  v0.h[0]
         smull2          v5.4s,   v17.8h,  v0.h[0]
-        rshrn           v16.4h,  v2.4s,   #12
-        rshrn2          v16.8h,  v3.4s,   #12
+        sqrshrn         v16.4h,  v2.4s,   #12
+        sqrshrn2        v16.8h,  v3.4s,   #12
         smull           v6.4s,   v18.4h,  v0.h[0]
         smull2          v7.4s,   v18.8h,  v0.h[0]
-        rshrn           v17.4h,  v4.4s,   #12
-        rshrn2          v17.8h,  v5.4s,   #12
+        sqrshrn         v17.4h,  v4.4s,   #12
+        sqrshrn2        v17.8h,  v5.4s,   #12
         smull           v2.4s,   v19.4h,  v0.h[0]
         smull2          v3.4s,   v19.8h,  v0.h[0]
-        rshrn           v18.4h,  v6.4s,   #12
-        rshrn2          v18.8h,  v7.4s,   #12
-        rshrn           v19.4h,  v2.4s,   #12
-        rshrn2          v19.8h,  v3.4s,   #12
+        sqrshrn         v18.4h,  v6.4s,   #12
+        sqrshrn2        v18.8h,  v7.4s,   #12
+        sqrshrn         v19.4h,  v2.4s,   #12
+        sqrshrn2        v19.8h,  v3.4s,   #12
         ret
 endfunc
 
+.macro identity_8x4_shift1 r0, r1, r2, r3, c
+.irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h
+        sqrdmulh        v2.8h,  \i,  \c
+        srhadd          \i,     \i,  v2.8h
+.endr
+.endm
+
 function inv_txfm_add_wht_wht_4x4_neon, export=1
         mov             x15, x30
         movi            v31.8h,  #0
@@ -877,30 +891,31 @@
 endfunc
 
 function inv_identity_8x8_neon
-        shl             v16.8h,  v16.8h,  #1
-        shl             v17.8h,  v17.8h,  #1
-        shl             v18.8h,  v18.8h,  #1
-        shl             v19.8h,  v19.8h,  #1
-        shl             v20.8h,  v20.8h,  #1
-        shl             v21.8h,  v21.8h,  #1
-        shl             v22.8h,  v22.8h,  #1
-        shl             v23.8h,  v23.8h,  #1
+        sqshl           v16.8h,  v16.8h,  #1
+        sqshl           v17.8h,  v17.8h,  #1
+        sqshl           v18.8h,  v18.8h,  #1
+        sqshl           v19.8h,  v19.8h,  #1
+        sqshl           v20.8h,  v20.8h,  #1
+        sqshl           v21.8h,  v21.8h,  #1
+        sqshl           v22.8h,  v22.8h,  #1
+        sqshl           v23.8h,  v23.8h,  #1
         ret
 endfunc
 
 function inv_identity_4x8_neon
-        shl             v16.4h,  v16.4h,  #1
-        shl             v17.4h,  v17.4h,  #1
-        shl             v18.4h,  v18.4h,  #1
-        shl             v19.4h,  v19.4h,  #1
-        shl             v20.4h,  v20.4h,  #1
-        shl             v21.4h,  v21.4h,  #1
-        shl             v22.4h,  v22.4h,  #1
-        shl             v23.4h,  v23.4h,  #1
+        sqshl           v16.4h,  v16.4h,  #1
+        sqshl           v17.4h,  v17.4h,  #1
+        sqshl           v18.4h,  v18.4h,  #1
+        sqshl           v19.4h,  v19.4h,  #1
+        sqshl           v20.4h,  v20.4h,  #1
+        sqshl           v21.4h,  v21.4h,  #1
+        sqshl           v22.4h,  v22.4h,  #1
+        sqshl           v23.4h,  v23.4h,  #1
         ret
 endfunc
 
-function inv_txfm_add_8x8_neon
+.macro def_fn_8x8_base variant
+function inv_txfm_\variant\()add_8x8_neon
         movi            v28.8h,  #0
         movi            v29.8h,  #0
         movi            v30.8h,  #0
@@ -910,6 +925,9 @@
         ld1             {v20.8h,v21.8h,v22.8h,v23.8h}, [x2]
         st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
 
+.ifc \variant, identity_
+        // The identity shl #1 and downshift srshr #1 cancel out
+.else
         blr             x4
 
         srshr           v16.8h,  v16.8h,  #1
@@ -920,6 +938,7 @@
         srshr           v21.8h,  v21.8h,  #1
         srshr           v22.8h,  v22.8h,  #1
         srshr           v23.8h,  v23.8h,  #1
+.endif
 
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
 
@@ -928,7 +947,11 @@
         load_add_store_8x8 x0, x7
         br              x15
 endfunc
+.endm
 
+def_fn_8x8_base
+def_fn_8x8_base identity_
+
 .macro def_fn_8x8 txfm1, txfm2
 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_neon, export=1
         mov             x15, x30
@@ -936,9 +959,13 @@
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         8,   8,   1
 .endif
-        adr             x4,  inv_\txfm1\()_8x8_neon
         adr             x5,  inv_\txfm2\()_8x8_neon
+.ifc \txfm1, identity
+        b               inv_txfm_identity_add_8x8_neon
+.else
+        adr             x4,  inv_\txfm1\()_8x8_neon
         b               inv_txfm_add_8x8_neon
+.endif
 endfunc
 .endm
 
@@ -1338,8 +1365,8 @@
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         smull           v2.4s,   v\i\().4h,  v0.h[0]
         smull2          v3.4s,   v\i\().8h,  v0.h[0]
-        rshrn           v\i\().4h,  v2.4s,   #12
-        rshrn2          v\i\().8h,  v3.4s,   #12
+        sqrshrn         v\i\().4h,  v2.4s,   #12
+        sqrshrn2        v\i\().8h,  v3.4s,   #12
 .endr
         ret
 endfunc
@@ -1349,11 +1376,27 @@
         dup             v0.4h,   w16
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         smull           v2.4s,   v\i\().4h,  v0.h[0]
-        rshrn           v\i\().4h,  v2.4s,   #12
+        sqrshrn         v\i\().4h,  v2.4s,   #12
 .endr
         ret
 endfunc
 
+.macro identity_8x16_shift2 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        sqrdmulh        v2.8h,   \i,      \c
+        sshr            v2.8h,   v2.8h,   #1
+        srhadd          \i,      \i,      v2.8h
+.endr
+.endm
+
+.macro identity_8x16_shift1 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        sqrdmulh        v2.8h,   \i,      \c
+        srshr           v2.8h,   v2.8h,   #1
+        sqadd           \i,      \i,      v2.8h
+.endr
+.endm
+
 function inv_txfm_horz_16x8_neon
         mov             x14, x30
         movi            v7.8h,  #0
@@ -1375,6 +1418,26 @@
         br              x14
 endfunc
 
+function inv_txfm_horz_identity_16x8_neon
+        mov             x14, x30
+        movi            v7.8h,  #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x7]
+        st1             {v7.8h}, [x7], x8
+.endr
+        mov             w16, #2*(5793-4096)*8
+        dup             v0.4h,   w16
+        identity_8x16_shift2 v0.h[0]
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+        transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
+
+.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+        st1             {v\i\().8h}, [x6], #16
+.endr
+
+        br              x14
+endfunc
+
 function inv_txfm_horz_scale_16x8_neon
         mov             x14, x30
         movi            v7.8h,  #0
@@ -1421,7 +1484,7 @@
 .endif
         add             x7,  x2,  #(\i*2)
         mov             x8,  #16*2
-        bl              inv_txfm_horz_16x8_neon
+        blr             x9
 .endr
         b               2f
 1:
@@ -1449,7 +1512,12 @@
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         16,  16,  2
 .endif
+.ifc \txfm1, identity
+        adr             x9,  inv_txfm_horz_identity_16x8_neon
+.else
+        adr             x9,  inv_txfm_horz_16x8_neon
         adr             x4,  inv_\txfm1\()_8x16_neon
+.endif
         adr             x5,  inv_\txfm2\()_8x16_neon
         mov             x13, #\eob_half
         b               inv_txfm_add_16x16_neon
@@ -1469,10 +1537,33 @@
 def_fn_16x16 flipadst, flipadst, 36
 def_fn_16x16 identity, dct, 8
 
-function inv_txfm_add_16x4_neon
+.macro def_fn_416_base variant
+function inv_txfm_\variant\()add_16x4_neon
         mov             x15, x30
         movi            v4.8h,  #0
 
+.ifc \variant, identity_
+.irp i, v16.4h, v17.4h, v18.4h, v19.4h
+        ld1             {\i},    [x2]
+        st1             {v4.4h}, [x2], #8
+.endr
+.irp i, v16.d, v17.d, v18.d, v19.d
+        ld1             {\i}[1], [x2]
+        st1             {v4.4h}, [x2], #8
+.endr
+        mov             w16, #2*(5793-4096)*8
+        dup             v0.4h,   w16
+.irp i, v20.4h, v21.4h, v22.4h, v23.4h
+        ld1             {\i},    [x2]
+        st1             {v4.4h}, [x2], #8
+.endr
+.irp i, v20.d, v21.d, v22.d, v23.d
+        ld1             {\i}[1], [x2]
+        st1             {v4.4h}, [x2], #8
+.endr
+
+        identity_8x16_shift1 v0.h[0]
+.else
 .irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
         ld1             {\i},    [x2]
         st1             {v4.4h}, [x2], #8
@@ -1487,11 +1578,18 @@
 .irp i, v16.8h, v17.8h, v18.8h, v19.8h
         srshr           \i,  \i,  #1
 .endr
+.endif
         transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
         blr             x5
         mov             x6,  x0
         load_add_store_8x4 x6, x7
 
+.ifc \variant, identity_
+        mov             v16.16b, v20.16b
+        mov             v17.16b, v21.16b
+        mov             v18.16b, v22.16b
+        mov             v19.16b, v23.16b
+.else
         ins             v24.d[1], v28.d[0]
         ins             v25.d[1], v29.d[0]
         ins             v26.d[1], v30.d[0]
@@ -1500,6 +1598,7 @@
         srshr           v17.8h,  v25.8h,  #1
         srshr           v18.8h,  v26.8h,  #1
         srshr           v19.8h,  v27.8h,  #1
+.endif
         transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
         blr             x5
         add             x6,  x0,  #8
@@ -1508,7 +1607,7 @@
         br              x15
 endfunc
 
-function inv_txfm_add_4x16_neon
+function inv_txfm_\variant\()add_4x16_neon
         mov             x15, x30
         movi            v2.8h,   #0
 
@@ -1517,15 +1616,25 @@
         b.lt            1f
 
         add             x6,  x2,  #16
-.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+.ifc \variant, identity_
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
         ld1             {\i},    [x6]
         st1             {v2.8h}, [x6], x11
 .endr
+        mov             w16, #(5793-4096)*8
+        dup             v0.4h,   w16
+        identity_8x4_shift1 v24, v25, v26, v27, v0.h[0]
+.else
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+        ld1             {\i},    [x6]
+        st1             {v2.8h}, [x6], x11
+.endr
         blr             x4
         srshr           v24.8h,  v16.8h,  #1
         srshr           v25.8h,  v17.8h,  #1
         srshr           v26.8h,  v18.8h,  #1
         srshr           v27.8h,  v19.8h,  #1
+.endif
         transpose_4x8h  v24, v25, v26, v27, v4,  v5,  v6,  v7
         ins             v28.d[0], v24.d[1]
         ins             v29.d[0], v25.d[1]
@@ -1543,10 +1652,16 @@
         ld1             {\i},    [x2]
         st1             {v2.8h}, [x2], x11
 .endr
+.ifc \variant, identity_
+        mov             w16, #(5793-4096)*8
+        dup             v0.4h,   w16
+        identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]
+.else
         blr             x4
 .irp i, v16.8h, v17.8h, v18.8h, v19.8h
         srshr           \i,  \i,  #1
 .endr
+.endif
         transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
         ins             v20.d[0], v16.d[1]
         ins             v21.d[0], v17.d[1]
@@ -1559,7 +1674,11 @@
 
         br              x15
 endfunc
+.endm
 
+def_fn_416_base
+def_fn_416_base identity_
+
 .macro def_fn_416 w, h, txfm1, txfm2, eob_half
 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
 .ifc \txfm1\()_\txfm2, dct_dct
@@ -1573,7 +1692,11 @@
         adr             x4,  inv_\txfm1\()_4x\w\()_neon
         adr             x5,  inv_\txfm2\()_8x\h\()_neon
 .endif
+.ifc \txfm1, identity
+        b               inv_txfm_identity_add_\w\()x\h\()_neon
+.else
         b               inv_txfm_add_\w\()x\h\()_neon
+.endif
 endfunc
 .endm
 
@@ -1600,7 +1723,8 @@
 def_fns_416 16, 4
 
 
-function inv_txfm_add_16x8_neon
+.macro def_fn_816_base variant
+function inv_txfm_\variant\()add_16x8_neon
         mov             x15, x30
         movi            v4.8h,  #0
         mov             w16, #2896*8
@@ -1613,11 +1737,17 @@
 
         scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
         scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.ifc \variant, identity_
+        mov             w16, #2*(5793-4096)*8
+        dup             v0.4h,   w16
+        identity_8x16_shift1 v0.h[0]
+.else
         blr             x4
 
 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
         srshr           \i,  \i,  #1
 .endr
+.endif
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
 
         blr             x5
@@ -1625,6 +1755,16 @@
         mov             x6,  x0
         load_add_store_8x8 x6, x7
 
+.ifc \variant, identity_
+        mov             v16.16b, v24.16b
+        mov             v17.16b, v25.16b
+        mov             v18.16b, v26.16b
+        mov             v19.16b, v27.16b
+        mov             v20.16b, v28.16b
+        mov             v21.16b, v29.16b
+        mov             v22.16b, v30.16b
+        mov             v23.16b, v31.16b
+.else
         srshr           v16.8h,  v24.8h,  #1
         srshr           v17.8h,  v25.8h,  #1
         srshr           v18.8h,  v26.8h,  #1
@@ -1633,6 +1773,7 @@
         srshr           v21.8h,  v29.8h,  #1
         srshr           v22.8h,  v30.8h,  #1
         srshr           v23.8h,  v31.8h,  #1
+.endif
 
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
 
@@ -1644,7 +1785,7 @@
         br              x15
 endfunc
 
-function inv_txfm_add_8x16_neon
+function inv_txfm_\variant\()add_8x16_neon
         mov             x15, x30
         movi            v4.8h,   #0
         mov             w16, #2896*8
@@ -1655,6 +1796,14 @@
         b.lt            1f
 
         add             x6,  x2,  #16
+.ifc \variant, identity_
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        ld1             {\i},    [x6]
+        st1             {v4.8h}, [x6], x11
+.endr
+        scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+        // The identity shl #1 and downshift srshr #1 cancel out
+.else
 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
         ld1             {\i},    [x6]
         st1             {v4.8h}, [x6], x11
@@ -1670,6 +1819,7 @@
         srshr           v29.8h,  v21.8h,  #1
         srshr           v30.8h,  v22.8h,  #1
         srshr           v31.8h,  v23.8h,  #1
+.endif
         transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
 
         b               2f
@@ -1689,11 +1839,15 @@
         st1             {v4.8h}, [x2], x11
 .endr
         scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+.ifc \variant, identity_
+        // The identity shl #1 and downshift srshr #1 cancel out
+.else
         blr             x4
 
 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
         srshr           \i,  \i,  #1
 .endr
+.endif
 
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
 
@@ -1703,7 +1857,11 @@
 
         br              x15
 endfunc
+.endm
 
+def_fn_816_base
+def_fn_816_base identity_
+
 .macro def_fn_816 w, h, txfm1, txfm2, eob_half
 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
 .ifc \txfm1\()_\txfm2, dct_dct
@@ -1714,7 +1872,11 @@
 .if \w == 8
         mov             x13, #\eob_half
 .endif
+.ifc \txfm1, identity
+        b               inv_txfm_identity_add_\w\()x\h\()_neon
+.else
         b               inv_txfm_add_\w\()x\h\()_neon
+.endif
 endfunc
 .endm
 
@@ -2144,7 +2306,7 @@
         shift_8_regs    srshr, 1
 .else
         // 32x16
-        shift_8_regs    shl, 1
+        shift_8_regs    sqshl, 1
         scale_wide      .8h, v1.h[1], v16, v17, v18, v19, v20, v21, v22, v23
 .endif