ref: e36088e405054f7b90e3fc757f718003c2ac19f9
parent: 33e65d80de3e3e17c11d6bc6a8da25bcca099962
author: Martin Storsjö <martin@martin.st>
date: Wed Jan 1 18:12:52 EST 2020
arm64: itx: Special case transforms with identity in the first pass with downshift

Make sure not to clip to a 16 bit range before the downshift is done.

Add clamping to the 16 bit range in all other identity transforms,
where there is no downshift. 4x4, 8x4 and 4x8 don't have any downshift,
so the existing code structure works fine there. The identity
transforms of size 32 are already special cased with the downshift
folded in where possible; clamping properly in them should be enough,
as any out of range values will be clamped to pixel range in the end
anyway.

Therefore, only 8x8 and all the size 16 variants need a special cased
identity in the first pass, keeping intermediates in 32 bit until the
downshift is done.
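
A minimal scalar sketch of the ordering issue (illustrative values and
helper, not dav1d code), using the identity8 doubling followed by the
first-pass downshift of 1:

    #include <stdint.h>
    #include <stdio.h>

    static int16_t clamp16(int32_t v) {
        return v > 32767 ? 32767 : v < -32768 ? -32768 : (int16_t)v;
    }

    int main(void) {
        int32_t in = 20000;          // coefficient, fits in 16 bits
        int32_t product = in * 2;    // identity8 doubles it: 40000

        // Wrong: clamping (or wrapping) to 16 bits before the downshift.
        printf("%d\n", clamp16(product) >> 1);        // prints 16383

        // Right: keep the intermediate wide until after the downshift.
        printf("%d\n", clamp16((product + 1) >> 1));  // prints 20000
        return 0;
    }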
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -148,6 +148,13 @@
.endif
.endm
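+// As rshrn_sz above, but using sqrshrn, which saturates: the narrowed
+// result is clamped to the 16 bit range instead of wrapping.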
+.macro sqrshrn_sz d0, s0, s1, shift, sz
+ sqrshrn \d0\().4h, \s0\().4s, \shift
+.ifc \sz, .8h
+ sqrshrn2 \d0\().8h, \s1\().4s, \shift
+.endif
+.endm
+
.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
sqrdmulh \r0\sz, \r0\sz, \c
sqrdmulh \r1\sz, \r1\sz, \c
@@ -165,24 +172,24 @@
smull_sz v2, v3, \r0, \c, \sz
smull_sz v4, v5, \r1, \c, \sz
smull_sz v6, v7, \r2, \c, \sz
- rshrn_sz \r0, v2, v3, #12, \sz
+ sqrshrn_sz \r0, v2, v3, #12, \sz
smull_sz v2, v3, \r3, \c, \sz
- rshrn_sz \r1, v4, v5, #12, \sz
+ sqrshrn_sz \r1, v4, v5, #12, \sz
.ifnb \r4
smull_sz v4, v5, \r4, \c, \sz
.endif
- rshrn_sz \r2, v6, v7, #12, \sz
+ sqrshrn_sz \r2, v6, v7, #12, \sz
.ifnb \r4
smull_sz v6, v7, \r5, \c, \sz
.endif
- rshrn_sz \r3, v2, v3, #12, \sz
+ sqrshrn_sz \r3, v2, v3, #12, \sz
.ifnb \r4
smull_sz v2, v3, \r6, \c, \sz
- rshrn_sz \r4, v4, v5, #12, \sz
+ sqrshrn_sz \r4, v4, v5, #12, \sz
smull_sz v4, v5, \r7, \c, \sz
- rshrn_sz \r5, v6, v7, #12, \sz
- rshrn_sz \r6, v2, v3, #12, \sz
- rshrn_sz \r7, v4, v5, #12, \sz
+ sqrshrn_sz \r5, v6, v7, #12, \sz
+ sqrshrn_sz \r6, v2, v3, #12, \sz
+ sqrshrn_sz \r7, v4, v5, #12, \sz
.endif
.endm
@@ -605,10 +612,10 @@
smull v5.4s, v17.4h, v0.h[0]
smull v6.4s, v18.4h, v0.h[0]
smull v7.4s, v19.4h, v0.h[0]
- rshrn v16.4h, v4.4s, #12
- rshrn v17.4h, v5.4s, #12
- rshrn v18.4h, v6.4s, #12
- rshrn v19.4h, v7.4s, #12
+ sqrshrn v16.4h, v4.4s, #12
+ sqrshrn v17.4h, v5.4s, #12
+ sqrshrn v18.4h, v6.4s, #12
+ sqrshrn v19.4h, v7.4s, #12
ret
endfunc
@@ -619,21 +626,28 @@
smull2 v3.4s, v16.8h, v0.h[0]
smull v4.4s, v17.4h, v0.h[0]
smull2 v5.4s, v17.8h, v0.h[0]
- rshrn v16.4h, v2.4s, #12
- rshrn2 v16.8h, v3.4s, #12
+ sqrshrn v16.4h, v2.4s, #12
+ sqrshrn2 v16.8h, v3.4s, #12
smull v6.4s, v18.4h, v0.h[0]
smull2 v7.4s, v18.8h, v0.h[0]
- rshrn v17.4h, v4.4s, #12
- rshrn2 v17.8h, v5.4s, #12
+ sqrshrn v17.4h, v4.4s, #12
+ sqrshrn2 v17.8h, v5.4s, #12
smull v2.4s, v19.4h, v0.h[0]
smull2 v3.4s, v19.8h, v0.h[0]
- rshrn v18.4h, v6.4s, #12
- rshrn2 v18.8h, v7.4s, #12
- rshrn v19.4h, v2.4s, #12
- rshrn2 v19.8h, v3.4s, #12
+ sqrshrn v18.4h, v6.4s, #12
+ sqrshrn2 v18.8h, v7.4s, #12
+ sqrshrn v19.4h, v2.4s, #12
+ sqrshrn2 v19.8h, v3.4s, #12
ret
endfunc
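+
+// identity4 for the first pass, with the downshift of 1 folded in:
+// (x*sqrt(2) + 1) >> 1. With c = (5793-4096)*8, sqrdmulh yields
+// roughly x*(sqrt(2)-1), and srhadd adds x back in with a rounding
+// halving add, so no intermediate exceeds the 16 bit range.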
+.macro identity_8x4_shift1 r0, r1, r2, r3, c
+.irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h
+ sqrdmulh v2.8h, \i, \c
+ srhadd \i, \i, v2.8h
+.endr
+.endm
+
function inv_txfm_add_wht_wht_4x4_neon, export=1
mov x15, x30
movi v31.8h, #0
@@ -877,30 +891,31 @@
endfunc
function inv_identity_8x8_neon
- shl v16.8h, v16.8h, #1
- shl v17.8h, v17.8h, #1
- shl v18.8h, v18.8h, #1
- shl v19.8h, v19.8h, #1
- shl v20.8h, v20.8h, #1
- shl v21.8h, v21.8h, #1
- shl v22.8h, v22.8h, #1
- shl v23.8h, v23.8h, #1
+ sqshl v16.8h, v16.8h, #1
+ sqshl v17.8h, v17.8h, #1
+ sqshl v18.8h, v18.8h, #1
+ sqshl v19.8h, v19.8h, #1
+ sqshl v20.8h, v20.8h, #1
+ sqshl v21.8h, v21.8h, #1
+ sqshl v22.8h, v22.8h, #1
+ sqshl v23.8h, v23.8h, #1
ret
endfunc
function inv_identity_4x8_neon
- shl v16.4h, v16.4h, #1
- shl v17.4h, v17.4h, #1
- shl v18.4h, v18.4h, #1
- shl v19.4h, v19.4h, #1
- shl v20.4h, v20.4h, #1
- shl v21.4h, v21.4h, #1
- shl v22.4h, v22.4h, #1
- shl v23.4h, v23.4h, #1
+ sqshl v16.4h, v16.4h, #1
+ sqshl v17.4h, v17.4h, #1
+ sqshl v18.4h, v18.4h, #1
+ sqshl v19.4h, v19.4h, #1
+ sqshl v20.4h, v20.4h, #1
+ sqshl v21.4h, v21.4h, #1
+ sqshl v22.4h, v22.4h, #1
+ sqshl v23.4h, v23.4h, #1
ret
endfunc
-function inv_txfm_add_8x8_neon
+.macro def_fn_8x8_base variant
+function inv_txfm_\variant\()add_8x8_neon
movi v28.8h, #0
movi v29.8h, #0
movi v30.8h, #0
@@ -910,6 +925,9 @@
ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2]
st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
+.ifc \variant, identity_
+ // The identity shl #1 and downshift srshr #1 cancel out
+.else
blr x4
srshr v16.8h, v16.8h, #1
@@ -920,6 +938,7 @@
srshr v21.8h, v21.8h, #1
srshr v22.8h, v22.8h, #1
srshr v23.8h, v23.8h, #1
+.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
@@ -928,7 +947,11 @@
load_add_store_8x8 x0, x7
br x15
endfunc
+.endm
+def_fn_8x8_base
+def_fn_8x8_base identity_
+
.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_neon, export=1
mov x15, x30
@@ -936,9 +959,13 @@
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc 8, 8, 1
.endif
- adr x4, inv_\txfm1\()_8x8_neon
adr x5, inv_\txfm2\()_8x8_neon
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_8x8_neon
+.else
+ adr x4, inv_\txfm1\()_8x8_neon
b inv_txfm_add_8x8_neon
+.endif
endfunc
.endm
@@ -1338,8 +1365,8 @@
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
smull v2.4s, v\i\().4h, v0.h[0]
smull2 v3.4s, v\i\().8h, v0.h[0]
- rshrn v\i\().4h, v2.4s, #12
- rshrn2 v\i\().8h, v3.4s, #12
+ sqrshrn v\i\().4h, v2.4s, #12
+ sqrshrn2 v\i\().8h, v3.4s, #12
.endr
ret
endfunc
@@ -1349,11 +1376,27 @@
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
smull v2.4s, v\i\().4h, v0.h[0]
- rshrn v\i\().4h, v2.4s, #12
+ sqrshrn v\i\().4h, v2.4s, #12
.endr
ret
endfunc
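+
+// identity16 with the downshift of 2 folded in: (x*2*sqrt(2) + 2) >> 2.
+// sqrdmulh with c = 2*(5793-4096)*8 gives roughly x*2*(sqrt(2)-1),
+// sshr #1 halves that to x*(sqrt(2)-1), and the rounding halving add
+// of x then produces (x*sqrt(2) + 1) >> 1, which is the same value.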
+.macro identity_8x16_shift2 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ sqrdmulh v2.8h, \i, \c
+ sshr v2.8h, v2.8h, #1
+ srhadd \i, \i, v2.8h
+.endr
+.endm
+
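+// identity16 with the downshift of 1 folded in: (x*2*sqrt(2) + 1) >> 1.
+// Here the sum x + x*(sqrt(2)-1) can exceed the 16 bit range, so the
+// final add is a saturating sqadd rather than a halving add.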
+.macro identity_8x16_shift1 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ sqrdmulh v2.8h, \i, \c
+ srshr v2.8h, v2.8h, #1
+ sqadd \i, \i, v2.8h
+.endr
+.endm
+
function inv_txfm_horz_16x8_neon
mov x14, x30
movi v7.8h, #0
@@ -1375,6 +1418,26 @@
br x14
endfunc
+function inv_txfm_horz_identity_16x8_neon
+ mov x14, x30
+ movi v7.8h, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7]
+ st1 {v7.8h}, [x7], x8
+.endr
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+ identity_8x16_shift2 v0.h[0]
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+ transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
+
+.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+ st1 {v\i\().8h}, [x6], #16
+.endr
+
+ br x14
+endfunc
+
function inv_txfm_horz_scale_16x8_neon
mov x14, x30
movi v7.8h, #0
@@ -1421,7 +1484,7 @@
.endif
add x7, x2, #(\i*2)
mov x8, #16*2
- bl inv_txfm_horz_16x8_neon
+ blr x9
.endr
b 2f
1:
@@ -1449,7 +1512,12 @@
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc 16, 16, 2
.endif
+.ifc \txfm1, identity
+ adr x9, inv_txfm_horz_identity_16x8_neon
+.else
+ adr x9, inv_txfm_horz_16x8_neon
adr x4, inv_\txfm1\()_8x16_neon
+.endif
adr x5, inv_\txfm2\()_8x16_neon
mov x13, #\eob_half
b inv_txfm_add_16x16_neon
@@ -1469,10 +1537,33 @@
def_fn_16x16 flipadst, flipadst, 36
def_fn_16x16 identity, dct, 8
-function inv_txfm_add_16x4_neon
+.macro def_fn_416_base variant
+function inv_txfm_\variant\()add_16x4_neon
mov x15, x30
movi v4.8h, #0
+.ifc \variant, identity_
+.irp i, v16.4h, v17.4h, v18.4h, v19.4h
+ ld1 {\i}, [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+.irp i, v16.d, v17.d, v18.d, v19.d
+ ld1 {\i}[1], [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+.irp i, v20.4h, v21.4h, v22.4h, v23.4h
+ ld1 {\i}, [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+.irp i, v20.d, v21.d, v22.d, v23.d
+ ld1 {\i}[1], [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+
+ identity_8x16_shift1 v0.h[0]
+.else
.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
ld1 {\i}, [x2]
st1 {v4.4h}, [x2], #8
@@ -1487,11 +1578,18 @@
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
srshr \i, \i, #1
.endr
+.endif
transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
blr x5
mov x6, x0
load_add_store_8x4 x6, x7
+.ifc \variant, identity_
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ mov v18.16b, v22.16b
+ mov v19.16b, v23.16b
+.else
ins v24.d[1], v28.d[0]
ins v25.d[1], v29.d[0]
ins v26.d[1], v30.d[0]
@@ -1500,6 +1598,7 @@
srshr v17.8h, v25.8h, #1
srshr v18.8h, v26.8h, #1
srshr v19.8h, v27.8h, #1
+.endif
transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
blr x5
add x6, x0, #8
@@ -1508,7 +1607,7 @@
br x15
endfunc
-function inv_txfm_add_4x16_neon
+function inv_txfm_\variant\()add_4x16_neon
mov x15, x30
movi v2.8h, #0
@@ -1517,15 +1616,25 @@
b.lt 1f
add x6, x2, #16
-.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+.ifc \variant, identity_
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
ld1 {\i}, [x6]
st1 {v2.8h}, [x6], x11
.endr
+ mov w16, #(5793-4096)*8
+ dup v0.4h, w16
+ identity_8x4_shift1 v24, v25, v26, v27, v0.h[0]
+.else
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ ld1 {\i}, [x6]
+ st1 {v2.8h}, [x6], x11
+.endr
blr x4
srshr v24.8h, v16.8h, #1
srshr v25.8h, v17.8h, #1
srshr v26.8h, v18.8h, #1
srshr v27.8h, v19.8h, #1
+.endif
transpose_4x8h v24, v25, v26, v27, v4, v5, v6, v7
ins v28.d[0], v24.d[1]
ins v29.d[0], v25.d[1]
@@ -1543,10 +1652,16 @@
ld1 {\i}, [x2]
st1 {v2.8h}, [x2], x11
.endr
+.ifc \variant, identity_
+ mov w16, #(5793-4096)*8
+ dup v0.4h, w16
+ identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]
+.else
blr x4
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
srshr \i, \i, #1
.endr
+.endif
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
ins v20.d[0], v16.d[1]
ins v21.d[0], v17.d[1]
@@ -1559,7 +1674,11 @@
br x15
endfunc
+.endm
+def_fn_416_base
+def_fn_416_base identity_
+
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
@@ -1573,7 +1692,11 @@
adr x4, inv_\txfm1\()_4x\w\()_neon
adr x5, inv_\txfm2\()_8x\h\()_neon
.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
b inv_txfm_add_\w\()x\h\()_neon
+.endif
endfunc
.endm
@@ -1600,7 +1723,8 @@
def_fns_416 16, 4
-function inv_txfm_add_16x8_neon
+.macro def_fn_816_base variant
+function inv_txfm_\variant\()add_16x8_neon
mov x15, x30
movi v4.8h, #0
mov w16, #2896*8
@@ -1613,11 +1737,17 @@
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.ifc \variant, identity_
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+ identity_8x16_shift1 v0.h[0]
+.else
blr x4
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
srshr \i, \i, #1
.endr
+.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
blr x5
@@ -1625,6 +1755,16 @@
mov x6, x0
load_add_store_8x8 x6, x7
+.ifc \variant, identity_
+ mov v16.16b, v24.16b
+ mov v17.16b, v25.16b
+ mov v18.16b, v26.16b
+ mov v19.16b, v27.16b
+ mov v20.16b, v28.16b
+ mov v21.16b, v29.16b
+ mov v22.16b, v30.16b
+ mov v23.16b, v31.16b
+.else
srshr v16.8h, v24.8h, #1
srshr v17.8h, v25.8h, #1
srshr v18.8h, v26.8h, #1
@@ -1633,6 +1773,7 @@
srshr v21.8h, v29.8h, #1
srshr v22.8h, v30.8h, #1
srshr v23.8h, v31.8h, #1
+.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
@@ -1644,7 +1785,7 @@
br x15
endfunc
-function inv_txfm_add_8x16_neon
+function inv_txfm_\variant\()add_8x16_neon
mov x15, x30
movi v4.8h, #0
mov w16, #2896*8
@@ -1655,6 +1796,14 @@
b.lt 1f
add x6, x2, #16
+.ifc \variant, identity_
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x6]
+ st1 {v4.8h}, [x6], x11
+.endr
+ scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+ // The identity shl #1 and downshift srshr #1 cancel out
+.else
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
ld1 {\i}, [x6]
st1 {v4.8h}, [x6], x11
@@ -1670,6 +1819,7 @@
srshr v29.8h, v21.8h, #1
srshr v30.8h, v22.8h, #1
srshr v31.8h, v23.8h, #1
+.endif
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
b 2f
@@ -1689,11 +1839,15 @@
st1 {v4.8h}, [x2], x11
.endr
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+.ifc \variant, identity_
+ // The identity shl #1 and downshift srshr #1 cancel out
+.else
blr x4
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
srshr \i, \i, #1
.endr
+.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
@@ -1703,7 +1857,11 @@
br x15
endfunc
+.endm
+def_fn_816_base
+def_fn_816_base identity_
+
.macro def_fn_816 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
@@ -1714,7 +1872,11 @@
.if \w == 8
mov x13, #\eob_half
.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
b inv_txfm_add_\w\()x\h\()_neon
+.endif
endfunc
.endm
@@ -2144,7 +2306,7 @@
shift_8_regs srshr, 1
.else
// 32x16
- shift_8_regs shl, 1
+ shift_8_regs sqshl, 1
scale_wide .8h, v1.h[1], v16, v17, v18, v19, v20, v21, v22, v23
.endif
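
For cross-checking the folded identity16 macros, a scalar model of
identity_8x16_shift1 (the NEON semantics are paraphrased from the Arm
reference manual; a sketch, not dav1d code):

    #include <stdint.h>
    #include <stdio.h>

    // Scalar stand-ins for the NEON ops used by identity_8x16_shift1.
    static int16_t sqrdmulh16(int16_t a, int16_t b) {
        // Saturating rounding doubling multiply, returning the high half.
        int64_t p = ((int64_t)a * b * 2 + (1 << 15)) >> 16;
        return p > 32767 ? 32767 : (int16_t)p;
    }

    static int16_t sqadd16(int16_t a, int16_t b) {
        // Saturating add.
        int32_t s = (int32_t)a + b;
        return s > 32767 ? 32767 : s < -32768 ? -32768 : (int16_t)s;
    }

    // (x * 2*sqrt(2) + 1) >> 1, saturated to the 16 bit range.
    static int16_t identity16_shift1(int16_t x) {
        int16_t t = sqrdmulh16(x, 2 * (5793 - 4096) * 8); // x*2*(sqrt(2)-1)
        t = (int16_t)(((int32_t)t + 1) >> 1);             // srshr #1
        return sqadd16(x, t);
    }

    int main(void) {
        for (int x = -32768; x < 32768; x += 8192)
            printf("%6d -> %6d\n", x, identity16_shift1((int16_t)x));
        return 0;
    }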