ref: 9f084b0d267029817599e6a6a9692350f823c1ae
parent: e36088e405054f7b90e3fc757f718003c2ac19f9
author: Martin Storsjö <martin@martin.st>
date: Thu Jan 2 10:43:49 EST 2020
arm64: itx: Use sqrdmulh in the preexisting identity transform functions
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -148,13 +148,6 @@
.endif
.endm
-.macro sqrshrn_sz d0, s0, s1, shift, sz
- sqrshrn \d0\().4h, \s0\().4s, \shift
-.ifc \sz, .8h
- sqrshrn2 \d0\().8h, \s1\().4s, \shift
-.endif
-.endm
-
.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7
sqrdmulh \r0\sz, \r0\sz, \c
sqrdmulh \r1\sz, \r1\sz, \c
@@ -168,31 +161,6 @@
.endif
.endm
-.macro scale_wide sz, c, r0, r1, r2 r3, r4, r5, r6, r7
- smull_sz v2, v3, \r0, \c, \sz
- smull_sz v4, v5, \r1, \c, \sz
- smull_sz v6, v7, \r2, \c, \sz
- sqrshrn_sz \r0, v2, v3, #12, \sz
- smull_sz v2, v3, \r3, \c, \sz
- sqrshrn_sz \r1, v4, v5, #12, \sz
-.ifnb \r4
- smull_sz v4, v5, \r4, \c, \sz
-.endif
- sqrshrn_sz \r2, v6, v7, #12, \sz
-.ifnb \r4
- smull_sz v6, v7, \r5, \c, \sz
-.endif
- sqrshrn_sz \r3, v2, v3, #12, \sz
-.ifnb \r4
- smull_sz v2, v3, \r6, \c, \sz
- sqrshrn_sz \r4, v4, v5, #12, \sz
- smull_sz v4, v5, \r7, \c, \sz
- sqrshrn_sz \r5, v6, v7, #12, \sz
- sqrshrn_sz \r6, v2, v3, #12, \sz
- sqrshrn_sz \r7, v4, v5, #12, \sz
-.endif
-.endm
-
.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4
.ifnb \load
ld1 {\load}, [\src], x1
@@ -606,38 +574,30 @@
endfunc
function inv_identity_4x4_neon
- mov w16, #5793
+ mov w16, #(5793-4096)*8 // 1697*8 = 13576: only the part of 5793/4096 above 1.0, scaled so sqrdmulh (2*x*c >> 16) gives x*1697/4096
dup v0.4h, w16
- smull v4.4s, v16.4h, v0.h[0]
- smull v5.4s, v17.4h, v0.h[0]
- smull v6.4s, v18.4h, v0.h[0]
- smull v7.4s, v19.4h, v0.h[0]
- sqrshrn v16.4h, v4.4s, #12
- sqrshrn v17.4h, v5.4s, #12
- sqrshrn v18.4h, v6.4s, #12
- sqrshrn v19.4h, v7.4s, #12
+ sqrdmulh v4.4h, v16.4h, v0.h[0] // v4 = round(v16 * 1697/4096); stays in 16-bit lanes, no widening multiply
+ sqrdmulh v5.4h, v17.4h, v0.h[0] // v5..v7: same fractional term for v17..v19
+ sqrdmulh v6.4h, v18.4h, v0.h[0]
+ sqrdmulh v7.4h, v19.4h, v0.h[0]
+ sqadd v16.4h, v16.4h, v4.4h // v16 = v16 + v4 = v16 * 5793/4096, saturating
+ sqadd v17.4h, v17.4h, v5.4h // v17..v19 updated in place the same way
+ sqadd v18.4h, v18.4h, v6.4h
+ sqadd v19.4h, v19.4h, v7.4h
ret
endfunc
function inv_identity_8x4_neon
- mov w16, #5793
+ mov w16, #(5793-4096)*8 // 1697*8 = 13576: fractional part of 5793/4096, pre-scaled for sqrdmulh's doubling and >>16
dup v0.4h, w16
- smull v2.4s, v16.4h, v0.h[0]
- smull2 v3.4s, v16.8h, v0.h[0]
- smull v4.4s, v17.4h, v0.h[0]
- smull2 v5.4s, v17.8h, v0.h[0]
- sqrshrn v16.4h, v2.4s, #12
- sqrshrn2 v16.8h, v3.4s, #12
- smull v6.4s, v18.4h, v0.h[0]
- smull2 v7.4s, v18.8h, v0.h[0]
- sqrshrn v17.4h, v4.4s, #12
- sqrshrn2 v17.8h, v5.4s, #12
- smull v2.4s, v19.4h, v0.h[0]
- smull2 v3.4s, v19.8h, v0.h[0]
- sqrshrn v18.4h, v6.4s, #12
- sqrshrn2 v18.8h, v7.4s, #12
- sqrshrn v19.4h, v2.4s, #12
- sqrshrn2 v19.8h, v3.4s, #12
+ sqrdmulh v4.8h, v16.8h, v0.h[0] // v4 = round(v16 * 1697/4096) on all 8 lanes; replaces the smull/smull2 + sqrshrn/sqrshrn2 pairs
+ sqrdmulh v5.8h, v17.8h, v0.h[0] // v5..v7: same fractional term for v17..v19
+ sqrdmulh v6.8h, v18.8h, v0.h[0]
+ sqrdmulh v7.8h, v19.8h, v0.h[0]
+ sqadd v16.8h, v16.8h, v4.8h // v16 = v16 + v4 = v16 * 5793/4096, saturating
+ sqadd v17.8h, v17.8h, v5.8h // v17..v19 updated in place the same way
+ sqadd v18.8h, v18.8h, v6.8h
+ sqadd v19.8h, v19.8h, v7.8h
ret
endfunc
@@ -1360,23 +1320,23 @@
endfunc
function inv_identity_8x16_neon
- mov w16, #2*5793
+ mov w16, #2*(5793-4096)*8 // 3394*8 = 27152: the part of 2*5793/4096 above 2.0; still fits a signed 16-bit lane
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- smull v2.4s, v\i\().4h, v0.h[0]
- smull2 v3.4s, v\i\().8h, v0.h[0]
- sqrshrn v\i\().4h, v2.4s, #12
- sqrshrn2 v\i\().8h, v3.4s, #12
+ sqrdmulh v2.8h, v\i\().8h, v0.h[0] // v2 = round(x * 3394/4096), computed from the not-yet-doubled input
+ sqadd v\i\().8h, v\i\().8h, v\i\().8h // x = 2*x, saturating: integer part of the scale factor
+ sqadd v\i\().8h, v\i\().8h, v2.8h // x = 2*x + v2 = x * 11586/4096, saturating
.endr
ret
endfunc
function inv_identity_4x16_neon
- mov w16, #2*5793
+ mov w16, #2*(5793-4096)*8 // 3394*8 = 27152: the part of 2*5793/4096 above 2.0; still fits a signed 16-bit lane
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- smull v2.4s, v\i\().4h, v0.h[0]
- sqrshrn v\i\().4h, v2.4s, #12
+ sqrdmulh v2.4h, v\i\().4h, v0.h[0] // v2 = round(x * 3394/4096) on the low 4 lanes, from the not-yet-doubled input
+ sqadd v\i\().4h, v\i\().4h, v\i\().4h // x = 2*x, saturating: integer part of the scale factor
+ sqadd v\i\().4h, v\i\().4h, v2.4h // x = 2*x + v2 = x * 11586/4096, saturating
.endr
ret
endfunc
@@ -1397,6 +1357,22 @@
.endr
.endm
+.macro identity_8x8_shift1 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ sqrdmulh v2.8h, \i, \c // v2 = round(x * c / 32768) for each of v16-v23; c is a 16-bit element operand (the visible caller passes v1.h[1])
+ srshr v2.8h, v2.8h, #1 // v2 = round(v2 / 2): the rounding >>1 is applied to the fractional term only
+ sqadd \i, \i, v2.8h // x = x + v2, saturating; v2 is the only scratch register used
+.endr
+.endm
+
+.macro identity_8x8 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ sqrdmulh v2.8h, \i, \c // v2 = round(x * c / 32768) for each of v16-v23, taken before x is doubled
+ sqadd \i, \i, \i // x = 2*x, saturating
+ sqadd \i, \i, v2.8h // x = 2*x + v2, saturating; v2 is the only scratch register used
+.endr
+.endm
+
function inv_txfm_horz_16x8_neon
mov x14, x30
movi v7.8h, #0
@@ -2282,7 +2258,7 @@
.macro def_identity_1632 w, h, wshort, hshort
function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
mov w16, #2896*8
- mov w17, #2*5793
+ mov w17, #2*(5793-4096)*8
dup v1.4h, w16
movi v0.8h, #0
mov v1.h[1], w17
@@ -2302,12 +2278,11 @@
.if \w == 16
// 16x32
- scale_wide .8h, v1.h[1], v16, v17, v18, v19, v20, v21, v22, v23
- shift_8_regs srshr, 1
+ identity_8x8_shift1 v1.h[1]
.else
// 32x16
shift_8_regs sqshl, 1
- scale_wide .8h, v1.h[1], v16, v17, v18, v19, v20, v21, v22, v23
+ identity_8x8 v1.h[1]
.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5