ref: 5fa6c44a61fbf946646899d9db24d92cdce478ac
parent: 71e13008acae0a3cf6cee1790914248ee1140c89
author: Liwei Wang <liwei@multicorewareinc.com>
date: Wed Dec 26 09:57:39 EST 2018
Add SSSE3 implementation for the 8x8 blocks in itx

Cycle times:
inv_txfm_add_8x8_adst_adst_0_8bpc_c: 2165.6
inv_txfm_add_8x8_adst_adst_0_8bpc_ssse3: 194.5
inv_txfm_add_8x8_adst_adst_1_8bpc_c: 2158.3
inv_txfm_add_8x8_adst_adst_1_8bpc_ssse3: 194.7
inv_txfm_add_8x8_adst_dct_0_8bpc_c: 2241.0
inv_txfm_add_8x8_adst_dct_0_8bpc_ssse3: 165.1
inv_txfm_add_8x8_adst_dct_1_8bpc_c: 2242.6
inv_txfm_add_8x8_adst_dct_1_8bpc_ssse3: 164.2
inv_txfm_add_8x8_adst_flipadst_0_8bpc_c: 2178.2
inv_txfm_add_8x8_adst_flipadst_0_8bpc_ssse3: 194.4
inv_txfm_add_8x8_adst_flipadst_1_8bpc_c: 2183.0
inv_txfm_add_8x8_adst_flipadst_1_8bpc_ssse3: 194.2
inv_txfm_add_8x8_adst_identity_0_8bpc_c: 1592.1
inv_txfm_add_8x8_adst_identity_0_8bpc_ssse3: 125.2
inv_txfm_add_8x8_adst_identity_1_8bpc_c: 1597.7
inv_txfm_add_8x8_adst_identity_1_8bpc_ssse3: 126.3
inv_txfm_add_8x8_dct_adst_0_8bpc_c: 2214.1
inv_txfm_add_8x8_dct_adst_0_8bpc_ssse3: 162.0
inv_txfm_add_8x8_dct_adst_1_8bpc_c: 2221.5
inv_txfm_add_8x8_dct_adst_1_8bpc_ssse3: 161.9
inv_txfm_add_8x8_dct_dct_0_8bpc_c: 2247.8
inv_txfm_add_8x8_dct_dct_0_8bpc_ssse3: 34.0
inv_txfm_add_8x8_dct_dct_1_8bpc_c: 2243.1
inv_txfm_add_8x8_dct_dct_1_8bpc_ssse3: 133.7
inv_txfm_add_8x8_dct_flipadst_0_8bpc_c: 2255.1
inv_txfm_add_8x8_dct_flipadst_0_8bpc_ssse3: 161.2
inv_txfm_add_8x8_dct_flipadst_1_8bpc_c: 2244.6
inv_txfm_add_8x8_dct_flipadst_1_8bpc_ssse3: 161.8
inv_txfm_add_8x8_dct_identity_0_8bpc_c: 1632.3
inv_txfm_add_8x8_dct_identity_0_8bpc_ssse3: 41.3
inv_txfm_add_8x8_dct_identity_1_8bpc_c: 1629.6
inv_txfm_add_8x8_dct_identity_1_8bpc_ssse3: 97.7
inv_txfm_add_8x8_flipadst_adst_0_8bpc_c: 2185.6
inv_txfm_add_8x8_flipadst_adst_0_8bpc_ssse3: 191.0
inv_txfm_add_8x8_flipadst_adst_1_8bpc_c: 2165.7
inv_txfm_add_8x8_flipadst_adst_1_8bpc_ssse3: 191.6
inv_txfm_add_8x8_flipadst_dct_0_8bpc_c: 2246.4
inv_txfm_add_8x8_flipadst_dct_0_8bpc_ssse3: 162.8
inv_txfm_add_8x8_flipadst_dct_1_8bpc_c: 2252.1
inv_txfm_add_8x8_flipadst_dct_1_8bpc_ssse3: 163.9
inv_txfm_add_8x8_flipadst_flipadst_0_8bpc_c: 2180.9
inv_txfm_add_8x8_flipadst_flipadst_0_8bpc_ssse3: 196.3
inv_txfm_add_8x8_flipadst_flipadst_1_8bpc_c: 2192.2
inv_txfm_add_8x8_flipadst_flipadst_1_8bpc_ssse3: 194.5
inv_txfm_add_8x8_flipadst_identity_0_8bpc_c: 1600.9
inv_txfm_add_8x8_flipadst_identity_0_8bpc_ssse3: 126.6
inv_txfm_add_8x8_flipadst_identity_1_8bpc_c: 1600.5
inv_txfm_add_8x8_flipadst_identity_1_8bpc_ssse3: 126.4
inv_txfm_add_8x8_identity_adst_0_8bpc_c: 1558.0
inv_txfm_add_8x8_identity_adst_0_8bpc_ssse3: 120.7
inv_txfm_add_8x8_identity_adst_1_8bpc_c: 1556.7
inv_txfm_add_8x8_identity_adst_1_8bpc_ssse3: 121.0
inv_txfm_add_8x8_identity_dct_0_8bpc_c: 1600.8
inv_txfm_add_8x8_identity_dct_0_8bpc_ssse3: 37.9
inv_txfm_add_8x8_identity_dct_1_8bpc_c: 1599.5
inv_txfm_add_8x8_identity_dct_1_8bpc_ssse3: 90.3
inv_txfm_add_8x8_identity_flipadst_0_8bpc_c: 1584.9
inv_txfm_add_8x8_identity_flipadst_0_8bpc_ssse3: 120.2
inv_txfm_add_8x8_identity_flipadst_1_8bpc_c: 1584.3
inv_txfm_add_8x8_identity_flipadst_1_8bpc_ssse3: 120.5
inv_txfm_add_8x8_identity_identity_0_8bpc_c: 975.9
inv_txfm_add_8x8_identity_identity_0_8bpc_ssse3: 54.7
inv_txfm_add_8x8_identity_identity_1_8bpc_c: 975.7
inv_txfm_add_8x8_identity_identity_1_8bpc_ssse3: 54.7
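All of the fixed-point scaling in the assembly below is done with pmulhrsw against the pw_* constants this patch adds (pw_2048, pw_4096, pw_16384, pw_2896x8). As a reference point only, here is a minimal scalar C model of that trick; the helper function names are made up for illustration and are not part of the patch:

#include <stdint.h>

/* pmulhrsw: signed 16x16 multiply, round, keep bits [30:15]. */
static int16_t pmulhrsw(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * b + (1 << 14)) >> 15);
}

/* pw_2896x8: 2896 ~= 4096/sqrt(2), pre-multiplied by 8 so the >>15 of
 * pmulhrsw acts like a rounded >>12, i.e. a 1/sqrt(2) scale. */
static int16_t scale_inv_sqrt2(int16_t a)
{
    return pmulhrsw(a, 2896 * 8);
}

/* pw_2048, pw_4096 and pw_16384 turn pmulhrsw into rounded right shifts
 * by 4, 3 and 1 respectively (the per-pass output shifts). */
static int16_t round_shift4(int16_t a) { return pmulhrsw(a, 2048); }
static int16_t round_shift3(int16_t a) { return pmulhrsw(a, 4096); }
static int16_t round_shift1(int16_t a) { return pmulhrsw(a, 16384); }
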
--- a/src/x86/itx_init_tmpl.c
+++ b/src/x86/itx_init_tmpl.c
@@ -80,6 +80,7 @@
decl_itx17_fns(4, 4, ssse3);
decl_itx16_fns(4, 8, ssse3);
decl_itx16_fns(8, 4, ssse3);
+decl_itx16_fns(8, 8, ssse3);
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
@@ -126,6 +127,7 @@
assign_itx17_fn(, 4, 4, ssse3);
assign_itx16_fn(R, 4, 8, ssse3);
assign_itx16_fn(R, 8, 4, ssse3);
+ assign_itx16_fn(, 8, 8, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -58,6 +58,8 @@
pd_2048: times 4 dd 2048
pw_2048: times 8 dw 2048
pw_4096: times 8 dw 4096
+pw_16384: times 8 dw 16384
+pw_m16384: times 8 dw -16384
pw_2896x8: times 8 dw 2896*8
pw_3344x8: times 8 dw 3344*8
pw_5793x4: times 8 dw 5793*4
@@ -69,6 +71,14 @@
SECTION .text
+%macro REPX 2-* ; apply the instruction template in %1 to each remaining argument
+ %xdefine %%f(x) %1
+%rep %0 - 1
+ %rotate 1
+ %%f(%1)
+%endrep
+%endmacro
+
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
%if ARCH_X86_64
@@ -841,23 +851,31 @@
%macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3]
- movq m%3, [dstq ]
+ movq m%3, [dstq ]
movq m%4, [dstq+strideq]
pxor m%5, m%5
punpcklbw m%3, m%5 ;extend byte to word
punpcklbw m%4, m%5 ;extend byte to word
+%ifnum %1
paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+%ifnum %2
paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
packuswb m%3, m%4
- movq [dstq ], m%3
+ movq [dstq ], m%3
punpckhqdq m%3, m%3
movq [dstq+strideq], m%3
%endmacro
%macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3]
- WRITE_8X2 0, 1, 4, 5, 6
+ WRITE_8X2 %1, %2, %5, %6, %7
lea dstq, [dstq+strideq*2]
- WRITE_8X2 2, 3, 4, 5, 6
+ WRITE_8X2 %3, %4, %5, %6, %7
%endmacro
%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
@@ -1023,6 +1041,7 @@
mova [coeffq+16*1], m6
mova [coeffq+16*2], m6
mova [coeffq+16*3], m6
+.end3:
WRITE_8X4 0, 1, 2, 3, 4, 5, 6
RET
@@ -1116,3 +1135,344 @@
pmulhrsw m2, m4
pmulhrsw m3, m4
jmp m(iadst_8x4_internal).end
+
+%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
+ INV_TXFM_FN %1, %2, %3, 8x8, 8
+%ifidn %1_%2, dct_identity
+ mova m0, [o(pw_2896x8)]
+ pmulhrsw m0, [coeffq]
+ mova m1, [o(pw_16384)]
+ pmulhrsw m0, m1
+ psrlw m1, 2
+ pmulhrsw m0, m1
+ punpckhwd m7, m0, m0
+ punpcklwd m0, m0
+ pshufd m3, m0, q3333
+ pshufd m2, m0, q2222
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ call m(iadst_8x4_internal).end2
+ pshufd m3, m7, q3333
+ pshufd m2, m7, q2222
+ pshufd m1, m7, q1111
+ pshufd m0, m7, q0000
+ lea dstq, [dstq+strideq*2]
+ TAIL_CALL m(iadst_8x4_internal).end3
+%elif %3 >= 0
+%ifidn %1, dct
+ pshuflw m0, [coeffq], q0000
+ punpcklwd m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mova m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m2
+ psrlw m2, 3
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+.end:
+ mov r2d, 2
+.end2:
+ lea r3, [strideq*3]
+.loop:
+ WRITE_8X4 0, 0, 0, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*2]
+ dec r2d
+ jg .loop
+ RET
+%else ; identity
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+ punpcklwd m0, [coeffq+16*4]
+ punpcklwd m1, [coeffq+16*5]
+ punpcklwd m2, [coeffq+16*6]
+ punpcklwd m3, [coeffq+16*7]
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ punpcklwd m0, m1
+ pmulhrsw m0, [o(pw_2896x8)]
+ pmulhrsw m0, [o(pw_2048)]
+ pxor m4, m4
+ REPX {mova [coeffq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
+ jmp m(inv_txfm_add_dct_dct_8x8).end
+%endif
+%endif
+%endmacro
+
+%macro ITX_8X8_LOAD_COEFS 0
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+ mova m4, [coeffq+16*4]
+ mova m5, [coeffq+16*5]
+ mova m6, [coeffq+16*6]
+%endmacro
+
+%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a
+ ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276 ;t5a, t6a
+ psubsw m%5, m%1, m%3 ;t5a
+ paddsw m%1, m%3 ;t4
+ psubsw m%6, m%4, m%2 ;t6a
+ paddsw m%4, m%2 ;t7
+ mova m%3, [o(pw_2896x8)]
+ psubw m%2, m%6, m%5 ;t6a - t5a
+ paddw m%6, m%5 ;t6a + t5a
+ pmulhrsw m%2, m%3 ;t5
+ pmulhrsw m%3, m%6 ;t6
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct, 0
+INV_TXFM_8X8_FN dct, identity, 7
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+ ITX_8X8_LOAD_COEFS
+ call .main
+
+.pass1_end:
+ mova m7, [o(pw_16384)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [coeffq+16*6], m6
+
+.pass1_end2:
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [coeffq+16*7]
+
+.pass1_end3:
+ punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53
+ punpckhwd m1, m5 ;14 54 15 55 16 56 17 57
+ punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47
+ punpcklwd m0, m4 ;00 40 01 41 02 42 03 43
+ punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77
+ punpcklwd m3, m7 ;30 70 31 71 32 72 33 73
+ punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77
+ punpcklwd m1, m4 ;14 34 54 74 15 35 55 75
+ punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73
+ punpcklwd m6, m3 ;10 30 50 70 11 31 51 71
+ mova [coeffq+16*5], m6
+ mova m6, [coeffq+16*6]
+ punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67
+ punpcklwd m2, m6 ;20 60 21 61 22 62 23 63
+ punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67
+ punpcklwd m5, m3 ;04 24 44 64 05 25 45 65
+ punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63
+ punpcklwd m0, m2 ;00 20 40 60 01 21 41 61
+
+ punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77
+ punpcklwd m6, m7 ;06 16 26 36 46 56 66 76
+ mova [coeffq+16*7], m2
+ punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72
+ punpckhwd m3, m4 ;03 13 23 33 43 53 63 73
+ punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74
+ punpckhwd m5, m1 ;05 15 25 35 45 55 65 75
+ mova m7, [coeffq+16*5]
+ punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71
+ punpcklwd m0, m7 ;00 10 20 30 40 50 60 70
+ jmp tx2q
+
+.pass2:
+ call .main
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [coeffq+16*6], m6
+
+.end2:
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [coeffq+16*7]
+ mova [coeffq+16*5], m5
+ mova [coeffq+16*7], m7
+
+.end3:
+ WRITE_8X4 0, 1, 2, 3, 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X4 4, [coeffq+16*5], [coeffq+16*6], [coeffq+16*7], 5, 6, 7
+
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ ret
+
+ALIGN function_align
+.main:
+ mova [coeffq+16*6], m3
+ mova [coeffq+16*5], m1
+ mova m7, [o(pd_2048)]
+ IDCT4_1D 0, 2, 4, 6, 1, 3, 7
+ mova m3, [coeffq+16*5]
+ mova [coeffq+16*5], m2
+ mova m2, [coeffq+16*6]
+ mova [coeffq+16*6], m4
+ mova m4, [coeffq+16*7]
+ mova [coeffq+16*7], m6
+ IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7
+ mova m6, [coeffq+16*7]
+ psubsw m7, m0, m4 ;out7
+ paddsw m0, m4 ;out0
+ mova [coeffq+16*7], m7
+ mova m1, [coeffq+16*5]
+ psubsw m4, m6, m3 ;out4
+ paddsw m3, m6 ;out3
+ mova m7, [coeffq+16*6]
+ psubsw m6, m1, m5 ;out6
+ paddsw m1, m5 ;out1
+ psubsw m5, m7, m2 ;out5
+ paddsw m2, m7 ;out2
+ ret
+
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+ ITX_8X8_LOAD_COEFS
+ call .main
+ mova m7, [o(pw_16384)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [coeffq+16*6], m6
+ pxor m6, m6
+ psubw m6, m7
+ mova m7, m6
+ jmp m(idct_8x8_internal).pass1_end2
+
+ALIGN function_align
+.pass2:
+ call .main
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [coeffq+16*6], m6
+ pxor m6, m6
+ psubw m6, m7
+ mova m7, m6
+ jmp m(idct_8x8_internal).end2
+
+ALIGN function_align
+.main:
+ mova [coeffq+16*6], m3
+ mova [coeffq+16*5], m4
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a
+ ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a
+ paddsw m3, m2, m6 ;t2
+ psubsw m2, m6 ;t6
+ paddsw m4, m5, m1 ;t3
+ psubsw m5, m1 ;t7
+ ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a
+
+ mova m6, [coeffq+16*5]
+ mova [coeffq+16*5], m5
+ mova m1, [coeffq+16*6]
+ mova [coeffq+16*6], m2
+ mova m5, [coeffq+16*7]
+ mova [coeffq+16*7], m3
+ ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a
+ ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a
+ psubsw m2, m0, m6 ;t4
+ paddsw m0, m6 ;t0
+ paddsw m3, m5, m1 ;t1
+ psubsw m5, m1 ;t5
+ ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a
+
+ mova m7, [coeffq+16*7]
+ paddsw m1, m3, m4 ;-out7
+ psubsw m3, m4 ;t3
+ mova [coeffq+16*7], m1
+ psubsw m4, m0, m7 ;t2
+ paddsw m0, m7 ;out0
+ mova m6, [coeffq+16*5]
+ mova m7, [coeffq+16*6]
+ paddsw m1, m5, m6 ;-out1
+ psubsw m5, m6 ;t6
+ paddsw m6, m2, m7 ;out6
+ psubsw m2, m7 ;t7
+ paddw m7, m4, m3 ;t2 + t3
+ psubw m4, m3 ;t2 - t3
+ paddw m3, m5, m2 ;t6 + t7
+ psubw m5, m2 ;t6 - t7
+ mova m2, [o(pw_2896x8)]
+ pmulhrsw m4, m2 ;out4
+ pmulhrsw m5, m2 ;-out5
+ pmulhrsw m7, m2 ;-out3
+ pmulhrsw m2, m3 ;out2
+ mova m3, m7
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+ ITX_8X8_LOAD_COEFS
+ call m(iadst_8x8_internal).main
+ mova m7, [o(pw_m16384)]
+ pmulhrsw m1, m7
+ mova [coeffq+16*6], m1
+ mova m1, m6
+ mova m6, m2
+ pmulhrsw m2, m5, m7
+ mova m5, m6
+ mova m6, m4
+ pmulhrsw m4, m3, m7
+ mova m3, m6
+ mova m6, m0
+ mova m0, m7
+ pxor m7, m7
+ psubw m7, m0
+ pmulhrsw m0, [coeffq+16*7]
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, m6
+ jmp m(idct_8x8_internal).pass1_end3
+
+ALIGN function_align
+.pass2:
+ call m(iadst_8x8_internal).main
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [coeffq+16*5], m2
+ mova m2, m0
+ pxor m0, m0
+ psubw m0, m7
+ mova m7, m2
+ pmulhrsw m1, m0
+ pmulhrsw m2, m5, m0
+ mova [coeffq+16*6], m1
+ mova m5, m4
+ mova m1, m6
+ pmulhrsw m4, m3, m0
+ pmulhrsw m0, [coeffq+16*7]
+ mova m3, m5
+ mova [coeffq+16*7], m7
+ jmp m(idct_8x8_internal).end3
+
+INV_TXFM_8X8_FN identity, dct, 7
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+ mova m4, [coeffq+16*4]
+ mova m5, [coeffq+16*5]
+ mova m7, [coeffq+16*7]
+ jmp m(idct_8x8_internal).pass1_end3
+
+ALIGN function_align
+.pass2:
+ mova m7, [o(pw_4096)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [coeffq+16*7]
+ mova [coeffq+16*5], m5
+ mova [coeffq+16*6], m6
+ mova [coeffq+16*7], m7
+ jmp m(idct_8x8_internal).end3