ref: 8c5d34c85613aa8c6ba2f6da4cf8ba6b6e41ef8f
parent: 1703f21fb708f3e75ec8889c2a7592652d1ecfbc
author: Henrik Gramner <gramner@twoorioles.com>
date: Fri Dec 21 21:45:00 EST 2018
Add tail call optimizations in SSSE3 itx
--- a/src/ext/x86/x86inc.asm
+++ b/src/ext/x86/x86inc.asm
@@ -682,7 +682,7 @@
BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
-%macro TAIL_CALL 2 ; callee, is_nonadjacent
+%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
%if has_epilogue
call %1
RET
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -198,9 +198,11 @@
%macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack
cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
%undef cmp
+ %define %%p1 m(i%1_%4_internal)
%if ARCH_X86_32
LEA r5, $$
%endif
+%if has_epilogue
%if %3 > 0
cmp eobd, %3
jle %%end
@@ -209,10 +211,23 @@
jz %%end
%endif
lea tx2q, [o(m(i%2_%4_internal).pass2)]
- call m(i%1_%4_internal)
+ call %%p1
RET
+%%end:
+%else
+ lea tx2q, [o(m(i%2_%4_internal).pass2)]
+%if %3 > 0
+ cmp eobd, %3
+ jg %%p1
+%elif %3 == 0
+ test eobd, eobd
+ jnz %%p1
+%else
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
+%endif
+%endif
%endmacro
%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
@@ -225,8 +240,7 @@
punpcklwd m0, m0
punpckhdq m1, m0, m0
punpckldq m0, m0
- call m(iadst_4x4_internal).end
- RET
+ TAIL_CALL m(iadst_4x4_internal).end
%elifidn %1_%2, identity_dct
mova m1, [coeffq+16*0]
mova m2, [coeffq+16*1]
@@ -238,8 +252,7 @@
pmulhrsw m0, [o(pw_5793x4)]
pmulhrsw m0, [o(pw_2896x8)]
mova m1, m0
- call m(iadst_4x4_internal).end
- RET
+ TAIL_CALL m(iadst_4x4_internal).end
%elif %3 >= 0
pshuflw m0, [coeffq], q0000
punpcklqdq m0, m0
@@ -259,13 +272,11 @@
pmulhrsw m0, m1
%endif
mova m1, m0
- call m(iadst_4x4_internal).end2
- RET
+ TAIL_CALL m(iadst_4x4_internal).end2
%else ; adst / flipadst
pmulhrsw m1, m0, [o(iadst4_dconly2b)]
pmulhrsw m0, [o(iadst4_dconly2a)]
- call m(i%2_4x4_internal).end2
- RET
+ TAIL_CALL m(i%2_4x4_internal).end2
%endif
%endif
%endmacro
@@ -624,8 +635,7 @@
punpckldq m0, m0
punpckhdq m3, m2, m2
punpckldq m2, m2
- call m(iadst_4x8_internal).end3
- RET
+ TAIL_CALL m(iadst_4x8_internal).end3
%elifidn %1_%2, identity_dct
movd m0, [coeffq+16*0]
punpcklwd m0, [coeffq+16*1]
@@ -642,8 +652,7 @@
mova m1, m0
mova m2, m0
mova m3, m0
- call m(iadst_4x8_internal).end3
- RET
+ TAIL_CALL m(iadst_4x8_internal).end3
%elifidn %1_%2, dct_dct
pshuflw m0, [coeffq], q0000
punpcklqdq m0, m0
@@ -656,8 +665,7 @@
mova m1, m0
mova m2, m0
mova m3, m0
- call m(iadst_4x8_internal).end4
- RET
+ TAIL_CALL m(iadst_4x8_internal).end4
%else ; adst_dct / flipadst_dct
pshuflw m0, [coeffq], q0000
punpcklqdq m0, m0
@@ -674,8 +682,7 @@
mova m1, m0
mova m2, m0
mova m3, m0
- call m(iadst_4x8_internal).end4
- RET
+ TAIL_CALL m(iadst_4x8_internal).end4
%endif
%endif
%endmacro
@@ -923,8 +930,7 @@
%endif
%endif
%endif
- call m(iadst_8x4_internal).end2
- RET
+ TAIL_CALL m(iadst_8x4_internal).end2
%endif
%endmacro