ref: eb01bdb9763f3c1990d748682cc5b853fd05ca69
parent: b53a99b97f93d0eb15d1f532739ca062fe44b4ca
author: Henrik Gramner <gramner@twoorioles.com>
date: Sat Dec 15 14:01:52 EST 2018
Improve the fast path of IDCT 8x32 AVX2
--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -113,6 +113,15 @@
COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842
COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301
+pw_201_4091x8: dw 201*8, 4091*8 ; coefficient pairs stored pre-multiplied by 8, so pmulhrsw yields (x*coef + 2048) >> 12
+pw_m601_4052x8: dw -601*8, 4052*8 ; an "m" in the label marks a negative first coefficient
+pw_995_3973x8: dw 995*8, 3973*8 ; each entry is one dword (two words), loaded with vpbroadcastd
+pw_m1380_3857x8: dw -1380*8, 3857*8
+pw_1751_3703x8: dw 1751*8, 3703*8
+pw_m2106_3513x8: dw -2106*8, 3513*8
+pw_2440_3290x8: dw 2440*8, 3290*8
+pw_m2751_3035x8: dw -2751*8, 3035*8 ; ITX_UNPACK_MULHRSW addresses all entries relative to pw_201_4091x8
+
%define o_idct64_offset idct64_mul - (o_base) - 8
SECTION .text
@@ -215,12 +224,6 @@
packssdw m%2, m%3
%endmacro
-%macro ITX_MULHRSW_SHL3 4 ; dst/src, tmp, coef[1-2]
- vpbroadcastd m%2, [pw_%3_%4]
- psllw m%2, 3
- pmulhrsw m%1, m%2
-%endmacro
-
%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784 ; t2, t3
vpbroadcastd m%6, [o(pw_2896x8)]
@@ -3277,6 +3280,15 @@
%endif
%endmacro
+%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]; clobbers tmp; expects r4 = address of pw_201_4091x8
+ vpbroadcastd m%3, [r4-pw_201_4091x8+pw_%4_%5x8] ; tmp = word pair (coef1*8, coef2*8) repeated in every dword
+ punpcklwd m%1, m%2, m%2 ; dst1 = low four words of each 128-bit lane of src, each word duplicated into a pair
+ pmulhrsw m%1, m%3 ; each pair (x, x) becomes ((x*coef1 + 2048) >> 12, (x*coef2 + 2048) >> 12)
+ vpbroadcastd m%3, [r4-pw_201_4091x8+pw_%6_%7x8] ; tmp = word pair (coef3*8, coef4*8) repeated in every dword
+ punpckhwd m%2, m%2 ; src is overwritten with its own high words, each duplicated into a pair
+ pmulhrsw m%2, m%3 ; each pair (x, x) becomes ((x*coef3 + 2048) >> 12, (x*coef4 + 2048) >> 12)
+%endmacro
+
cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob
lea rax, [o_base]
test eobd, eobd
@@ -3431,22 +3443,11 @@
mova [rsp+gprsize+1*32], m1
mova m0, [rsp+gprsize+2*32]
mova [rsp+gprsize+2*32], m6
- punpcklwd m1, m8, m8
- punpckhwd m8, m8
- punpcklwd m15, m9, m9
- punpckhwd m9, m9
- punpcklwd m14, m0, m0
- punpckhwd m0, m0
- punpcklwd m13, m11, m11
- punpckhwd m11, m11
- ITX_MULHRSW_SHL3 1, 6, 201, 4091 ; t16a, t31a
- ITX_MULHRSW_SHL3 8, 6, m601, 4052 ; t23a, t24a
- ITX_MULHRSW_SHL3 15, 6, 995, 3973 ; t20a, t27a
- ITX_MULHRSW_SHL3 9, 6, m1380, 3857 ; t19a, t28a
- ITX_MULHRSW_SHL3 14, 6, 1751, 3703 ; t18a, t29a
- ITX_MULHRSW_SHL3 0, 6, m2106, 3513 ; t21a, t26a
- ITX_MULHRSW_SHL3 13, 6, 2440, 3290 ; t22a, t25a
- ITX_MULHRSW_SHL3 11, 6, m2751, 3035 ; t17a, t30a
+ lea r4, [rax-(o_base)+pw_201_4091x8]
+ ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
+ ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
jmp .main2
ALIGN function_align
.main: