ref: 5debc462b0282ee63f52a5e745dfdb7f1d29a202
parent: cbec1241fe6f6d43710348a5c04aad1b81059f19
author: Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
date: Fri Mar 8 17:20:05 EST 2019
x86: optimize SSSE3 sgr_calc_ab{1,2}. This optimization is so small that 10 runs with a fixed seed were needed to get meaningful numbers. This has been done for the 3x3 case only. Before: mean=113265.42, stddev=954.392. After: mean=112654.71, stddev=884.833.
--- a/src/x86/looprestoration_ssse3.asm
+++ b/src/x86/looprestoration_ssse3.asm
@@ -35,6 +35,7 @@
db 1, 2
pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
+pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 ; pshufb mask: copies the low word of each dword into both words of that dword
pb_0: times 16 db 0
pb_2: times 16 db 2
pb_3: times 16 db 3
@@ -509,17 +510,11 @@
;; self-guided ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;
-%macro MULLD 2-3 1 ; %3 = is_constant
- pmuludq m5, %1, %2
- psrlq %1, 32
- %if %3 == 0
- pshufd m3, %2, q2301
- pmuludq %1, m3
- %else
- pmuludq %1, %2
- %endif
- shufps %1, m5, q2020
- pshufd %1, %1, q1302
+%macro MULLD 2 ; %1 = dword lanes (in/out), %2 = multiplier; yields the low 32 bits of %1 * %2 per dword provided each dword of %2 holds the same 16-bit factor in both of its words; clobbers m5
+ pmulhuw m5, %1, %2 ; m5 = high 16 bits of each unsigned word-by-word product
+ pmullw %1, %2 ; %1 = low 16 bits of each word-by-word product
+ pslld m5, 16 ; keep only the low word's high half, moved into the upper word of each dword
+ paddd %1, m5 ; add that carry into the upper word to complete the 32-bit product
%endmacro
%macro GATHERDD 2
@@ -777,7 +772,8 @@
SETUP_PIC r5, 0
%endif
movd m6, sd
- pshufd m6, m6, 0
+ pshuflw m6, m6, q0000
+ punpcklqdq m6, m6
pxor m7, m7
DEFINE_ARGS a, b, w, h, x
%if ARCH_X86_64
@@ -784,10 +780,12 @@
mova m8, [pd_0xF00801C7]
mova m9, [pw_256]
psrld m10, m9, 13 ; pd_2048
+ mova m11, [pb_unpcklwdw]
%else
%define m8 [PIC_sym(pd_0xF00801C7)]
%define m9 [PIC_sym(pw_256)]
%define m10 [PIC_sym(pd_2048)]
+ %define m11 [PIC_sym(pb_unpcklwdw)]
%endif
.loop_y:
mov xq, -2
@@ -818,10 +816,12 @@
GATHERDD m2, m3
psrld m4, 24
psrld m2, 24
- MULLD m0, m4, 0
- MULLD m1, m2, 0
- packssdw m4, m2
- psubw m5, m9, m4
+ packssdw m3, m4, m2
+ pshufb m4, m11
+ MULLD m0, m4
+ pshufb m2, m11
+ MULLD m1, m2
+ psubw m5, m9, m3
paddd m0, m10
paddd m1, m10
psrld m0, 12
@@ -1516,7 +1516,8 @@
SETUP_PIC r5, 0
%endif
movd m6, sd
- pshufd m6, m6, 0
+ pshuflw m6, m6, q0000
+ punpcklqdq m6, m6
pxor m7, m7
DEFINE_ARGS a, b, w, h, x
%if ARCH_X86_64