ref: 36133b04c0d3f82b16902de2ed57fe58d7c30990
parent: 5be37810d26d224f1ec75ff688d21aeadb863501
author: James Zern <jzern@google.com>
date: Mon Feb 3 11:57:58 EST 2020
loopfilter_sse2: call unsuffixed lpf functions. This allows calls to use better versions (e.g., avx2) if available. In most other cases the function pointer will be defined to the sse2 variant if another isn't available. This improves performance at 1080P by ~2% on a Xeon E5-2690. Change-Id: Ie9da3a567021f8416651a29b8c9ab9238dc4bdf1
--- a/vpx_dsp/x86/loopfilter_sse2.c
+++ b/vpx_dsp/x86/loopfilter_sse2.c
@@ -1674,8 +1674,8 @@
transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
// Loop filtering
- vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
- blimit1, limit1, thresh1);
+ vpx_lpf_horizontal_4_dual(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1);
src[0] = t_dst;
src[1] = t_dst + 8;
dst[0] = s - 4;
@@ -1700,7 +1700,7 @@
transpose(src, pitch, dst, 8, 1);
// Loop filtering
- vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
+ vpx_lpf_horizontal_8(t_dst + 4 * 8, 8, blimit, limit, thresh);
src[0] = t_dst;
dst[0] = s - 4;
@@ -1721,8 +1721,8 @@
transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
// Loop filtering
- vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
- blimit1, limit1, thresh1);
+ vpx_lpf_horizontal_8_dual(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1);
src[0] = t_dst;
src[1] = t_dst + 8;
@@ -1750,7 +1750,7 @@
transpose(src, pitch, dst, 8, 2);
// Loop filtering
- vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
+ vpx_lpf_horizontal_16(t_dst + 8 * 8, 8, blimit, limit, thresh);
src[0] = t_dst;
src[1] = t_dst + 8 * 8;
@@ -1771,7 +1771,7 @@
transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
// Loop filtering
- vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
+ vpx_lpf_horizontal_16_dual(t_dst + 8 * 16, 16, blimit, limit, thresh);
// Transpose back
transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);