ref: a75ee78bd998f393b9deada2859cf5f8ebbafe75
parent: ea74e3d513206fcdda4316f3f1303df47b890d48
author: Victorien Le Couviour--Tuffet <victorien@videolan.org>
date: Mon Jun 15 09:46:55 EDT 2020
x86: Add put/prep_bilin_scaled AVX2 asm. Bilin scaled being very rarely used, add a new table entry to mc_subpel_filters, and jump to the put/prep_8tap_scaled code. AVX2 performance is obviously the same as the 8tap code; the speed-up is much smaller though, as the C code is a true bilinear codepath, auto-vectorized. Yet, the AVX2 performance is always better.
--- a/src/tables.c
+++ b/src/tables.c
@@ -442,7 +442,7 @@
0
};
-const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
+const int8_t ALIGN(dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8], 8) = {
[DAV1D_FILTER_8TAP_REGULAR] = {
{ 0, 1, -3, 63, 4, -1, 0, 0 },
{ 0, 1, -5, 61, 9, -2, 0, 0 },
@@ -524,6 +524,27 @@
{ 0, 0, 2, 20, 31, 11, 0, 0 },
{ 0, 0, 2, 18, 31, 13, 0, 0 },
{ 0, 0, 1, 17, 31, 15, 0, 0 }
+#if ARCH_X86_64
+ /* Bilin scaled being very rarely used, add a new table entry
+ * and use the put/prep_8tap_scaled code, thus acting as a
+ * scaled bilinear filter. */
+ }, [5] = {
+ { 0, 0, 0, 60, 4, 0, 0, 0 },
+ { 0, 0, 0, 56, 8, 0, 0, 0 },
+ { 0, 0, 0, 52, 12, 0, 0, 0 },
+ { 0, 0, 0, 48, 16, 0, 0, 0 },
+ { 0, 0, 0, 44, 20, 0, 0, 0 },
+ { 0, 0, 0, 40, 24, 0, 0, 0 },
+ { 0, 0, 0, 36, 28, 0, 0, 0 },
+ { 0, 0, 0, 32, 32, 0, 0, 0 },
+ { 0, 0, 0, 28, 36, 0, 0, 0 },
+ { 0, 0, 0, 24, 40, 0, 0, 0 },
+ { 0, 0, 0, 20, 44, 0, 0, 0 },
+ { 0, 0, 0, 16, 48, 0, 0, 0 },
+ { 0, 0, 0, 12, 52, 0, 0, 0 },
+ { 0, 0, 0, 8, 56, 0, 0, 0 },
+ { 0, 0, 0, 4, 60, 0, 0, 0 }
+#endif
}
};
--- a/src/tables.h
+++ b/src/tables.h
@@ -110,7 +110,7 @@
extern const int16_t dav1d_sgr_params[16][4];
extern const uint8_t dav1d_sgr_x_by_x[256];
-extern const int8_t dav1d_mc_subpel_filters[5][15][8];
+extern const int8_t dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8];
extern const int8_t dav1d_mc_warp_filter[193][8];
extern const int8_t dav1d_resize_filter[64][8];
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -5719,12 +5719,21 @@
%undef isprep
%endmacro
+%macro BILIN_SCALED_FN 1 ; %1 = function name prefix; emits the %1_bilin_scaled entry point (instantiated with put and prep)
+cglobal %1_bilin_scaled
+ mov t0d, (5*15 << 16) | 5*15 ; 5*15 packed into both 16-bit halves; presumably the row offset of mc_subpel_filters entry [5] (15 rows per entry) - TODO confirm against the 8tap_scaled FN macro
+ mov t1d, (5*15 << 16) | 5*15 ; same packed value for the second temp register, so both registers select the same table entry
+ jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX) ; no bilinear-specific code: jump straight into the shared %1_8tap_scaled body
+%endmacro
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+
%if WIN64
DECLARE_REG_TMP 6, 5
%else
DECLARE_REG_TMP 6, 8
%endif
-%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+BILIN_SCALED_FN put
PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
@@ -5741,7 +5750,7 @@
%else
DECLARE_REG_TMP 6, 7
%endif
-%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+BILIN_SCALED_FN prep
PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -99,6 +99,7 @@
decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2);
decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2);
decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_bilin_scaled_avx2);
decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2);
decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2);
@@ -109,6 +110,7 @@
decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2);
decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2);
decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_bilin_scaled_avx2);
decl_avg_fn(dav1d_avg_avx512icl);
decl_avg_fn(dav1d_avg_avx2);
@@ -264,6 +266,7 @@
init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
@@ -274,6 +277,7 @@
init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
c->avg = dav1d_avg_avx2;
c->w_avg = dav1d_w_avg_avx2;