ref: 9c29f229c5aa7d2d9564d44e8932011f23ac4e77
parent: 361a3c8ee2d03f87f42a76213ee0f93e49fa9ec3
author: Henrik Gramner <gramner@twoorioles.com>
date: Fri Jan 24 15:34:18 EST 2020
checkasm: Increase buffer alignment to 64-byte on x86-64. Required for AVX-512.
--- a/include/common/attributes.h
+++ b/include/common/attributes.h
@@ -43,15 +43,18 @@
#endif
#if ARCH_X86_64
-/* x86-64 needs 32-byte alignment for AVX2. */
+/* x86-64 needs 32- and 64-byte alignment for AVX2 and AVX-512, respectively. */
+#define ALIGN_64_VAL 64
#define ALIGN_32_VAL 32
#define ALIGN_16_VAL 16
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
/* ARM doesn't benefit from anything more than 16-byte alignment. */
+#define ALIGN_64_VAL 16
#define ALIGN_32_VAL 16
#define ALIGN_16_VAL 16
#else
/* No need for extra alignment on platforms without assembly. */
+#define ALIGN_64_VAL 8
#define ALIGN_32_VAL 8
#define ALIGN_16_VAL 8
#endif
@@ -76,9 +79,10 @@
* becomes:
* ALIGN_STK_$align(uint8_t, var, 1, [2][3][4])
*/
+#define ALIGN_STK_64(type, var, sz1d, sznd) \
+ ALIGN(type var[sz1d]sznd, ALIGN_64_VAL)
#define ALIGN_STK_32(type, var, sz1d, sznd) \
ALIGN(type var[sz1d]sznd, ALIGN_32_VAL)
-// as long as stack is itself 16-byte aligned, this works (win64, gcc)
#define ALIGN_STK_16(type, var, sz1d, sznd) \
ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
--- a/tests/checkasm/cdef.c
+++ b/tests/checkasm/cdef.c
@@ -45,9 +45,9 @@
}
static void check_cdef_filter(const cdef_fn fn, const int w, const int h) {
- ALIGN_STK_32(pixel, c_src, 10 * 16 + 8, ), *const c_dst = c_src + 8;
- ALIGN_STK_32(pixel, a_src, 10 * 16 + 8, ), *const a_dst = a_src + 8;
- ALIGN_STK_32(pixel, top, 16 * 2 + 8, );
+ ALIGN_STK_64(pixel, c_src, 10 * 16 + 8, ), *const c_dst = c_src + 8;
+ ALIGN_STK_64(pixel, a_src, 10 * 16 + 8, ), *const a_dst = a_src + 8;
+ ALIGN_STK_64(pixel, top, 16 * 2 + 8, );
pixel left[8][2];
pixel *const top_ptrs[2] = { top + 8, top + 24 };
const ptrdiff_t stride = 16 * sizeof(pixel);
@@ -103,7 +103,7 @@
}
static void check_cdef_direction(const cdef_dir_fn fn) {
- ALIGN_STK_32(pixel, src, 8 * 8,);
+ ALIGN_STK_64(pixel, src, 8 * 8,);
declare_func(int, pixel *src, ptrdiff_t dst_stride, unsigned *var
HIGHBD_DECL_SUFFIX);
--- a/tests/checkasm/filmgrain.c
+++ b/tests/checkasm/filmgrain.c
@@ -137,9 +137,9 @@
}
static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
- ALIGN_STK_32(pixel, c_dst, 128 * 32,);
- ALIGN_STK_32(pixel, a_dst, 128 * 32,);
- ALIGN_STK_32(pixel, src, 128 * 32,);
+ ALIGN_STK_64(pixel, c_dst, 128 * 32,);
+ ALIGN_STK_64(pixel, a_dst, 128 * 32,);
+ ALIGN_STK_64(pixel, src, 128 * 32,);
const ptrdiff_t stride = 128 * sizeof(pixel);
declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
@@ -207,10 +207,10 @@
}
static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
- ALIGN_STK_32(pixel, c_dst, 128 * 32,);
- ALIGN_STK_32(pixel, a_dst, 128 * 32,);
- ALIGN_STK_32(pixel, src, 128 * 32,);
- ALIGN_STK_32(pixel, luma_src, 128 * 32,);
+ ALIGN_STK_64(pixel, c_dst, 128 * 32,);
+ ALIGN_STK_64(pixel, a_dst, 128 * 32,);
+ ALIGN_STK_64(pixel, src, 128 * 32,);
+ ALIGN_STK_64(pixel, luma_src, 128 * 32,);
const ptrdiff_t lstride = 128 * sizeof(pixel);
declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
--- a/tests/checkasm/ipred.c
+++ b/tests/checkasm/ipred.c
@@ -66,9 +66,9 @@
};
static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
- ALIGN_STK_32(pixel, c_dst, 64 * 64,);
- ALIGN_STK_32(pixel, a_dst, 64 * 64,);
- ALIGN_STK_32(pixel, topleft_buf, 257,);
+ ALIGN_STK_64(pixel, c_dst, 64 * 64,);
+ ALIGN_STK_64(pixel, a_dst, 64 * 64,);
+ ALIGN_STK_64(pixel, topleft_buf, 257,);
pixel *const topleft = topleft_buf + 128;
declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,
@@ -132,9 +132,9 @@
}
static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
- ALIGN_STK_32(int16_t, c_dst, 32 * 32,);
- ALIGN_STK_32(int16_t, a_dst, 32 * 32,);
- ALIGN_STK_32(pixel, luma, 32 * 32,);
+ ALIGN_STK_64(int16_t, c_dst, 32 * 32,);
+ ALIGN_STK_64(int16_t, a_dst, 32 * 32,);
+ ALIGN_STK_64(pixel, luma, 32 * 32,);
declare_func(void, int16_t *ac, const pixel *y, ptrdiff_t stride,
int w_pad, int h_pad, int cw, int ch);
@@ -175,10 +175,10 @@
}
static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
- ALIGN_STK_32(pixel, c_dst, 32 * 32,);
- ALIGN_STK_32(pixel, a_dst, 32 * 32,);
- ALIGN_STK_32(int16_t, ac, 32 * 32,);
- ALIGN_STK_32(pixel, topleft_buf, 257,);
+ ALIGN_STK_64(pixel, c_dst, 32 * 32,);
+ ALIGN_STK_64(pixel, a_dst, 32 * 32,);
+ ALIGN_STK_64(int16_t, ac, 32 * 32,);
+ ALIGN_STK_64(pixel, topleft_buf, 257,);
pixel *const topleft = topleft_buf + 128;
declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,
@@ -227,9 +227,9 @@
}
static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
- ALIGN_STK_32(pixel, c_dst, 64 * 64,);
- ALIGN_STK_32(pixel, a_dst, 64 * 64,);
- ALIGN_STK_32(uint8_t, idx, 64 * 64,);
+ ALIGN_STK_64(pixel, c_dst, 64 * 64,);
+ ALIGN_STK_64(pixel, a_dst, 64 * 64,);
+ ALIGN_STK_64(uint8_t, idx, 64 * 64,);
ALIGN_STK_16(uint16_t, pal, 8,);
declare_func(void, pixel *dst, ptrdiff_t stride, const uint16_t *pal,
--- a/tests/checkasm/itx.c
+++ b/tests/checkasm/itx.c
@@ -226,9 +226,9 @@
Dav1dInvTxfmDSPContext c;
bitfn(dav1d_itx_dsp_init)(&c);
- ALIGN_STK_32(coef, coeff, 2, [32 * 32]);
- ALIGN_STK_32(pixel, c_dst, 64 * 64,);
- ALIGN_STK_32(pixel, a_dst, 64 * 64,);
+ ALIGN_STK_64(coef, coeff, 2, [32 * 32]);
+ ALIGN_STK_64(pixel, c_dst, 64 * 64,);
+ ALIGN_STK_64(pixel, a_dst, 64 * 64,);
static const uint8_t txfm_size_order[N_RECT_TX_SIZES] = {
TX_4X4, RTX_4X8, RTX_4X16,
--- a/tests/checkasm/loopfilter.c
+++ b/tests/checkasm/loopfilter.c
@@ -95,8 +95,8 @@
const int n_blks, const int lf_idx,
const int is_chroma, const int dir)
{
- ALIGN_STK_32(pixel, c_dst_mem, 128 * 16,);
- ALIGN_STK_32(pixel, a_dst_mem, 128 * 16,);
+ ALIGN_STK_64(pixel, c_dst_mem, 128 * 16,);
+ ALIGN_STK_64(pixel, a_dst_mem, 128 * 16,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const uint32_t *mask,
const uint8_t (*l)[4], ptrdiff_t b4_stride,
--- a/tests/checkasm/looprestoration.c
+++ b/tests/checkasm/looprestoration.c
@@ -44,9 +44,9 @@
}
static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {
- ALIGN_STK_32(pixel, c_dst, 448 * 64,);
- ALIGN_STK_32(pixel, a_dst, 448 * 64,);
- ALIGN_STK_32(pixel, h_edge, 448 * 8,);
+ ALIGN_STK_64(pixel, c_dst, 448 * 64,);
+ ALIGN_STK_64(pixel, a_dst, 448 * 64,);
+ ALIGN_STK_64(pixel, h_edge, 448 * 8,);
pixel left[64][4];
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
@@ -116,9 +116,9 @@
}
static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {
- ALIGN_STK_32(pixel, c_dst, 448 * 64,);
- ALIGN_STK_32(pixel, a_dst, 448 * 64,);
- ALIGN_STK_32(pixel, h_edge, 448 * 8,);
+ ALIGN_STK_64(pixel, c_dst, 448 * 64,);
+ ALIGN_STK_64(pixel, a_dst, 448 * 64,);
+ ALIGN_STK_64(pixel, h_edge, 448 * 8,);
pixel left[64][4];
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
--- a/tests/checkasm/mc.c
+++ b/tests/checkasm/mc.c
@@ -55,9 +55,9 @@
}
static void check_mc(Dav1dMCDSPContext *const c) {
- ALIGN_STK_32(pixel, src_buf, 135 * 135,);
- ALIGN_STK_32(pixel, c_dst, 128 * 128,);
- ALIGN_STK_32(pixel, a_dst, 128 * 128,);
+ ALIGN_STK_64(pixel, src_buf, 135 * 135,);
+ ALIGN_STK_64(pixel, c_dst, 128 * 128,);
+ ALIGN_STK_64(pixel, a_dst, 128 * 128,);
const pixel *src = src_buf + 135 * 3 + 3;
const ptrdiff_t src_stride = 135 * sizeof(pixel);
@@ -118,9 +118,9 @@
}
static void check_mct(Dav1dMCDSPContext *const c) {
- ALIGN_STK_32(pixel, src_buf, 135 * 135,);
- ALIGN_STK_32(int16_t, c_tmp, 128 * 128,);
- ALIGN_STK_32(int16_t, a_tmp, 128 * 128,);
+ ALIGN_STK_64(pixel, src_buf, 135 * 135,);
+ ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
+ ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
const pixel *src = src_buf + 135 * 3 + 3;
const ptrdiff_t src_stride = 135 * sizeof(pixel);
@@ -173,9 +173,9 @@
}
static void check_avg(Dav1dMCDSPContext *const c) {
- ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
- ALIGN_STK_32(pixel, c_dst, 135 * 135,);
- ALIGN_STK_32(pixel, a_dst, 128 * 128,);
+ ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+ ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+ ALIGN_STK_64(pixel, a_dst, 128 * 128,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
const int16_t *tmp2, int w, int h HIGHBD_DECL_SUFFIX);
@@ -204,9 +204,9 @@
}
static void check_w_avg(Dav1dMCDSPContext *const c) {
- ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
- ALIGN_STK_32(pixel, c_dst, 135 * 135,);
- ALIGN_STK_32(pixel, a_dst, 128 * 128,);
+ ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+ ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+ ALIGN_STK_64(pixel, a_dst, 128 * 128,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX);
@@ -236,10 +236,10 @@
}
static void check_mask(Dav1dMCDSPContext *const c) {
- ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
- ALIGN_STK_32(pixel, c_dst, 135 * 135,);
- ALIGN_STK_32(pixel, a_dst, 128 * 128,);
- ALIGN_STK_32(uint8_t, mask, 128 * 128,);
+ ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+ ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+ ALIGN_STK_64(pixel, a_dst, 128 * 128,);
+ ALIGN_STK_64(uint8_t, mask, 128 * 128,);
for (int i = 0; i < 128 * 128; i++)
mask[i] = rnd() % 65;
@@ -271,11 +271,11 @@
}
static void check_w_mask(Dav1dMCDSPContext *const c) {
- ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
- ALIGN_STK_32(pixel, c_dst, 135 * 135,);
- ALIGN_STK_32(pixel, a_dst, 128 * 128,);
- ALIGN_STK_32(uint8_t, c_mask, 128 * 128,);
- ALIGN_STK_32(uint8_t, a_mask, 128 * 128,);
+ ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+ ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+ ALIGN_STK_64(pixel, a_dst, 128 * 128,);
+ ALIGN_STK_64(uint8_t, c_mask, 128 * 128,);
+ ALIGN_STK_64(uint8_t, a_mask, 128 * 128,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
const int16_t *tmp2, int w, int h, uint8_t *mask, int sign
@@ -321,10 +321,10 @@
}
static void check_blend(Dav1dMCDSPContext *const c) {
- ALIGN_STK_32(pixel, tmp, 32 * 32,);
- ALIGN_STK_32(pixel, c_dst, 32 * 32,);
- ALIGN_STK_32(pixel, a_dst, 32 * 32,);
- ALIGN_STK_32(uint8_t, mask, 32 * 32,);
+ ALIGN_STK_64(pixel, tmp, 32 * 32,);
+ ALIGN_STK_64(pixel, c_dst, 32 * 32,);
+ ALIGN_STK_64(pixel, a_dst, 32 * 32,);
+ ALIGN_STK_64(uint8_t, mask, 32 * 32,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
int w, int h, const uint8_t *mask);
@@ -357,9 +357,9 @@
}
static void check_blend_v(Dav1dMCDSPContext *const c) {
- ALIGN_STK_32(pixel, tmp, 32 * 128,);
- ALIGN_STK_32(pixel, c_dst, 32 * 128,);
- ALIGN_STK_32(pixel, a_dst, 32 * 128,);
+ ALIGN_STK_64(pixel, tmp, 32 * 128,);
+ ALIGN_STK_64(pixel, c_dst, 32 * 128,);
+ ALIGN_STK_64(pixel, a_dst, 32 * 128,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
int w, int h);
@@ -391,9 +391,9 @@
}
static void check_blend_h(Dav1dMCDSPContext *const c) {
- ALIGN_STK_32(pixel, tmp, 128 * 32,);
- ALIGN_STK_32(pixel, c_dst, 128 * 32,);
- ALIGN_STK_32(pixel, a_dst, 128 * 32,);
+ ALIGN_STK_64(pixel, tmp, 128 * 32,);
+ ALIGN_STK_64(pixel, c_dst, 128 * 32,);
+ ALIGN_STK_64(pixel, a_dst, 128 * 32,);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
int w, int h);
@@ -424,9 +424,9 @@
}
static void check_warp8x8(Dav1dMCDSPContext *const c) {
- ALIGN_STK_32(pixel, src_buf, 15 * 15,);
- ALIGN_STK_32(pixel, c_dst, 8 * 8,);
- ALIGN_STK_32(pixel, a_dst, 8 * 8,);
+ ALIGN_STK_64(pixel, src_buf, 15 * 15,);
+ ALIGN_STK_64(pixel, c_dst, 8 * 8,);
+ ALIGN_STK_64(pixel, a_dst, 8 * 8,);
int16_t abcd[4];
const pixel *src = src_buf + 15 * 3 + 3;
const ptrdiff_t dst_stride = 8 * sizeof(pixel);
@@ -462,9 +462,9 @@
}
static void check_warp8x8t(Dav1dMCDSPContext *const c) {
- ALIGN_STK_32(pixel, src_buf, 15 * 15,);
- ALIGN_STK_32(int16_t, c_tmp, 8 * 8,);
- ALIGN_STK_32(int16_t, a_tmp, 8 * 8,);
+ ALIGN_STK_64(pixel, src_buf, 15 * 15,);
+ ALIGN_STK_64(int16_t, c_tmp, 8 * 8,);
+ ALIGN_STK_64(int16_t, a_tmp, 8 * 8,);
int16_t abcd[4];
const pixel *src = src_buf + 15 * 3 + 3;
const ptrdiff_t src_stride = 15 * sizeof(pixel);
@@ -534,9 +534,9 @@
}
static void check_emuedge(Dav1dMCDSPContext *const c) {
- ALIGN_STK_32(pixel, c_dst, 135 * 192,);
- ALIGN_STK_32(pixel, a_dst, 135 * 192,);
- ALIGN_STK_32(pixel, src, 160 * 160,);
+ ALIGN_STK_64(pixel, c_dst, 135 * 192,);
+ ALIGN_STK_64(pixel, a_dst, 135 * 192,);
+ ALIGN_STK_64(pixel, src, 160 * 160,);
for (int i = 0; i < 160 * 160; i++)
src[i] = rnd() & ((1U << BITDEPTH) - 1);