ref: 82eda83acd4b903d9a1ea03687951f3d6d8cd6b8
parent: 2e5e05b767f0c61d16a6df8ae787791555ed2c42
author: Henrik Gramner <gramner@twoorioles.com>
date: Fri Nov 22 15:50:57 EST 2019
Avoid excessive L2 collisions with certain frame widths. Memory addresses with certain power-of-two offsets will map to the same set of cache lines. Using such offsets as strides will cause excessive cache evictions, resulting in more cache misses. Avoid this by adding a small amount of padding to the stride when allocating picture buffers whenever the stride is a multiple of 1024 (a somewhat arbitrary choice, as the specific number depends on the hardware implementation).
--- a/src/picture.c
+++ b/src/picture.c
@@ -52,17 +52,24 @@
const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
- p->stride[0] = aligned_w << hbd;
- p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
- const size_t y_sz = p->stride[0] * aligned_h;
- const size_t uv_sz = p->stride[1] * (aligned_h >> ss_ver);
- const size_t pic_size = y_sz + 2 * uv_sz;
-
- uint8_t *data = dav1d_alloc_aligned(pic_size + DAV1D_PICTURE_ALIGNMENT,
- DAV1D_PICTURE_ALIGNMENT);
- if (data == NULL) {
- return DAV1D_ERR(ENOMEM);
- }
+ ptrdiff_t y_stride = aligned_w << hbd;
+ ptrdiff_t uv_stride = has_chroma ? y_stride >> ss_hor : 0;
+ /* Due to how mapping of addresses to sets works in most L1 and L2 cache
+ * implementations, strides of multiples of certain power-of-two numbers
+ * may cause multiple rows of the same superblock to map to the same set,
+ * causing evictions of previous rows resulting in a reduction in cache
+ * hit rate. Avoid that by slightly padding the stride when necessary. */
+ if (!(y_stride & 1023))
+ y_stride += DAV1D_PICTURE_ALIGNMENT;
+ if (!(uv_stride & 1023) && has_chroma)
+ uv_stride += DAV1D_PICTURE_ALIGNMENT;
+ p->stride[0] = y_stride;
+ p->stride[1] = uv_stride;
+ const size_t y_sz = y_stride * aligned_h;
+ const size_t uv_sz = uv_stride * (aligned_h >> ss_ver);
+ const size_t pic_size = y_sz + 2 * uv_sz + DAV1D_PICTURE_ALIGNMENT;
+ uint8_t *data = dav1d_alloc_aligned(pic_size, DAV1D_PICTURE_ALIGNMENT);
+ if (!data) return DAV1D_ERR(ENOMEM);
p->data[0] = data;
p->data[1] = has_chroma ? data + y_sz : NULL;