ref: 3e69410e29b0cc4e8a6e9712d7c980d702597d62
parent: d503125101116d2b399287824d7902b6351b691d
author: Timothy B. Terriberry <tterribe@xiph.org>
date: Fri Feb 9 11:42:15 EST 2024
Fix OOB read in fixed-point NEON intrinsics.

xcorr_kernel_neon_fixed() read one more sample from y[] in the main loop
than it needed to allow use of vector loads, but unlike the native asm in
celt_pitch_xcorr_arm.s, the loop condition did not exit early enough to
prevent this from overrunning the end of the array.

Additionally, the tail loop _always_ read one value beyond what it needed.

This patch fixes the loop condition on the main loop. Since this makes the
tail section run even for lengths that are a multiple of 8 (e.g., on fully
half the multiplies for usages like celt_fir() or celt_iir() with an order
of 16, which is common), rather than try to fix the tail loop, we replace
it with a non-looping adaptation of the native asm, which continues to use
vector loads as much as possible for the remaining elements (and also does
not read ahead past the end of the y[] array).

Overall slowdown of test_opus_encode on a Raspberry Pi 5 Model B Rev 1.0 is
0.12% vs. 0.13% for fixing the existing tail loop.

Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
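For orientation, below is a minimal scalar sketch of what xcorr_kernel_neon_fixed()
computes: four correlations of x against y at lags 0 through 3, which entitles the
kernel to read x[0..len-1] and y[0..len+2] but nothing beyond. The function name and
the plain int16_t/int32_t types here are illustrative stand-ins for the fixed-point
opus_val16/opus_val32 types; this sketch is not part of the patch.

    #include <stdint.h>

    /* Illustrative scalar equivalent, not part of the patch: accumulate four
       correlations of x against y at lags 0..3.  The farthest sample touched
       is y[len+2]; the NEON code read past that, which is the kind of overrun
       fixed below. */
    static void xcorr_kernel_scalar_sketch(const int16_t *x, const int16_t *y,
                                           int32_t sum[4], int len)
    {
       int j;
       for (j = 0; j < len; j++) {
          sum[0] += (int32_t)x[j] * y[j];
          sum[1] += (int32_t)x[j] * y[j + 1];
          sum[2] += (int32_t)x[j] * y[j + 2];
          sum[3] += (int32_t)x[j] * y[j + 3];
       }
    }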
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
@@ -38,6 +38,8 @@
#include "../pitch.h"
#if defined(FIXED_POINT)
+#include <string.h>
+
void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
int j;
@@ -47,7 +49,10 @@
int16x4_t y0 = vld1_s16(y);
y += 4;
- for (j = 0; j + 8 <= len; j += 8)
+ /* This loop loads one y value more than we actually need.
+ Therefore we have to stop as soon as there are 8 or fewer samples left
+ (instead of 7), to avoid reading past the end of the array. */
+ for (j = 0; j + 8 < len; j += 8)
{
/* Load x[0...7] */
int16x8_t xx = vld1q_s16(x);
@@ -80,20 +85,65 @@
x += 8;
y += 8;
}
-
- for (; j < len; j++)
- {
- int16x4_t x0 = vld1_dup_s16(x); /* load next x */
+ if (j + 4 < len) {
+ /* Load x[0...3] */
+ int16x4_t x0 = vld1_s16(x);
+ /* Load y[4...7] */
+ int16x4_t y4 = vld1_s16(y);
+ int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
+ int16x4_t y1 = vext_s16(y0, y4, 1);
+ int32x4_t a1 = vmlal_lane_s16(a0, y1, x0, 1);
+ int16x4_t y2 = vext_s16(y0, y4, 2);
+ int32x4_t a2 = vmlal_lane_s16(a1, y2, x0, 2);
+ int16x4_t y3 = vext_s16(y0, y4, 3);
+ int32x4_t a3 = vmlal_lane_s16(a2, y3, x0, 3);
+ y0 = y4;
+ a = a3;
+ x += 4;
+ y += 4;
+ j += 4;
+ }
+ if (j + 2 < len) {
+ /* Load x[0...1] */
+ int16x4x2_t xx = vld2_dup_s16(x);
+ int16x4_t x0 = xx.val[0];
+ int16x4_t x1 = xx.val[1];
+ /* Load y[4...5].
+ We would like to use vld1_dup_s32(), but casting the pointer would
+ break strict aliasing rules and potentially have alignment issues.
+ Fortunately the compiler seems capable of translating this memcpy()
+ and vdup_n_s32() into the equivalent vld1_dup_s32().*/
+ int32_t yy;
+ memcpy(&yy, y, sizeof(yy));
+ int16x4_t y4 = vreinterpret_s16_s32(vdup_n_s32(yy));
int32x4_t a0 = vmlal_s16(a, y0, x0);
-
- int16x4_t y4 = vld1_dup_s16(y); /* load next y */
- y0 = vext_s16(y0, y4, 1);
+ int16x4_t y1 = vext_s16(y0, y4, 1);
+ /* Replace bottom copy of {y[5], y[4]} in y4 with {y[3], y[2]} from y0,
+ using VSRI instead of VEXT, since it's a data-processing
+ instruction. */
+ y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
+ vreinterpret_s64_s16(y0), 32));
+ int32x4_t a1 = vmlal_s16(a0, y1, x1);
+ a = a1;
+ x += 2;
+ y += 2;
+ j += 2;
+ }
+ if (j + 1 < len) {
+ /* Load next x. */
+ int16x4_t x0 = vld1_dup_s16(x);
+ int32x4_t a0 = vmlal_s16(a, y0, x0);
+ /* Load last y. */
+ int16x4_t y4 = vld1_dup_s16(y);
+ y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
+ vreinterpret_s64_s16(y0), 16));
a = a0;
x++;
- y++;
}
-
- vst1q_s32(sum, a);
+ /* Load last x. */
+ int16x4_t x0 = vld1_dup_s16(x);
+ int32x4_t a0 = vmlal_s16(a, y0, x0);
+ vst1q_s32(sum, a0);
}
#else
--
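As a quick way to exercise the new non-looping tail for every remainder length, one
could compare the patched kernel against the scalar form above with y[] allocated to
exactly the len+3 samples the kernel is allowed to touch, so AddressSanitizer or
valgrind would flag any remaining over-read. This harness is only a sketch: it assumes
a fixed-point ARM build in which opus_val16/opus_val32 are plain 16-/32-bit integers,
and the loop bound and sample ranges are arbitrary.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Prototype matching the fixed-point build (assumption: opus_val16 is
       int16_t and opus_val32 is int32_t there). */
    void xcorr_kernel_neon_fixed(const int16_t *x, const int16_t *y,
                                 int32_t sum[4], int len);

    int main(void)
    {
       int len;
       for (len = 1; len <= 64; len++) {
          /* Allocate exactly what the kernel may read so ASan/valgrind
             catches any access past x[len-1] or y[len+2]. */
          int16_t *x = malloc(len * sizeof(*x));
          int16_t *y = malloc((len + 3) * sizeof(*y));
          int32_t ref[4] = {0, 0, 0, 0}, vec[4] = {0, 0, 0, 0};
          int j, k;
          for (j = 0; j < len; j++)     x[j] = (int16_t)(rand() % 256 - 128);
          for (j = 0; j < len + 3; j++) y[j] = (int16_t)(rand() % 256 - 128);
          for (j = 0; j < len; j++)
             for (k = 0; k < 4; k++)
                ref[k] += (int32_t)x[j] * y[j + k];
          xcorr_kernel_neon_fixed(x, y, vec, len);
          if (memcmp(ref, vec, sizeof(ref)) != 0)
             printf("mismatch at len=%d\n", len);
          free(x);
          free(y);
       }
       return 0;
    }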