ref: c182725cbc9e1e4892784a24c32b1bed80047b0c
parent: 19882cdbf9518f35eaeccff9702dbb5bab708e06
author: chiyotsai <chiyotsai@google.com>
date: Fri Nov 2 13:08:05 EDT 2018
Remove unnecessary calculation in 4-tap interpolation filter

Reduces the number of rows calculated for the 2D 4-tap interpolation
filter from h+7 rows to h+3 rows. Also fixes a bug in the avx2 function
for 4-tap filters where the last row is computed incorrectly.

Performance:
            | Baseline | Result   | Pct Gain
bitdepth lo | 4.00 fps | 4.02 fps | 0.5%
bitdepth 10 | 1.90 fps | 1.91 fps | 0.5%

Performance is evaluated at speed 1 on jets.y4m at bitrate 500 over
100 frames. No BDBR loss is observed.

Change-Id: I90b0d4d697319b7bba599f03c5dc01abd85d13b1
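For context, a minimal sketch (illustrative only; these helper names do not
exist in libvpx) of the row arithmetic behind the change. A separable 2D
convolution first filters horizontally into an intermediate buffer, then
filters that buffer vertically. A k-tap vertical pass reads k/2 - 1 rows
above and k/2 rows below each output row, so the horizontal pass must
produce h + k - 1 rows: h + 7 for an 8-tap kernel, but only h + 3 for a
4-tap kernel, so producing h + 7 rows in the 4-tap case is wasted work.

  static int intermediate_rows(int h, int num_taps) {
    /* Rows the horizontal pass must produce for a num_taps vertical pass. */
    return h + num_taps - 1;
  }

  static int rows_above(int num_taps) {
    /* How many rows above the source block the intermediate buffer starts. */
    return num_taps / 2 - 1;
  }
  /* 8-tap: intermediate_rows(h, 8) == h + 7, rows_above(8) == 3.
     4-tap: intermediate_rows(h, 4) == h + 3, rows_above(4) == 1. */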
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -16,11 +16,17 @@
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
+// TODO(chiyotsai@google.com): Refactor the code here. Currently this is pretty
+// hacky and awful to read. Note that there is a filter_x[3] == 128 check in
+// HIGHBD_FUN_CONV_2D to avoid a segfault, because the C function assumes the
+// filter is always 8 tap.
typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
uint8_t *output_ptr, ptrdiff_t out_pitch,
uint32_t output_height, const int16_t *filter);
-#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \
+// TODO(chiyotsai@google.com): Remove the is_avg argument to the macros once
+// we have a 4-tap vertical avg filter.
+#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \
void vpx_convolve8_##name##_##opt( \
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
@@ -33,6 +39,7 @@
assert(filter_row[3] != 128); \
assert(step_q4 == 16); \
if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
+ const int num_taps = 8; \
while (w >= 16) { \
vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter_row); \
@@ -47,7 +54,9 @@
vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter_row); \
} \
+ (void)num_taps; \
} else if (filter_row[2] | filter_row[5]) { \
+ const int num_taps = is_avg ? 8 : 4; \
while (w >= 16) { \
vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter_row); \
@@ -62,9 +71,11 @@
vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter_row); \
} \
+ (void)num_taps; \
} else { \
+ const int num_taps = 2; \
while (w >= 16) { \
- vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \
+ vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter_row); \
src += 16; \
dst += 16; \
@@ -71,16 +82,17 @@
w -= 16; \
} \
if (w == 8) { \
- vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
+ vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter_row); \
} else if (w == 4) { \
- vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
+ vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter_row); \
} \
+ (void)num_taps; \
} \
}
-#define FUN_CONV_2D(avg, opt) \
+#define FUN_CONV_2D(avg, opt, is_avg) \
void vpx_convolve8_##avg##opt( \
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
@@ -94,7 +106,7 @@
assert(h <= 64); \
assert(x_step_q4 == 16); \
assert(y_step_q4 == 16); \
- if (filter_x[0] | filter_x[1] | filter_x[2]) { \
+ if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) { \
DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
@@ -102,6 +114,15 @@
vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
filter, x0_q4, x_step_q4, y0_q4, \
y_step_q4, w, h); \
+ } else if (filter_x[2] | filter_x[5]) { \
+ const int num_taps = is_avg ? 8 : 4; \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
+ vpx_convolve8_horiz_##opt( \
+ src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1); \
+ vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64, \
+ dst, dst_stride, filter, x0_q4, \
+ x_step_q4, y0_q4, y_step_q4, w, h); \
} else { \
DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \
@@ -121,89 +142,96 @@
unsigned int output_height,
const int16_t *filter, int bd);
-#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \
- void vpx_highbd_convolve8_##name##_##opt( \
- const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
- ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
- int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
- const int16_t *filter_row = filter[offset]; \
- if (step_q4 == 16 && filter_row[3] != 128) { \
- if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
- while (w >= 16) { \
- vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else if (filter_row[2] | filter_row[5]) { \
- while (w >= 16) { \
- vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else { \
- while (w >= 16) { \
- vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
- src, src_stride, dst, dst_stride, h, filter_row, bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \
- src, src_stride, dst, dst_stride, h, filter_row, bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \
- src, src_stride, dst, dst_stride, h, filter_row, bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } \
- } \
- if (w) { \
- vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
- filter, x0_q4, x_step_q4, y0_q4, \
- y_step_q4, w, h, bd); \
- } \
+#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, \
+ is_avg) \
+ void vpx_highbd_convolve8_##name##_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
+ const int16_t *filter_row = filter_kernel[offset]; \
+ if (step_q4 == 16 && filter_row[3] != 128) { \
+ if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
+ const int num_taps = 8; \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ (void)num_taps; \
+ } else if (filter_row[2] | filter_row[5]) { \
+ const int num_taps = is_avg ? 8 : 4; \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ (void)num_taps; \
+ } else { \
+ const int num_taps = 2; \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ (void)num_taps; \
+ } \
+ } \
+ if (w) { \
+ vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+ filter_kernel, x0_q4, x_step_q4, y0_q4, \
+ y_step_q4, w, h, bd); \
+ } \
}
-#define HIGH_FUN_CONV_2D(avg, opt) \
+#define HIGH_FUN_CONV_2D(avg, opt, is_avg) \
void vpx_highbd_convolve8_##avg##opt( \
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
@@ -212,7 +240,8 @@
assert(w <= 64); \
assert(h <= 64); \
if (x_step_q4 == 16 && y_step_q4 == 16) { \
- if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \
+ if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) || \
+ filter_x[3] == 128) { \
DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
fdata2, 64, filter, x0_q4, x_step_q4, \
@@ -220,6 +249,16 @@
vpx_highbd_convolve8_##avg##vert_##opt( \
fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \
y0_q4, y_step_q4, w, h, bd); \
+ } else if (filter_x[2] | filter_x[5]) { \
+ const int num_taps = is_avg ? 8 : 4; \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
+ vpx_highbd_convolve8_horiz_##opt( \
+ src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1, \
+ bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt( \
+ fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter, \
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \
} else { \
DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \
@@ -235,6 +274,6 @@
bd); \
} \
}
-#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // VPX_VPX_DSP_X86_CONVOLVE_H_
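As a reading aid for the macros above, a hedged sketch of the tap-count
dispatch they inline (hypothetical helper name; the macros branch directly
rather than call a function). An 8-tap kernel is detected by nonzero outer
taps, a 4-tap kernel by nonzero taps 2 and 5, and anything else falls into
the 2-tap bilinear case. The avg variants pass is_avg = 1 and keep
num_taps at 8 because no 4-tap vertical avg kernels exist yet (see the
TODO above).

  #include <stdint.h>

  /* Hypothetical helper mirroring the branch conditions in FUN_CONV_2D. */
  static int kernel_num_taps(const int16_t *filter_row, int is_avg) {
    if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {
      return 8;              /* genuine 8-tap kernel */
    } else if (filter_row[2] | filter_row[5]) {
      return is_avg ? 8 : 4; /* 4-tap, but avg paths fall back to 8-tap */
    }
    return 2;                /* bilinear */
  }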
--- a/vpx_dsp/x86/highbd_convolve_avx2.c
+++ b/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -1089,10 +1089,7 @@
// Repeat for the last row if needed
if (h > 0) {
- src_reg = _mm256_loadu_si256((const __m256i *)src_ptr);
- // Reorder into 2 1 1 2
- src_reg = _mm256_permute4x64_epi64(src_reg, 0x94);
-
+ src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
@@ -1099,12 +1096,12 @@
res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
&kernel_reg_23, &kernel_reg_45);
- res_reg = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
+ res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
res_reg = _mm256_packus_epi32(res_reg, res_reg);
- res_reg = _mm256_permute4x64_epi64(res_reg, 0x8);
+ res_reg = _mm256_min_epi16(res_reg, reg_max);
- _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
+ mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
}
}
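The last-row fix above swaps a 256-bit load plus cross-lane permute for two
128-bit loads packed into one register. A sketch of that load pattern,
assuming it matches the shape of libvpx's mm256_loadu2_si128 helper (the
helper itself is defined elsewhere in the tree):

  #include <immintrin.h>

  /* Assumed shape of the two-half load: one unaligned 128-bit load per
   * half, combined with an insert instead of _mm256_permute4x64_epi64. */
  static __m256i load2_si128_sketch(const void *lo, const void *hi) {
    const __m128i v_lo = _mm_loadu_si128((const __m128i *)lo);
    const __m128i v_hi = _mm_loadu_si128((const __m128i *)hi);
    return _mm256_inserti128_si256(_mm256_castsi128_si256(v_lo), v_hi, 1);
  }

The other half of the fix is visible in the hunk itself: rounding
previously consumed the stale res_first value; it now rounds res_reg, the
result actually computed for the last row.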
@@ -1279,10 +1276,6 @@
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the souce, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
@@ -1368,10 +1361,6 @@
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the souce, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
@@ -1476,9 +1465,10 @@
#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
vpx_highbd_filter_block1d4_h8_avg_avx2
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
-HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
-HIGH_FUN_CONV_2D(, avx2);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), , avx2, 0);
+HIGH_FUN_CONV_2D(, avx2, 0);
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
@@ -1497,9 +1487,9 @@
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
vpx_highbd_filter_block1d4_v2_avg_sse2
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
-HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
- avx2);
-HIGH_FUN_CONV_2D(avg_, avx2);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
+HIGH_FUN_CONV_2D(avg_, avx2, 1);
#undef HIGHBD_FUNC
--- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
+++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -133,10 +133,6 @@
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the souce, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -345,10 +341,6 @@
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the souce, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -531,10 +523,6 @@
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the souce, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -713,10 +701,6 @@
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the source, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
@@ -896,10 +880,6 @@
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the source, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
@@ -1060,10 +1040,12 @@
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2);
-FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2);
-FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, ,
+ sse2, 0);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1);
// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
@@ -1075,8 +1057,8 @@
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_2D(, sse2);
-FUN_CONV_2D(avg_, sse2);
+FUN_CONV_2D(, sse2, 0);
+FUN_CONV_2D(avg_, sse2, 1);
#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
@@ -1157,11 +1139,12 @@
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2);
-HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2);
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2);
-HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
- sse2);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), , sse2, 0);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1);
// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
@@ -1173,6 +1156,6 @@
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4,
// int y_step_q4, int w, int h, int bd);
-HIGH_FUN_CONV_2D(, sse2);
-HIGH_FUN_CONV_2D(avg_, sse2);
+HIGH_FUN_CONV_2D(, sse2, 0);
+HIGH_FUN_CONV_2D(avg_, sse2, 1);
#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -464,10 +464,6 @@
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the souce, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -665,10 +661,6 @@
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the souce, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
@@ -839,10 +831,6 @@
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the souce, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
@@ -981,10 +969,12 @@
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4,
// int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
-FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
-FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+ avx2, 0);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
@@ -996,6 +986,6 @@
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_2D(, avx2);
-FUN_CONV_2D(avg_, avx2);
+FUN_CONV_2D(, avx2, 0);
+FUN_CONV_2D(avg_, avx2, 1);
#endif // HAVE_AVX2 && HAVE_SSSE3
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -310,10 +310,6 @@
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the souce, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -483,10 +479,6 @@
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the souce, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -627,10 +619,6 @@
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the souce, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -743,10 +731,12 @@
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4,
// int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3);
-FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, ssse3);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+ ssse3, 0);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1);
static void filter_horiz_w8_ssse3(const uint8_t *const src,
const ptrdiff_t src_stride,
@@ -1093,5 +1083,5 @@
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_, ssse3);
+FUN_CONV_2D(, ssse3, 0);
+FUN_CONV_2D(avg_, ssse3, 1);
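
A closing note on the macro mechanics, with a minimal compilable sketch
(demo names only, not libvpx code): the src_start argument is substituted
textually inside each tap branch, so an expression such as
src - (num_taps / 2 - 1) * src_stride resolves against the num_taps local
declared in that branch. The (void)num_taps casts keep -Wunused-variable
quiet in the horizontal instantiations, where src_start is simply src and
never references num_taps.

  #include <stdio.h>

  static void run_filter(const unsigned char *start) {
    printf("filter starts at %p\n", (const void *)start);
  }

  /* Demo of the pattern: the macro argument expands inside the block where
   * num_taps is declared, so the argument may reference num_taps. */
  #define DEMO_CONV_1D(src_start) \
    do {                          \
      const int num_taps = 4;     \
      run_filter(src_start);      \
      (void)num_taps;             \
    } while (0)

  int main(void) {
    unsigned char buf[64] = { 0 };
    const unsigned char *src = buf + 32;
    const int src_stride = 8;
    DEMO_CONV_1D(src - (num_taps / 2 - 1) * src_stride); /* vertical: 1 row up */
    DEMO_CONV_1D(src);                                   /* horizontal */
    return 0;
  }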