ref: bcd17e32c9498cb7a2375999226a60c500cc29da
parent: c875803def7344aa69334f4dada2a3812a10a964
author: Yunqing Wang <yunqingwang@google.com>
date: Tue Oct 16 05:24:18 EDT 2018
Fix the filter tap calculation in mips optimizations. The interp filter tap calculation could not accurately tell 2-tap filters from 4-tap filters. This patch fixes the bug and resolves the Jenkins test failures in the mips sub-pel filter optimizations. BUG=webm:1568 Change-Id: I51eb8adb7ed194ef2ea7dd4aa57aa9870ee38cfc
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -789,13 +789,7 @@
}
}
-#if HAVE_MSA
-// TODO(any) MSA optimizations doesn't work with 4-tap interp filter. Need to be
-// fixed.
-const int kNumFilterBanks = 4;
-#else
const int kNumFilterBanks = 5;
-#endif
const int kNumFilters = 16;
TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
--- a/vpx_dsp/mips/convolve8_avg_dspr2.c
+++ b/vpx_dsp/mips/convolve8_avg_dspr2.c
@@ -15,6 +15,7 @@
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
#if HAVE_DSPR2
@@ -341,7 +342,7 @@
assert(y_step_q4 == 16);
assert(((const int32_t *)filter_y)[1] != 0x800000);
- if (((const int32_t *)filter_y)[0] == 0) {
+ if (vpx_get_filter_taps(filter_y) == 2) {
vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
} else {
--- a/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
+++ b/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
@@ -15,6 +15,7 @@
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
#if HAVE_DSPR2
@@ -945,7 +946,7 @@
assert(x_step_q4 == 16);
assert(((const int32_t *)filter_x)[1] != 0x800000);
- if (((const int32_t *)filter_x)[0] == 0) {
+ if (vpx_get_filter_taps(filter_x) == 2) {
vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
} else {
--- a/vpx_dsp/mips/convolve8_dspr2.c
+++ b/vpx_dsp/mips/convolve8_dspr2.c
@@ -1322,7 +1322,7 @@
if (filter_x[3] == 0x80) {
copy_horiz_transposed(src - src_stride * 3, src_stride, temp,
intermediate_height, w, intermediate_height);
- } else if (((const int32_t *)filter_x)[0] == 0) {
+ } else if (vpx_get_filter_taps(filter_x) == 2) {
vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp,
intermediate_height, filter_x, w, intermediate_height);
} else {
@@ -1365,7 +1365,7 @@
/* copy the src to dst */
if (filter_y[3] == 0x80) {
copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w);
- } else if (((const int32_t *)filter_y)[0] == 0) {
+ } else if (vpx_get_filter_taps(filter_y) == 2) {
vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride,
filter_y, h, w);
} else {
--- a/vpx_dsp/mips/convolve8_horiz_dspr2.c
+++ b/vpx_dsp/mips/convolve8_horiz_dspr2.c
@@ -825,7 +825,7 @@
assert(x_step_q4 == 16);
assert(((const int32_t *)filter_x)[1] != 0x800000);
- if (((const int32_t *)filter_x)[0] == 0) {
+ if (vpx_get_filter_taps(filter_x) == 2) {
vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,
x_step_q4, y0_q4, y_step_q4, w, h);
} else {
--- a/vpx_dsp/mips/convolve8_vert_dspr2.c
+++ b/vpx_dsp/mips/convolve8_vert_dspr2.c
@@ -325,7 +325,7 @@
assert(y_step_q4 == 16);
assert(((const int32_t *)filter_y)[1] != 0x800000);
- if (((const int32_t *)filter_y)[0] == 0) {
+ if (vpx_get_filter_taps(filter_y) == 2) {
vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,
x_step_q4, y0_q4, y_step_q4, w, h);
} else {
--- a/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
@@ -658,7 +658,7 @@
filt_hor[cnt] = filter_x[cnt];
}
- if (((const int32_t *)filter_x)[0] == 0) {
+ if (vpx_get_filter_taps(filter_x) == 2) {
switch (w) {
case 4:
common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
--- a/vpx_dsp/mips/vpx_convolve8_avg_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_msa.c
@@ -538,8 +538,8 @@
filt_ver[cnt] = filter_y[cnt];
}
- if (((const int32_t *)filter_x)[0] == 0 &&
- ((const int32_t *)filter_y)[0] == 0) {
+ if (vpx_get_filter_taps(filter_x) == 2 &&
+ vpx_get_filter_taps(filter_y) == 2) {
switch (w) {
case 4:
common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
@@ -571,8 +571,8 @@
x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
- } else if (((const int32_t *)filter_x)[0] == 0 ||
- ((const int32_t *)filter_y)[0] == 0) {
+ } else if (vpx_get_filter_taps(filter_x) == 2 ||
+ vpx_get_filter_taps(filter_y) == 2) {
vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
x_step_q4, y0_q4, y_step_q4, w, h);
} else {
--- a/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
@@ -625,7 +625,7 @@
filt_ver[cnt] = filter_y[cnt];
}
- if (((const int32_t *)filter_y)[0] == 0) {
+ if (vpx_get_filter_taps(filter_y) == 2) {
switch (w) {
case 4:
common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
--- a/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
@@ -634,7 +634,7 @@
filt_hor[cnt] = filter_x[cnt];
}
- if (((const int32_t *)filter_x)[0] == 0) {
+ if (vpx_get_filter_taps(filter_x) == 2) {
switch (w) {
case 4:
common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
--- a/vpx_dsp/mips/vpx_convolve8_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_msa.c
@@ -558,8 +558,8 @@
filt_ver[cnt] = filter_y[cnt];
}
- if (((const int32_t *)filter_x)[0] == 0 &&
- ((const int32_t *)filter_y)[0] == 0) {
+ if (vpx_get_filter_taps(filter_x) == 2 &&
+ vpx_get_filter_taps(filter_y) == 2) {
switch (w) {
case 4:
common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
@@ -591,8 +591,8 @@
x_step_q4, y0_q4, y_step_q4, w, h);
break;
}
- } else if (((const int32_t *)filter_x)[0] == 0 ||
- ((const int32_t *)filter_y)[0] == 0) {
+ } else if (vpx_get_filter_taps(filter_x) == 2 ||
+ vpx_get_filter_taps(filter_y) == 2) {
vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
y0_q4, y_step_q4, w, h);
} else {
--- a/vpx_dsp/mips/vpx_convolve8_vert_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_vert_msa.c
@@ -641,7 +641,7 @@
filt_ver[cnt] = filter_y[cnt];
}
- if (((const int32_t *)filter_y)[0] == 0) {
+ if (vpx_get_filter_taps(filter_y) == 2) {
switch (w) {
case 4:
common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
--- a/vpx_dsp/vpx_filter.h
+++ b/vpx_dsp/vpx_filter.h
@@ -11,6 +11,7 @@
#ifndef VPX_VPX_DSP_VPX_FILTER_H_
#define VPX_VPX_DSP_VPX_FILTER_H_
+#include <assert.h>
#include "vpx/vpx_integer.h"
#ifdef __cplusplus
@@ -25,6 +26,14 @@
#define SUBPEL_TAPS 8
typedef int16_t InterpKernel[SUBPEL_TAPS];
+// Returns 2 if taps 0..2 of |filter| are all zero, and 8 for any other kernel.
+static INLINE int vpx_get_filter_taps(const int16_t *const filter) {
+  assert(filter[3] != 128);  // 128 is presumably the copy kernel; verify.
+  if (!filter[0] && !filter[1] && !filter[2])  // Leading three taps are zero.
+    return 2;
+  else
+    return 8;  // Only 2 or 8 is ever reported; there is no distinct 4-tap value.
+}
#ifdef __cplusplus
} // extern "C"