ref: 69a5f5ecf7257e4ef7ca0c75d4d84c6d3a97fec0
parent: 72a5832a81f22b067f398df70ce0f6d9243db11d
author: levytamar82 <tamar.levy@intel.com>
date: Wed Jul 23 20:20:19 EDT 2014
Fix bug 807: in the sub_pixel_*variance* functions, dst is aligned to 16 bytes, not to 32 bytes, so it is now loaded with an unaligned load. Change-Id: I2e0b9745543697efc56fefa32857ea10117af135
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -707,24 +707,7 @@
#endif
#if HAVE_AVX2
-// TODO(jzern): these prototypes can be removed after the avx2 versions are
-// reenabled in vp9_rtcd_defs.pl.
-extern "C" {
-unsigned int vp9_sub_pixel_variance32x32_avx2(
- const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
- const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x64_avx2(
- const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
- const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_avg_variance32x32_avx2(
- const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
- const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
- const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x64_avx2(
- const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
- const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
- const uint8_t *second_pred);
-}
+
const vp9_variance_fn_t variance16x16_avx2 = vp9_variance16x16_avx2;
const vp9_variance_fn_t variance32x16_avx2 = vp9_variance32x16_avx2;
const vp9_variance_fn_t variance32x32_avx2 = vp9_variance32x32_avx2;
@@ -743,7 +726,7 @@
const vp9_subpixvariance_fn_t subpel_variance64x64_avx2 =
vp9_sub_pixel_variance64x64_avx2;
INSTANTIATE_TEST_CASE_P(
- DISABLED_AVX2, VP9SubpelVarianceTest,
+ AVX2, VP9SubpelVarianceTest,
::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2),
make_tuple(6, 6, subpel_variance64x64_avx2)));
@@ -752,7 +735,7 @@
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_avx2 =
vp9_sub_pixel_avg_variance64x64_avx2;
INSTANTIATE_TEST_CASE_P(
- DISABLED_AVX2, VP9SubpelAvgVarianceTest,
+ AVX2, VP9SubpelAvgVarianceTest,
::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2),
make_tuple(6, 6, subpel_avg_variance64x64_avx2)));
#endif // HAVE_AVX2
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -447,10 +447,10 @@
specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x64/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance64x64/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -477,10 +477,10 @@
specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x32 neon/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance32x32 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance16x16 neon/, "$sse2_x86inc", "$ssse3_x86inc";
--- a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
+++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
@@ -67,7 +67,7 @@
#define LOAD_SRC_DST \
/* load source and destination */ \
src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
- dst_reg = _mm256_load_si256((__m256i const *) (dst));
+ dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
#define AVG_NEXT_SRC(src_reg, size_stride) \
src_next_reg = _mm256_loadu_si256((__m256i const *) \
--
⑨