shithub: libvpx

Download patch

ref: efdfdf578792b0a9ed93e3b9dc747e1bfe7f5c0c
parent: d4a47a6cc0d869bea3071c15bc61da6836026d0b
author: levytamar82 <tamar.levy@intel.com>
date: Thu Jul 24 09:40:21 EDT 2014

32 Align Load bug
In the sub_pixel_avg_variance the parameter sec was also aligned load and
changed to unaligned.

Change-Id: I4d4966e0291059ea4d705baed1503dc58444fcb7

--- a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
+++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
@@ -333,7 +333,7 @@
     if (y_offset == 0) {
       for (i = 0; i < height ; i++) {
         LOAD_SRC_DST
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
         // expend each byte to 2 bytes
@@ -347,7 +347,7 @@
       for (i = 0; i < height ; i++) {
         LOAD_SRC_DST
         AVG_NEXT_SRC(src_reg, src_stride)
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
         // expend each byte to 2 bytes
@@ -369,7 +369,7 @@
         MERGE_NEXT_SRC(src_reg, src_stride)
         FILTER_SRC(filter)
         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
         MERGE_WITH_SRC(src_reg, zero_reg)
@@ -385,7 +385,7 @@
       for (i = 0; i < height ; i++) {
         LOAD_SRC_DST
         AVG_NEXT_SRC(src_reg, 1)
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
         // expand each byte to 2 bytes
@@ -409,7 +409,7 @@
         AVG_NEXT_SRC(src_reg, 1)
         // average between previous average to current average
         src_avg = _mm256_avg_epu8(src_avg, src_reg);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_avg = _mm256_avg_epu8(src_avg, sec_reg);
         sec+= sec_stride;
         // expand each byte to 2 bytes
@@ -437,7 +437,7 @@
         MERGE_WITH_SRC(src_avg, src_reg)
         FILTER_SRC(filter)
         src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_avg = _mm256_avg_epu8(src_avg, sec_reg);
         // expand each byte to 2 bytes
         MERGE_WITH_SRC(src_avg, zero_reg)
@@ -459,7 +459,7 @@
         MERGE_NEXT_SRC(src_reg, 1)
         FILTER_SRC(filter)
         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         MERGE_WITH_SRC(src_reg, zero_reg)
         sec+= sec_stride;
@@ -487,7 +487,7 @@
         src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
         // average between previous pack to the current
         src_pack = _mm256_avg_epu8(src_pack, src_reg);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_pack = _mm256_avg_epu8(src_pack, sec_reg);
         sec+= sec_stride;
         MERGE_WITH_SRC(src_pack, zero_reg)
@@ -524,7 +524,7 @@
         // filter the source
         FILTER_SRC(yfilter)
         src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_pack = _mm256_avg_epu8(src_pack, sec_reg);
         MERGE_WITH_SRC(src_pack, zero_reg)
         src_pack = src_reg;