shithub: libvpx

Download patch

ref: 1790d45252efb29903baae3d1776bb24cee76558
parent: f4e0eb17e85eb84c162ac0218cf32a9eccece353
author: Jingning Han <jingning@google.com>
date: Fri Feb 27 08:35:22 EST 2015

Use variance metric for integral projection vector match

This commit replaces the SAD with variance as metric for the
integral projection vector match. It improves the search accuracy
in the presence of slight light change. The average compression
performance at speed -6 for the rtc set is improved by 1.7%. No
speed change is observed for the test clips.

Change-Id: I71c1d27e42de2aa429fb3564e6549bba1c7d6d4d

--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1115,8 +1115,8 @@
 add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
 specialize qw/vp9_int_pro_col sse2/;
 
-add_proto qw/int vp9_vector_sad/, "int16_t const *ref, int16_t const *src, const int width";
-specialize qw/vp9_vector_sad sse2/;
+add_proto qw/int vp9_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
+specialize qw/vp9_vector_var sse2/;
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -37,6 +37,7 @@
     hbuf[idx] = 0;
     for (i = 0; i < height; ++i)
       hbuf[idx] += ref[i * ref_stride];
+    hbuf[idx] /= 32;
     ++ref;
   }
 }
@@ -46,16 +47,23 @@
   int16_t sum = 0;
   for (idx = 0; idx < width; ++idx)
     sum += ref[idx];
-  return sum;
+  return sum / 32;
 }
 
-int vp9_vector_sad_c(int16_t const *ref, int16_t const *src,
-                     const int width) {
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
+                     const int bwl) {
   int i;
-  int this_sad = 0;
-  for (i = 0; i < width; ++i)
-    this_sad += abs(ref[i] - src[i]);
-  return this_sad;
+  int width = 4 << bwl;
+  int sse = 0, mean = 0, var;
+
+  for (i = 0; i < width; ++i) {
+    int diff = ref[i] - src[i];
+    mean += diff;
+    sse += diff * diff;
+  }
+
+  var = sse - ((mean * mean) >> (bwl + 2));
+  return var;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -519,14 +519,14 @@
 #endif
 
 #if GLOBAL_MOTION
-static int vector_match(int16_t *ref, int16_t *src, int length) {
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
   int best_sad = INT_MAX;
   int this_sad;
   int d;
   int center, offset = 0;
-  int bw = length;  // redundant variable, to be changed in the experiments.
+  int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
   for (d = 0; d <= bw; d += 16) {
-    this_sad = vp9_vector_sad(&ref[d], src, length);
+    this_sad = vp9_vector_var(&ref[d], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       offset = d;
@@ -539,7 +539,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_sad(&ref[this_pos], src, length);
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -552,7 +552,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_sad(&ref[this_pos], src, length);
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -565,7 +565,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_sad(&ref[this_pos], src, length);
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -578,7 +578,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_sad(&ref[this_pos], src, length);
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -638,8 +638,8 @@
   }
 
   // Find the best match per 1-D search
-  tmp_mv->col = vector_match(hbuf, src_hbuf, bw);
-  tmp_mv->row = vector_match(vbuf, src_vbuf, bh);
+  tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]);
+  tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]);
 
   best_sad = INT_MAX;
   this_mv = *tmp_mv;
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -192,7 +192,6 @@
                                  cond_cost_list(cpi, cost_list),
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref], NULL, 0, 0);
-    x->pred_mv[ref] = tmp_mv->as_mv;
   }
 
   if (scaled_ref_frame) {
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -90,6 +90,9 @@
   s0 = _mm_adds_epu16(s0, t0);
   s1 = _mm_adds_epu16(s1, t1);
 
+  s0 = _mm_srai_epi16(s0, 5);
+  s1 = _mm_srai_epi16(s1, 5);
+
   _mm_store_si128((__m128i *)hbuf, s0);
   hbuf += 8;
   _mm_store_si128((__m128i *)hbuf, s1);
@@ -112,51 +115,49 @@
   s1 = _mm_srli_si128(s0, 8);
   s0 = _mm_adds_epu16(s0, s1);
 
-  return _mm_extract_epi16(s0, 0);
+  return (_mm_extract_epi16(s0, 0)) >> 5;
 }
 
-int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src,
-                        const int width) {
+int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
+                        const int bwl) {
   int idx;
-  __m128i zero = _mm_setzero_si128();
-  __m128i sum;
+  int width = 4 << bwl;
+  int16_t mean;
   __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
   __m128i v1 = _mm_load_si128((const __m128i *)src);
   __m128i diff = _mm_subs_epi16(v0, v1);
-  __m128i sign = _mm_srai_epi16(diff, 15);
+  __m128i sum = diff;
+  __m128i sse = _mm_madd_epi16(diff, diff);
 
-  diff = _mm_xor_si128(diff, sign);
-  sum = _mm_sub_epi16(diff, sign);
-
   ref += 8;
   src += 8;
 
-  v0 = _mm_unpacklo_epi16(sum, zero);
-  v1 = _mm_unpackhi_epi16(sum, zero);
-  sum = _mm_add_epi32(v0, v1);
-
   for (idx = 8; idx < width; idx += 8) {
     v0 = _mm_loadu_si128((const __m128i *)ref);
     v1 = _mm_load_si128((const __m128i *)src);
     diff = _mm_subs_epi16(v0, v1);
-    sign = _mm_srai_epi16(diff, 15);
-    diff = _mm_xor_si128(diff, sign);
-    diff = _mm_sub_epi16(diff, sign);
 
-    v0 = _mm_unpacklo_epi16(diff, zero);
-    v1 = _mm_unpackhi_epi16(diff, zero);
+    sum = _mm_add_epi16(sum, diff);
+    v0  = _mm_madd_epi16(diff, diff);
+    sse = _mm_add_epi32(sse, v0);
 
-    sum = _mm_add_epi32(sum, v0);
-    sum = _mm_add_epi32(sum, v1);
-
     ref += 8;
     src += 8;
   }
 
-  v0 = _mm_srli_si128(sum, 8);
-  sum = _mm_add_epi32(sum, v0);
-  v0 = _mm_srli_epi64(sum, 32);
-  sum = _mm_add_epi32(sum, v0);
+  v0  = _mm_srli_si128(sum, 8);
+  sum = _mm_add_epi16(sum, v0);
+  v0  = _mm_srli_epi64(sum, 32);
+  sum = _mm_add_epi16(sum, v0);
+  v0  = _mm_srli_epi32(sum, 16);
+  sum = _mm_add_epi16(sum, v0);
 
-  return _mm_cvtsi128_si32(sum);
+  v1  = _mm_srli_si128(sse, 8);
+  sse = _mm_add_epi32(sse, v1);
+  v1  = _mm_srli_epi64(sse, 32);
+  sse = _mm_add_epi32(sse, v1);
+
+  mean = _mm_extract_epi16(sum, 0);
+
+  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
 }