shithub: libvpx

--- a/vp9/encoder/vp9_encodeframe.c

+++ b/vp9/encoder/vp9_encodeframe.c

@@ -519,13 +519,14 @@

 #endif

 #if GLOBAL_MOTION

-static int vector_match(int16_t *ref, int16_t *src) {

+static int vector_match(int16_t *ref, int16_t *src, int length) {

   int best_sad = INT_MAX;

   int this_sad;

   int d;

   int center, offset = 0;

-  for (d = 0; d <= 64; d += 16) {

-    this_sad = vp9_vector_sad(&ref[d], src, 64);

+  int bw = length;  // redundant variable, to be changed in the experiments.

+  for (d = 0; d <= bw; d += 16) {

+    this_sad = vp9_vector_sad(&ref[d], src, length);

     if (this_sad < best_sad) {

       best_sad = this_sad;

       offset = d;

@@ -536,9 +537,9 @@

   for (d = -8; d <= 8; d += 16) {

     int this_pos = offset + d;

     // check limit

-    if (this_pos < 0 || this_pos > 64)

+    if (this_pos < 0 || this_pos > bw)

       continue;

-    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);

+    this_sad = vp9_vector_sad(&ref[this_pos], src, length);

     if (this_sad < best_sad) {

       best_sad = this_sad;

       center = this_pos;

@@ -549,9 +550,9 @@

   for (d = -4; d <= 4; d += 8) {

     int this_pos = offset + d;

     // check limit

-    if (this_pos < 0 || this_pos > 64)

+    if (this_pos < 0 || this_pos > bw)

       continue;

-    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);

+    this_sad = vp9_vector_sad(&ref[this_pos], src, length);

     if (this_sad < best_sad) {

       best_sad = this_sad;

       center = this_pos;

@@ -562,9 +563,9 @@

   for (d = -2; d <= 2; d += 4) {

     int this_pos = offset + d;

     // check limit

-    if (this_pos < 0 || this_pos > 64)

+    if (this_pos < 0 || this_pos > bw)

       continue;

-    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);

+    this_sad = vp9_vector_sad(&ref[this_pos], src, length);

     if (this_sad < best_sad) {

       best_sad = this_sad;

       center = this_pos;

@@ -575,9 +576,9 @@

   for (d = -1; d <= 1; d += 2) {

     int this_pos = offset + d;

     // check limit

-    if (this_pos < 0 || this_pos > 64)

+    if (this_pos < 0 || this_pos > bw)

       continue;

-    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);

+    this_sad = vp9_vector_sad(&ref[this_pos], src, length);

     if (this_sad < best_sad) {

       best_sad = this_sad;

       center = this_pos;

@@ -584,7 +585,7 @@

-  return (center - 32);

+  return (center - (bw >> 1));

 static const MV search_pos[9] = {

@@ -592,7 +593,8 @@

   {1, -1}, {1, 0}, {1, 1},

};

-static void motion_estimation(VP9_COMP *cpi, MACROBLOCK *x) {

+static void motion_estimation(VP9_COMP *cpi, MACROBLOCK *x,

+                              BLOCK_SIZE bsize) {

   MACROBLOCKD *xd = &x->e_mbd;

   DECLARE_ALIGNED(16, int16_t, hbuf[128]);

   DECLARE_ALIGNED(16, int16_t, vbuf[128]);

@@ -599,9 +601,10 @@

   DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);

   DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);

   int idx;

-  const int stride = 64;

-  const int search_width = 128;

-  const int search_height = 128;

+  const int bw = 4 << b_width_log2_lookup[bsize];

+  const int bh = 4 << b_height_log2_lookup[bsize];

+  const int search_width = bw << 1;

+  const int search_height = bh << 1;

   const int src_stride = x->plane[0].src.stride;

   const int ref_stride = xd->plane[0].pre[0].stride;

   uint8_t const *ref_buf, *src_buf;

@@ -610,35 +613,34 @@

   MV this_mv;

   // Set up prediction 1-D reference set

-  ref_buf = xd->plane[0].pre[0].buf + (-32);

+  ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);

   for (idx = 0; idx < search_width; idx += 16) {

-    vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, 64);

+    vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);

     ref_buf += 16;

-  ref_buf = xd->plane[0].pre[0].buf + (-32) * ref_stride;

+  ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;

   for (idx = 0; idx < search_height; ++idx) {

-    vbuf[idx] = vp9_int_pro_col(ref_buf, 64);

+    vbuf[idx] = vp9_int_pro_col(ref_buf, bw);

     ref_buf += ref_stride;

   // Set up src 1-D reference set

-  for (idx = 0; idx < stride; idx += 16) {

+  for (idx = 0; idx < bw; idx += 16) {

     src_buf = x->plane[0].src.buf + idx;

-    vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, 64);

+    vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);

   src_buf = x->plane[0].src.buf;

-  for (idx = 0; idx < stride; ++idx) {

-    src_vbuf[idx] = vp9_int_pro_col(src_buf, 64);

+  for (idx = 0; idx < bh; ++idx) {

+    src_vbuf[idx] = vp9_int_pro_col(src_buf, bw);

     src_buf += src_stride;

   // Find the best match per 1-D search

+  tmp_mv->col = vector_match(hbuf, src_hbuf, bw);

+  tmp_mv->row = vector_match(vbuf, src_vbuf, bh);

-  tmp_mv->col = vector_match(hbuf, src_hbuf);

-  tmp_mv->row = vector_match(vbuf, src_vbuf);

   best_sad = INT_MAX;

   this_mv = *tmp_mv;

   for (idx = 0; idx < 9; ++idx) {

@@ -648,8 +650,8 @@

         (search_pos[idx].row + this_mv.row) * ref_stride +

         (search_pos[idx].col + this_mv.col);

-    this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src_buf, src_stride,

-                                            ref_buf, ref_stride);

+    this_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride,

+                                      ref_buf, ref_stride);

     if (this_sad < best_sad) {

       best_sad = this_sad;

       tmp_mv->row = search_pos[idx].row + this_mv.row;

@@ -717,7 +719,7 @@

     mbmi->interp_filter = BILINEAR;

 #if GLOBAL_MOTION

-    motion_estimation(cpi, x);

+    motion_estimation(cpi, x, BLOCK_64X64);

 #endif

     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);

--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c

+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c

@@ -100,23 +100,15 @@

   __m128i src_line = _mm_load_si128((const __m128i *)ref);

   __m128i s0 = _mm_sad_epu8(src_line, zero);

   __m128i s1;

-  (void) width;  // width = 64

+  int i;

-  ref += 16;

-  src_line = _mm_load_si128((const __m128i *)ref);

-  s1 = _mm_sad_epu8(src_line, zero);

-  s0 = _mm_adds_epu16(s0, s1);

+  for (i = 16; i < width; i += 16) {

+    ref += 16;

+    src_line = _mm_load_si128((const __m128i *)ref);

+    s1 = _mm_sad_epu8(src_line, zero);

+    s0 = _mm_adds_epu16(s0, s1);

+  }

-  ref += 16;

-  src_line = _mm_load_si128((const __m128i *)ref);

-  s1 = _mm_sad_epu8(src_line, zero);

-  s0 = _mm_adds_epu16(s0, s1);

-  ref += 16;

-  src_line = _mm_load_si128((const __m128i *)ref);

-  s1 = _mm_sad_epu8(src_line, zero);

-  s0 = _mm_adds_epu16(s0, s1);

   s1 = _mm_srli_si128(s0, 8);

   s0 = _mm_adds_epu16(s0, s1);

@@ -136,8 +128,6 @@

   diff = _mm_xor_si128(diff, sign);

   sum = _mm_sub_epi16(diff, sign);

-  (void) width;  // width = 64;

   ref += 8;

   src += 8;

@@ -145,7 +135,7 @@

   v1 = _mm_unpackhi_epi16(sum, zero);

   sum = _mm_add_epi32(v0, v1);

-  for (idx = 1; idx < 8; ++idx) {

+  for (idx = 8; idx < width; idx += 8) {

     v0 = _mm_loadu_si128((const __m128i *)ref);

     v1 = _mm_load_si128((const __m128i *)src);

     diff = _mm_subs_epi16(v0, v1);

--

⑨