shithub: libvpx

ref: 54eda13f8df587fe0a5a202f232f66863aff445a
parent: 1ca4d51b2e1a370b807919e567f6d7796885c40f
author: Jingning Han <jingning@google.com>
date: Mon Mar 9 14:55:38 EDT 2015

Apply fast motion search to golden reference frame

This commit enables the rtc coding mode to run integral projection
based motion search on the golden reference frame. It improves
speed -6 compression performance by 1.1% on average: 3.46% for
jimred_vga, 6.46% for tacomascmvvga, and 0.5% for the vidyo clips.
Speed -6 encoding is about 6% slower.

Change-Id: I0fe402ad2edf0149d0349ad304ab9b2abdf0c804
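
For context, integral projection based motion search avoids full 2-D
block matching: the source block and the reference search area are
each collapsed into 1-D row and column sums ("projections"), and
candidate offsets are scored by the SAD between those vectors. The
sketch below only illustrates the idea; it is not the libvpx code
behind vp9_int_pro_motion_estimation, and the helper names are
hypothetical.

#include <stdint.h>
#include <stdlib.h>

/* Collapse a width x height block into per-column sums. A row
 * projection is built the same way across rows. (Illustrative
 * helper, not a libvpx function.) */
static void col_projection(const uint8_t *buf, int stride,
                           int width, int height, int32_t *proj) {
  int r, c;
  for (c = 0; c < width; ++c)
    proj[c] = 0;
  for (r = 0; r < height; ++r)
    for (c = 0; c < width; ++c)
      proj[c] += buf[r * stride + c];
}

/* 1-D SAD between two projection vectors: the matching cost used to
 * pick the best offset along one axis. */
static int32_t proj_sad(const int32_t *a, const int32_t *b, int n) {
  int32_t sad = 0;
  int i;
  for (i = 0; i < n; ++i)
    sad += abs(a[i] - b[i]);
  return sad;
}

Scoring one candidate offset against a projection costs O(width)
rather than O(width x height), which is what makes the search cheap
enough to try on an additional reference frame.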

--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -784,15 +784,43 @@
         continue;
 
       if (this_mode == NEWMV) {
-        if (ref_frame > LAST_FRAME)
-          continue;
         if (cpi->sf.partition_search_type != VAR_BASED_PARTITION &&
             best_rdc.rdcost < (int64_t)(1 << num_pels_log2_lookup[bsize]))
           continue;
-        if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                    &frame_mv[NEWMV][ref_frame],
-                                    &rate_mv, best_rdc.rdcost))
+
+        if (ref_frame > LAST_FRAME) {
+          int tmp_sad;
+          int dis, cost_list[5];
+
+          if (bsize < BLOCK_16X16)
+            continue;
+
+          tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
+          if (tmp_sad > x->pred_mv_sad[LAST_FRAME])
+            continue;
+
+          frame_mv[NEWMV][ref_frame].as_int = mbmi->mv[0].as_int;
+          rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
+                                    &mbmi->ref_mvs[ref_frame][0].as_mv,
+                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+          frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
+          frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
+
+          cpi->find_fractional_mv_step(x, &frame_mv[NEWMV][ref_frame].as_mv,
+                                       &mbmi->ref_mvs[ref_frame][0].as_mv,
+                                       cpi->common.allow_high_precision_mv,
+                                       x->errorperbit,
+                                       &cpi->fn_ptr[bsize],
+                                       cpi->sf.mv.subpel_force_stop,
+                                       cpi->sf.mv.subpel_iters_per_step,
+                                       cond_cost_list(cpi, cost_list),
+                                       x->nmvjointcost, x->mvcost, &dis,
+                                       &x->pred_sse[ref_frame], NULL, 0, 0);
+        } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                           &frame_mv[NEWMV][ref_frame],
+                                           &rate_mv, best_rdc.rdcost)) {
           continue;
+        }
       }
 
       if (this_mode != NEARESTMV &&
@@ -817,7 +845,7 @@
       }
 
       if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
-          pred_filter_search &&
+          pred_filter_search && (ref_frame == LAST_FRAME) &&
           ((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
            (mbmi->mv[0].as_mv.col & 0x07) != 0)) {
         int pf_rate[3];
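
Two details of the new golden-frame path above are worth noting.
First, the coarse result gates the mode: if the integral projection
SAD for the golden frame cannot beat the SAD already recorded against
the last frame (x->pred_mv_sad[LAST_FRAME]), NEWMV is skipped for
this reference. Second, VP9 stores motion vectors in 1/8-pel units,
so the >>= 3 converts the coarse motion vector to full-pel before
find_fractional_mv_step re-derives the sub-pel fraction. (The second
hunk likewise limits the interpolation filter search to LAST_FRAME.)
A minimal, stand-alone illustration of the unit convention; the MV
struct mirrors vp9's and the values are made up:

#include <stdio.h>

typedef struct { short row, col; } MV;  /* 1/8-pel units, as in vp9 */

int main(void) {
  MV mv = { -21, 10 };  /* -2.625 and +1.25 pixels */
  /* An arithmetic right shift by 3 drops the sub-pel bits, rounding
   * toward negative infinity: (-21 >> 3) == -3, (10 >> 3) == 1. */
  MV full_pel = { (short)(mv.row >> 3), (short)(mv.col >> 3) };
  printf("full-pel mv: (%d, %d)\n", full_pel.row, full_pel.col);
  return 0;
}
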
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -61,7 +61,7 @@
                           const int ref_stride, const int height) {
   int idx;
   __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_load_si128((const __m128i *)ref);
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
   __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
   __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
   __m128i t0, t1;
@@ -69,7 +69,7 @@
   ref += ref_stride;
 
   for (idx = 1; idx < height_1; idx += 2) {
-    src_line = _mm_load_si128((const __m128i *)ref);
+    src_line = _mm_loadu_si128((const __m128i *)ref);
     t0 = _mm_unpacklo_epi8(src_line, zero);
     t1 = _mm_unpackhi_epi8(src_line, zero);
     s0 = _mm_adds_epu16(s0, t0);
@@ -76,7 +76,7 @@
     s1 = _mm_adds_epu16(s1, t1);
     ref += ref_stride;
 
-    src_line = _mm_load_si128((const __m128i *)ref);
+    src_line = _mm_loadu_si128((const __m128i *)ref);
     t0 = _mm_unpacklo_epi8(src_line, zero);
     t1 = _mm_unpackhi_epi8(src_line, zero);
     s0 = _mm_adds_epu16(s0, t0);
@@ -84,7 +84,7 @@
     ref += ref_stride;
   }
 
-  src_line = _mm_load_si128((const __m128i *)ref);
+  src_line = _mm_loadu_si128((const __m128i *)ref);
   t0 = _mm_unpacklo_epi8(src_line, zero);
   t1 = _mm_unpackhi_epi8(src_line, zero);
   s0 = _mm_adds_epu16(s0, t0);
@@ -101,9 +101,9 @@
     s1 = _mm_srai_epi16(s1, 3);
   }
 
-  _mm_store_si128((__m128i *)hbuf, s0);
+  _mm_storeu_si128((__m128i *)hbuf, s0);
   hbuf += 8;
-  _mm_store_si128((__m128i *)hbuf, s1);
+  _mm_storeu_si128((__m128i *)hbuf, s1);
 }
 
 int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
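
The vp9_avg_intrin_sse2.c hunks swap the aligned SSE2 loads and
stores for their unaligned counterparts. The ref pointer is now
offset by arbitrary candidate motion vectors (and hbuf need not be
16-byte aligned either), so _mm_load_si128, which faults on an
unaligned address, is replaced by _mm_loadu_si128, which is safe at
any address. A small stand-alone demonstration of the hazard,
assuming GCC/Clang attribute syntax for the aligned buffer:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Aligned backing store, read at an odd offset, as happens when a
   * motion vector moves the read position off a 16-byte boundary. */
  __attribute__((aligned(16))) uint8_t buf[32];
  uint8_t out[16];
  int i;
  for (i = 0; i < 32; ++i)
    buf[i] = (uint8_t)i;

  /* Safe at any address; _mm_load_si128 on buf + 3 could raise a
   * general-protection fault instead. */
  __m128i v = _mm_loadu_si128((const __m128i *)(buf + 3));
  _mm_storeu_si128((__m128i *)out, v);

  printf("%u .. %u\n", out[0], out[15]);  /* prints 3 .. 18 */
  return 0;
}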