shithub: libvpx

ref: 54eda13f8df587fe0a5a202f232f66863aff445a
parent: 1ca4d51b2e1a370b807919e567f6d7796885c40f
author: Jingning Han <jingning@google.com>
date: Mon Mar 9 14:55:38 EDT 2015

Apply fast motion search to golden reference frame

This commit enables the rtc coding mode to run integral projection
based motion search on the golden reference frame. It improves
speed -6 compression performance by 1.1% on average: 3.46% for
jimred_vga, 6.46% for tacomascmvvga, and 0.5% for the vidyo clips.
Speed -6 encoding is about 6% slower.

Change-Id: I0fe402ad2edf0149d0349ad304ab9b2abdf0c804
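
For context, integral projection based motion search avoids full 2-D
block matching: the source block and the reference search area are
each collapsed into 1-D row and column sums ("projections"), and
candidate offsets are scored by the SAD between those vectors. The
sketch below only illustrates the idea; it is not the libvpx code
behind vp9_int_pro_motion_estimation, and the helper names are
hypothetical.

#include <stdint.h>
#include <stdlib.h>

/* Collapse a width x height block into per-column sums. A row
 * projection is built the same way across rows. (Illustrative
 * helper, not a libvpx function.) */
static void col_projection(const uint8_t *buf, int stride,
                           int width, int height, int32_t *proj) {
  int r, c;
  for (c = 0; c < width; ++c)
    proj[c] = 0;
  for (r = 0; r < height; ++r)
    for (c = 0; c < width; ++c)
      proj[c] += buf[r * stride + c];
}

/* 1-D SAD between two projection vectors: the matching cost used to
 * pick the best offset along one axis. */
static int32_t proj_sad(const int32_t *a, const int32_t *b, int n) {
  int32_t sad = 0;
  int i;
  for (i = 0; i < n; ++i)
    sad += abs(a[i] - b[i]);
  return sad;
}

Scoring one candidate offset against a projection costs O(width)
rather than O(width x height), which is what makes the search cheap
enough to try on an additional reference frame.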

--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -784,15 +784,43 @@
         continue;
 
       if (this_mode == NEWMV) {
-        if (ref_frame > LAST_FRAME)
-          continue;
         if (cpi->sf.partition_search_type != VAR_BASED_PARTITION &&
             best_rdc.rdcost < (int64_t)(1 << num_pels_log2_lookup[bsize]))
           continue;
-        if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                    &frame_mv[NEWMV][ref_frame],
-                                    &rate_mv, best_rdc.rdcost))
+
+        if (ref_frame > LAST_FRAME) {
+          int tmp_sad;
+          int dis, cost_list[5];
+
+          if (bsize < BLOCK_16X16)
+            continue;
+
+          tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
+          if (tmp_sad > x->pred_mv_sad[LAST_FRAME])
+            continue;
+
+          frame_mv[NEWMV][ref_frame].as_int = mbmi->mv[0].as_int;
+          rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
+                                    &mbmi->ref_mvs[ref_frame][0].as_mv,
+                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+          frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
+          frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
+
+          cpi->find_fractional_mv_step(x, &frame_mv[NEWMV][ref_frame].as_mv,
+                                       &mbmi->ref_mvs[ref_frame][0].as_mv,
+                                       cpi->common.allow_high_precision_mv,
+                                       x->errorperbit,
+                                       &cpi->fn_ptr[bsize],
+                                       cpi->sf.mv.subpel_force_stop,
+                                       cpi->sf.mv.subpel_iters_per_step,
+                                       cond_cost_list(cpi, cost_list),
+                                       x->nmvjointcost, x->mvcost, &dis,
+                                       &x->pred_sse[ref_frame], NULL, 0, 0);
+        } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                           &frame_mv[NEWMV][ref_frame],
+                                           &rate_mv, best_rdc.rdcost)) {
           continue;
+        }
       }
 
       if (this_mode != NEARESTMV &&
@@ -817,7 +845,7 @@
       }
 
       if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
-          pred_filter_search &&
+          pred_filter_search && (ref_frame == LAST_FRAME) &&
           ((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
            (mbmi->mv[0].as_mv.col & 0x07) != 0)) {
         int pf_rate[3];
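
Two details of the new golden-frame path above are worth noting.
First, the coarse result gates the mode: if the integral projection
SAD for the golden frame cannot beat the SAD already recorded against
the last frame (x->pred_mv_sad[LAST_FRAME]), NEWMV is skipped for
this reference. Second, VP9 stores motion vectors in 1/8-pel units,
so the >>= 3 converts the coarse motion vector to full-pel before
find_fractional_mv_step re-derives the sub-pel fraction. (The second
hunk likewise limits the interpolation filter search to LAST_FRAME.)
A minimal, stand-alone illustration of the unit convention; the MV
struct mirrors vp9's and the values are made up:

#include <stdio.h>

typedef struct { short row, col; } MV;  /* 1/8-pel units, as in vp9 */

int main(void) {
  MV mv = { -21, 10 };  /* -2.625 and +1.25 pixels */
  /* An arithmetic right shift by 3 drops the sub-pel bits, rounding
   * toward negative infinity: (-21 >> 3) == -3, (10 >> 3) == 1. */
  MV full_pel = { (short)(mv.row >> 3), (short)(mv.col >> 3) };
  printf("full-pel mv: (%d, %d)\n", full_pel.row, full_pel.col);
  return 0;
}
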
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -61,7 +61,7 @@
                           const int ref_stride, const int height) {
   int idx;
   __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_load_si128((const __m128i *)ref);
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
   __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
   __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
   __m128i t0, t1;
@@ -69,7 +69,7 @@
   ref += ref_stride;
 
   for (idx = 1; idx < height_1; idx += 2) {
-    src_line = _mm_load_si128((const __m128i *)ref);
+    src_line = _mm_loadu_si128((const __m128i *)ref);
     t0 = _mm_unpacklo_epi8(src_line, zero);
     t1 = _mm_unpackhi_epi8(src_line, zero);
     s0 = _mm_adds_epu16(s0, t0);
@@ -76,7 +76,7 @@
     s1 = _mm_adds_epu16(s1, t1);
     ref += ref_stride;
 
-    src_line = _mm_load_si128((const __m128i *)ref);
+    src_line = _mm_loadu_si128((const __m128i *)ref);
     t0 = _mm_unpacklo_epi8(src_line, zero);
     t1 = _mm_unpackhi_epi8(src_line, zero);
     s0 = _mm_adds_epu16(s0, t0);
@@ -84,7 +84,7 @@
     ref += ref_stride;
   }
 
-  src_line = _mm_load_si128((const __m128i *)ref);
+  src_line = _mm_loadu_si128((const __m128i *)ref);
   t0 = _mm_unpacklo_epi8(src_line, zero);
   t1 = _mm_unpackhi_epi8(src_line, zero);
   s0 = _mm_adds_epu16(s0, t0);
@@ -101,9 +101,9 @@
     s1 = _mm_srai_epi16(s1, 3);
   }
 
-  _mm_store_si128((__m128i *)hbuf, s0);
+  _mm_storeu_si128((__m128i *)hbuf, s0);
   hbuf += 8;
-  _mm_store_si128((__m128i *)hbuf, s1);
+  _mm_storeu_si128((__m128i *)hbuf, s1);
 }
 
 int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
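
The vp9_avg_intrin_sse2.c hunks swap the aligned SSE2 loads and
stores for their unaligned counterparts. The ref pointer is now
offset by arbitrary candidate motion vectors (and hbuf need not be
16-byte aligned either), so _mm_load_si128, which faults on an
unaligned address, is replaced by _mm_loadu_si128, which is safe at
any address. A small stand-alone demonstration of the hazard,
assuming GCC/Clang attribute syntax for the aligned buffer:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Aligned backing store, read at an odd offset, as happens when a
   * motion vector moves the read position off a 16-byte boundary. */
  __attribute__((aligned(16))) uint8_t buf[32];
  uint8_t out[16];
  int i;
  for (i = 0; i < 32; ++i)
    buf[i] = (uint8_t)i;

  /* Safe at any address; _mm_load_si128 on buf + 3 could raise a
   * general-protection fault instead. */
  __m128i v = _mm_loadu_si128((const __m128i *)(buf + 3));
  _mm_storeu_si128((__m128i *)out, v);

  printf("%u .. %u\n", out[0], out[15]);  /* prints 3 .. 18 */
  return 0;
}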