shithub: libvpx

Download patch

ref: fcf7998a47f7e1ec27fe93f99e488d345560a9be
parent: ed995afba18ec356fa72772d20d3e2f93635b1e3
author: Ronald S. Bultje <rbultje@google.com>
date: Mon Jul 8 10:49:48 EDT 2013

Remove memcpy() in handle_inter_mode() filter selection.

Encode time of the first 50 frames of bus (speed 0) @ 1500kbps goes from
2min4.9s to 2min3.1s, i.e. a 1.4% speedup overall.

Change-Id: Ibe8b08d159797504c5d0c5122de1b6da3b6595e0

--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2569,11 +2569,14 @@
     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int_mv cur_mv[2];
   int64_t this_rd = 0;
-  unsigned char tmp_buf[MAX_MB_PLANE][64 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
   int pred_exists = 0;
   int interpolating_intpel_seen = 0;
   int intpel_mv;
   int64_t rd, best_rd = INT64_MAX;
+  int best_needs_copy = 0;
+  uint8_t *orig_dst[MAX_MB_PLANE];
+  int orig_dst_stride[MAX_MB_PLANE];
 
   switch (this_mode) {
     int rate_mv;
@@ -2630,6 +2633,16 @@
     mbmi->mv[i].as_int = cur_mv[i].as_int;
   }
 
+  // do first prediction into the destination buffer. Do the next
+  // prediction into a temporary buffer. Then keep track of which one
+  // of these currently holds the best predictor, and use the other
+  // one for future predictions. In the end, copy from tmp_buf to
+  // dst if necessary.
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    orig_dst[i] = xd->plane[i].dst.buf;
+    orig_dst_stride[i] = xd->plane[i].dst.stride;
+  }
+
   /* We don't include the cost of the second reference here, because there
    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
    * words if you present them in that order, the second one is always known
@@ -2657,7 +2670,7 @@
 
     cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX;
     for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-      int rs;
+      int rs, j;
       int64_t rs_rd;
       const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
       const int is_intpel_interp = intpel_mv &&
@@ -2679,6 +2692,20 @@
       } else {
         int rate_sum = 0;
         int64_t dist_sum = 0;
+        if ((cm->mcomp_filter_type == SWITCHABLE &&
+             i && !best_needs_copy) ||
+            (cm->mcomp_filter_type != SWITCHABLE &&
+             cm->mcomp_filter_type != mbmi->interp_filter)) {
+          for (j = 0; j < MAX_MB_PLANE; j++) {
+            xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
+            xd->plane[j].dst.stride = 64;
+          }
+        } else {
+          for (j = 0; j < MAX_MB_PLANE; j++) {
+            xd->plane[j].dst.buf = orig_dst[j];
+            xd->plane[j].dst.stride = orig_dst_stride[j];
+          }
+        }
         vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
         model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
         cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
@@ -2699,27 +2726,23 @@
       if (newbest) {
         best_rd = rd;
         *best_filter = mbmi->interp_filter;
+        if (cm->mcomp_filter_type == SWITCHABLE && i &&
+            !(interpolating_intpel_seen && is_intpel_interp))
+          best_needs_copy = !best_needs_copy;
       }
 
       if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
           (cm->mcomp_filter_type != SWITCHABLE &&
            cm->mcomp_filter_type == mbmi->interp_filter)) {
-        int p;
-
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          struct macroblockd_plane *pd = &xd->plane[p];
-          const int bw = plane_block_width(bsize, pd);
-          const int bh = plane_block_height(bsize, pd);
-          int i;
-
-          for (i = 0; i < bh; i++)
-            vpx_memcpy(&tmp_buf[p][64 * i], pd->dst.buf + i * pd->dst.stride,
-                                   bw);
-        }
         pred_exists = 1;
       }
       interpolating_intpel_seen |= is_intpel_interp;
     }
+
+    for (i = 0; i < MAX_MB_PLANE; i++) {
+      xd->plane[i].dst.buf = orig_dst[i];
+      xd->plane[i].dst.stride = orig_dst_stride[i];
+    }
   }
 
   // Set the appripriate filter
@@ -2727,18 +2750,13 @@
       cm->mcomp_filter_type : *best_filter;
   vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
 
-
   if (pred_exists) {
-    int p;
-
-    for (p = 0; p < MAX_MB_PLANE; p++) {
-      struct macroblockd_plane *pd = &xd->plane[p];
-      const int bw = plane_block_width(bsize, pd);
-      const int bh = plane_block_height(bsize, pd);
-      int i;
-
-      for (i = 0; i < bh; i++)
-        vpx_memcpy(pd->dst.buf + i * pd->dst.stride, &tmp_buf[p][64 * i], bw);
+    if (best_needs_copy) {
+      // again temporarily set the buffers to local memory to prevent a memcpy
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
+        xd->plane[i].dst.stride = 64;
+      }
     }
   } else {
     // Handles the special case when a filter that is not in the
@@ -2812,6 +2830,10 @@
     if (*rate_y == INT_MAX) {
       *rate2 = INT_MAX;
       *distortion = INT64_MAX;
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = orig_dst[i];
+        xd->plane[i].dst.stride = orig_dst_stride[i];
+      }
       return INT64_MAX;
     }
 
@@ -2833,6 +2855,11 @@
     } else {
       *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
     }
+  }
+
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].dst.buf = orig_dst[i];
+    xd->plane[i].dst.stride = orig_dst_stride[i];
   }
 
   return this_rd;  // if 0, this will be re-calculated by caller