shithub: libvpx

Download patch

ref: 0aae10007684fc3aa84ebf38a1a1d20e12ca8930
parent: ab9755f3af72addd8545db091fa7f36e368dc7fd
author: Yunqing Wang <yunqingwang@google.com>
date: Fri Jun 20 06:11:34 EDT 2014

Reuse inter prediction result in real-time speed 6

In real-time speed 6, no partition search is done. The inter
prediction results got from picking mode can be reused in the
following encoding process. A speed feature reuse_inter_pred_sby
is added to only enable the resue in speed 6.

This patch doesn't change encoding result. RTC set tests showed
that the encoding speed gain is 2% - 5%.

Change-Id: I3884780f64ef95dd8be10562926542528713b92c

--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3364,7 +3364,10 @@
       vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
                            &xd->block_refs[ref]->sf);
     }
-    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
+    if (!cpi->sf.reuse_inter_pred_sby)
+      vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
+
+    vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
 
     if (!x->skip) {
       mbmi->skip = 1;
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -23,6 +23,7 @@
 #include "vp9/common/vp9_reconintra.h"
 
 #include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_pickmode.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rdopt.h"
 
@@ -183,6 +184,22 @@
   *out_dist_sum += dist << 4;
 }
 
+static int get_pred_buffer(PRED_BUFFER *p, int len) {
+  int i;
+
+  for (i = 0; i < len; i++) {
+    if (!p[i].in_use) {
+      p[i].in_use = 1;
+      return i;
+    }
+  }
+  return -1;
+}
+
+static void free_pred_buffer(PRED_BUFFER *p) {
+  p->in_use = 0;
+}
+
 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
 // this needs various further optimizations. to be continued..
 int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -229,6 +246,31 @@
   const int pred_filter_search = (((mi_row + mi_col) >> bsl) +
                                       get_chessboard_index(cm)) % 2;
 
+  // For speed 6, the result of interp filter is reused later in actual encoding
+  // process.
+  int bh = num_4x4_blocks_high_lookup[bsize] << 2;
+  int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
+  int pixels_in_block = bh * bw;
+  // tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
+  PRED_BUFFER tmp[4];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64);
+  struct buf_2d orig_dst = pd->dst;
+  PRED_BUFFER *best_pred = NULL;
+  PRED_BUFFER *this_mode_pred = NULL;
+  int i;
+
+  if (cpi->sf.reuse_inter_pred_sby) {
+    for (i = 0; i < 3; i++) {
+      tmp[i].data = &pred_buf[pixels_in_block * i];
+      tmp[i].stride = bw;
+      tmp[i].in_use = 0;
+    }
+
+    tmp[3].data = pd->dst.buf;
+    tmp[3].stride = pd->dst.stride;
+    tmp[3].in_use = 0;
+  }
+
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
   x->skip = 0;
@@ -324,6 +366,16 @@
       // Search for the best prediction filter type, when the resulting
       // motion vector is at sub-pixel accuracy level for luma component, i.e.,
       // the last three bits are all zeros.
+      if (cpi->sf.reuse_inter_pred_sby) {
+        if (this_mode == NEARESTMV) {
+          this_mode_pred = &tmp[3];
+        } else {
+          this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+          pd->dst.buf = this_mode_pred->data;
+          pd->dst.stride = bw;
+        }
+      }
+
       if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
           pred_filter_search &&
           ((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
@@ -334,6 +386,7 @@
         unsigned int pf_sse[3];
         int64_t best_cost = INT64_MAX;
         INTERP_FILTER best_filter = SWITCHABLE, filter;
+        PRED_BUFFER *current_pred = this_mode_pred;
 
         for (filter = EIGHTTAP; filter <= EIGHTTAP_SHARP; ++filter) {
           int64_t cost;
@@ -345,12 +398,28 @@
                         vp9_get_switchable_rate(cpi) + pf_rate[filter],
                         pf_dist[filter]);
           if (cost < best_cost) {
-              best_filter = filter;
-              best_cost = cost;
-              skip_txfm = x->skip_txfm;
+            best_filter = filter;
+            best_cost = cost;
+            skip_txfm = x->skip_txfm;
+
+            if (cpi->sf.reuse_inter_pred_sby) {
+              if (this_mode_pred != current_pred) {
+                free_pred_buffer(this_mode_pred);
+                this_mode_pred = current_pred;
+              }
+
+              if (filter < EIGHTTAP_SHARP) {
+                current_pred = &tmp[get_pred_buffer(tmp, 3)];
+                pd->dst.buf = current_pred->data;
+                pd->dst.stride = bw;
+              }
+            }
           }
         }
 
+        if (cpi->sf.reuse_inter_pred_sby && this_mode_pred != current_pred)
+          free_pred_buffer(current_pred);
+
         mbmi->interp_filter = best_filter;
         rate = pf_rate[mbmi->interp_filter];
         dist = pf_dist[mbmi->interp_filter];
@@ -451,6 +520,16 @@
         best_pred_filter = mbmi->interp_filter;
         best_ref_frame = ref_frame;
         skip_txfm = x->skip_txfm;
+
+        if (cpi->sf.reuse_inter_pred_sby) {
+          if (best_pred != NULL)
+            free_pred_buffer(best_pred);
+
+          best_pred = this_mode_pred;
+        }
+      } else {
+        if (cpi->sf.reuse_inter_pred_sby)
+          free_pred_buffer(this_mode_pred);
       }
 
       if (x->skip)
@@ -458,7 +537,20 @@
     }
   }
 
+  // If best prediction is not in dst buf, then copy the prediction block from
+  // temp buf to dst buf.
+  if (cpi->sf.reuse_inter_pred_sby && best_pred->data != orig_dst.buf) {
+    uint8_t *copy_from, *copy_to;
 
+    pd->dst = orig_dst;
+    copy_to = pd->dst.buf;
+
+    copy_from = best_pred->data;
+
+    vp9_convolve_copy(copy_from, bw, copy_to, pd->dst.stride, NULL, 0, NULL, 0,
+                      bw, bh);
+  }
+
   mbmi->mode = best_mode;
   mbmi->interp_filter = best_pred_filter;
   mbmi->ref_frame[0] = best_ref_frame;
@@ -471,6 +563,11 @@
   if (!x->skip && best_rd > inter_mode_thresh &&
       bsize <= cpi->sf.max_intra_bsize) {
     for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
+      if (cpi->sf.reuse_inter_pred_sby) {
+        pd->dst.buf = tmp[0].data;
+        pd->dst.stride = bw;
+      }
+
       vp9_predict_intra_block(xd, 0, b_width_log2(bsize),
                               mbmi->tx_size, this_mode,
                               &p->src.buf[0], p->src.stride,
@@ -477,6 +574,10 @@
                               &pd->dst.buf[0], pd->dst.stride, 0, 0, 0);
 
       model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+
+      if (cpi->sf.reuse_inter_pred_sby)
+        pd->dst = orig_dst;
+
       rate += cpi->mbmode_cost[this_mode];
       rate += intra_cost_penalty;
       this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
@@ -494,6 +595,7 @@
       }
     }
   }
+
 #if CONFIG_DENOISING
   vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col, bsize);
 #endif
--- a/vp9/encoder/vp9_pickmode.h
+++ b/vp9/encoder/vp9_pickmode.h
@@ -17,6 +17,12 @@
 extern "C" {
 #endif
 
+typedef struct {
+  uint8_t *data;
+  int stride;
+  int in_use;
+} PRED_BUFFER;
+
 int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                             const struct TileInfo *const tile,
                             int mi_row, int mi_col,
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -274,6 +274,9 @@
     // is checked for a partition block. Later, we can try to allow large
     // partitions to do intra mode checking.
     sf->max_intra_bsize = BLOCK_8X8;
+
+    // This feature is only enabled when partition search is disabled.
+    sf->reuse_inter_pred_sby = 1;
   }
 
   if (speed >= 7) {
@@ -339,6 +342,7 @@
   for (i = 0; i < BLOCK_SIZES; ++i)
     sf->inter_mode_mask[i] = INTER_ALL;
   sf->max_intra_bsize = BLOCK_64X64;
+  sf->reuse_inter_pred_sby = 0;
   // This setting only takes effect when partition_search_type is set
   // to FIXED_PARTITION.
   sf->always_this_block_size = BLOCK_16X16;
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -353,6 +353,11 @@
 
   // The threshold used in SOURCE_VAR_BASED_PARTITION search type.
   unsigned int source_var_thresh;
+
+  // When partition is pre-set, the inter prediction result from pick_inter_mode
+  // can be reused in final block encoding process. It is enabled only for real-
+  // time mode speed 6.
+  int reuse_inter_pred_sby;
 } SPEED_FEATURES;
 
 struct VP9_COMP;