shithub: libvpx

ref: 6b7848d4c9016efbfbc9529df008ffde0e15b7cc
parent: 2beb5c9f91e7166c2c9d01c94bf84767815121e4
author: Hui Su <huisu@google.com>
date: Sat Sep 29 10:48:56 EDT 2018

Introduce the ml_var_partition_pruning feature

Add the ml_var_partition_pruning encoder speed feature that
uses a neural net model to prune partition-none and partition-split
search. The model uses prediction residue variance and quantization
step size as input features.

Encoding speed gain for speed 0 (tested over 20 hdres clips):
            QP=30    QP=40
average     17.7%    18.3%
max        24.46%    26.6%

Coding loss:
lowres 0.071%;  midres 0.098%;  hdres 0.163%

Currently it is enabled only for speed 0 with low bit depth. It needs to be
tuned for other settings.

Change-Id: Ifb7417daa6bb6e7c97bb676269ce54ab0dc7b8c8
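
For reference, the six model inputs and the pruning rule added below boil
down to the following standalone sketch (distilled from the new
ml_predict_var_rd_paritioning() code in the diff; the helper names are
illustrative and not part of the patch):

  #include <math.h>

  /* The six model inputs: a quantization term, the log variance of the whole
   * prediction residue block, and each quarter block's residue variance as a
   * ratio of the whole-block variance. */
  static void fill_var_rd_features(int dc_q, unsigned int var,
                                   const unsigned int sub_var[4],
                                   float features[6]) {
    const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
    int i;
    features[0] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f);
    features[1] = logf((float)var + 1.0f);
    for (i = 0; i < 4; ++i)
      features[2 + i] = (var == 0) ? 1.0f : factor * (float)sub_var[i];
  }

  /* A high score prunes the PARTITION_NONE search, a low score prunes the
   * PARTITION_SPLIT search, and anything in between keeps both candidates. */
  static void apply_var_rd_score(float score, float thresh_low,
                                 float thresh_high, int *none, int *split) {
    if (score > thresh_high) *none = 0;
    if (score < thresh_low) *split = 0;
  }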

--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3393,6 +3393,140 @@
 #undef FEATURES
 #undef LABELS
 
+// Use a neural net model to prune partition-none and partition-split search.
+// The model uses prediction residue variance and quantization step size as
+// input features.
+#define FEATURES 6
+static void ml_predict_var_rd_paritioning(VP9_COMP *cpi, MACROBLOCK *x,
+                                          BLOCK_SIZE bsize, int mi_row,
+                                          int mi_col, int *none, int *split) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mi[0];
+  const NN_CONFIG *nn_config = NULL;
+  DECLARE_ALIGNED(16, uint8_t, pred_buf[64 * 64]);
+  int i;
+  float thresh_low = -1.0f;
+  float thresh_high = 0.0f;
+
+  switch (bsize) {
+    case BLOCK_64X64:
+      nn_config = &vp9_var_rd_part_nnconfig_64;
+      thresh_low = -3.0f;
+      thresh_high = 3.0f;
+      break;
+    case BLOCK_32X32:
+      nn_config = &vp9_var_rd_part_nnconfig_32;
+      thresh_low = -3.0;
+      thresh_high = 3.0f;
+      break;
+    case BLOCK_16X16:
+      nn_config = &vp9_var_rd_part_nnconfig_16;
+      thresh_low = -4.0;
+      thresh_high = 4.0f;
+      break;
+    case BLOCK_8X8:
+      nn_config = &vp9_var_rd_part_nnconfig_8;
+      thresh_low = -2.0;
+      thresh_high = 2.0f;
+      break;
+    default: assert(0 && "Unexpected block size."); return;
+  }
+
+  if (!nn_config) return;
+
+  mi->ref_frame[1] = NONE;
+  mi->sb_type = bsize;
+  // Do a simple single motion search to find a prediction for current block.
+  // The variance of the residue will be used as input features.
+  {
+    const MV_REFERENCE_FRAME ref =
+        cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+    YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref);
+    MV ref_mv = { 0, 0 };
+    MV ref_mv_full = { 0, 0 };
+    const int step_param = 1;
+    const MvLimits tmp_mv_limits = x->mv_limits;
+    const SEARCH_METHODS search_method = NSTEP;
+    const int sadpb = x->sadperbit16;
+    MV best_mv = { 0, 0 };
+    int cost_list[5];
+
+    assert(yv12 != NULL);
+    if (!yv12) return;
+    vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+                         &cm->frame_refs[ref - 1].sf);
+    mi->ref_frame[0] = ref;
+    vp9_set_mv_search_range(&x->mv_limits, &ref_mv);
+    vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param,
+                          search_method, sadpb, cond_cost_list(cpi, cost_list),
+                          &ref_mv, &best_mv, 0, 0);
+    best_mv.row *= 8;
+    best_mv.col *= 8;
+    x->mv_limits = tmp_mv_limits;
+    mi->mv[0].as_mv = best_mv;
+
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+    xd->plane[0].dst.buf = pred_buf;
+    xd->plane[0].dst.stride = 64;
+    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+  }
+
+  vpx_clear_system_state();
+
+  {
+    float features[FEATURES] = { 0.0f };
+    const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth);
+    int feature_idx = 0;
+    float score;
+
+    // Generate model input features.
+    features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f);
+    vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
+    // Get the variance of the residue as input features.
+    {
+      const int bs = 4 * num_4x4_blocks_wide_lookup[bsize];
+      const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+      const uint8_t *pred = pred_buf;
+      const uint8_t *src = x->plane[0].src.buf;
+      const int src_stride = x->plane[0].src.stride;
+      const int pred_stride = 64;
+      unsigned int sse;
+      // Variance of whole block.
+      const unsigned int var =
+          cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+      const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+
+      features[feature_idx++] = logf((float)var + 1.0f);
+      for (i = 0; i < 4; ++i) {
+        const int x_idx = (i & 1) * bs / 2;
+        const int y_idx = (i >> 1) * bs / 2;
+        const int src_offset = y_idx * src_stride + x_idx;
+        const int pred_offset = y_idx * pred_stride + x_idx;
+        // Variance of quarter block.
+        const unsigned int sub_var =
+            cpi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+                                    pred + pred_offset, pred_stride, &sse);
+        const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+        features[feature_idx++] = var_ratio;
+      }
+    }
+    assert(feature_idx == FEATURES);
+
+    // Feed the features into the model to get the confidence score.
+    nn_predict(features, nn_config, &score);
+
+    // Higher score means that the model has higher confidence that the split
+    // partition is better than the non-split partition. So if the score is
+    // high enough, we skip the none-split partition search; if the score is
+    // low enough, we skip the split partition search.
+    if (score > thresh_high) *none = 0;
+    if (score < thresh_low) *split = 0;
+  }
+}
+#undef FEATURES
+#undef LABELS
+
 int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col,
                      int orig_rdmult) {
   TplDepFrame *tpl_frame = &cpi->tpl_stats[cpi->twopass.gf_group.index];
@@ -3624,6 +3758,21 @@
 
   pc_tree->partitioning = PARTITION_NONE;
 
+  if (cpi->sf.ml_var_partition_pruning) {
+    int do_ml_var_partition_pruning =
+        !frame_is_intra_only(cm) && partition_none_allowed && do_split &&
+        mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows &&
+        mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      do_ml_var_partition_pruning = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    if (do_ml_var_partition_pruning) {
+      ml_predict_var_rd_paritioning(cpi, x, bsize, mi_row, mi_col,
+                                    &partition_none_allowed, &do_split);
+    }
+  }
+
   // PARTITION_NONE
   if (partition_none_allowed) {
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, ctx,
@@ -3738,6 +3887,9 @@
       }
     }
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  } else {
+    vp9_zero(ctx->pred_mv);
+    ctx->mic.interp_filter = EIGHTTAP;
   }
 
   // store estimated motion vector
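
The confidence score in the new function above comes from nn_predict(), which
is not part of this patch. For the single-hidden-layer models added to
vp9_partition_models.h it amounts to a plain fully connected evaluation: ReLU
on the hidden nodes, a linear output node, and layer-0 weights stored one row
of num_inputs values per hidden node. A simplified standalone sketch (the
function name is illustrative; this is not the library code):

  static float nn_forward_one_hidden_layer(const float *features,
                                           int num_inputs, const float *w0,
                                           const float *b0, int num_hidden,
                                           const float *w1, float b1) {
    float score = b1;
    int node, i;
    for (node = 0; node < num_hidden; ++node) {
      float val = b0[node];  /* hidden node: bias + dot(weights, features) */
      for (i = 0; i < num_inputs; ++i)
        val += w0[node * num_inputs + i] * features[i];
      if (val < 0.0f) val = 0.0f;  /* ReLU activation */
      score += w1[node] * val;     /* accumulate into the linear output */
    }
    return score;
  }

With vp9_var_rd_part_nnconfig_64 below, for example, num_inputs is 6 and
num_hidden is 8.
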
--- a/vp9/encoder/vp9_partition_models.h
+++ b/vp9/encoder/vp9_partition_models.h
@@ -18,7 +18,9 @@
 #define NN_MAX_HIDDEN_LAYERS 10
 #define NN_MAX_NODES_PER_LAYER 128
 
-// Neural net model config.
+// Neural net model config. It defines the layout of a neural net model, such as
+// the number of inputs/outputs, number of layers, the number of nodes in each
+// layer, as well as the weights and bias of each node.
 typedef struct {
   int num_inputs;         // Number of input nodes, i.e. features.
   int num_outputs;        // Number of output nodes.
@@ -963,6 +965,178 @@
 };
 #undef FEATURES
 #endif  // CONFIG_ML_VAR_PARTITION
+
+#define FEATURES 6
+#define LABELS 1
+static const float vp9_var_rd_part_nn_weights_64_layer0[FEATURES * 8] = {
+  -0.100129f, 0.128867f,  -1.375086f, -2.268096f, -1.470368f, -2.296274f,
+  0.034445f,  -0.062993f, -2.151904f, 0.523215f,  1.611269f,  1.530051f,
+  0.418182f,  -1.330239f, 0.828388f,  0.386546f,  -0.026188f, -0.055459f,
+  -0.474437f, 0.861295f,  -2.208743f, -0.652991f, -2.985873f, -1.728956f,
+  0.388052f,  -0.420720f, 2.015495f,  1.280342f,  3.040914f,  1.760749f,
+  -0.009062f, 0.009623f,  1.579270f,  -2.012891f, 1.629662f,  -1.796016f,
+  -0.279782f, -0.288359f, 1.875618f,  1.639855f,  0.903020f,  0.906438f,
+  0.553394f,  -1.621589f, 0.185063f,  0.605207f,  -0.133560f, 0.588689f,
+};
+
+static const float vp9_var_rd_part_nn_bias_64_layer0[8] = {
+  0.659717f, 0.120912f, 0.329894f, -1.586385f,
+  1.715839f, 0.085754f, 2.038774f, 0.268119f,
+};
+
+static const float vp9_var_rd_part_nn_weights_64_layer1[8 * LABELS] = {
+  -3.445586f, 2.375620f, 1.236970f, 0.804030f,
+  -2.448384f, 2.827254f, 2.291478f, 0.790252f,
+};
+
+static const float vp9_var_rd_part_nn_bias_64_layer1[LABELS] = {
+  -1.16608453f,
+};
+
+static const NN_CONFIG vp9_var_rd_part_nnconfig_64 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      vp9_var_rd_part_nn_weights_64_layer0,
+      vp9_var_rd_part_nn_weights_64_layer1,
+  },
+  {
+      vp9_var_rd_part_nn_bias_64_layer0,
+      vp9_var_rd_part_nn_bias_64_layer1,
+  },
+};
+
+static const float vp9_var_rd_part_nn_weights_32_layer0[FEATURES * 8] = {
+  0.022420f,  -0.032201f, 1.228065f,  -2.767655f, 1.928743f,  0.566863f,
+  0.459229f,  0.422048f,  0.833395f,  0.822960f,  -0.232227f, 0.586895f,
+  0.442856f,  -0.018564f, 0.227672f,  -1.291306f, 0.119428f,  -0.776563f,
+  -0.042947f, 0.183129f,  0.592231f,  1.174859f,  -0.503868f, 0.270102f,
+  -0.330537f, -0.036340f, 1.144630f,  1.783710f,  1.216929f,  2.038085f,
+  0.373782f,  -0.430258f, 1.957002f,  1.383908f,  2.012261f,  1.585693f,
+  -0.394399f, -0.337523f, -0.238335f, 0.007819f,  -0.368294f, 0.437875f,
+  -0.318923f, -0.242000f, 2.276263f,  1.501432f,  0.645706f,  0.344774f,
+};
+
+static const float vp9_var_rd_part_nn_bias_32_layer0[8] = {
+  -0.023846f, -1.348117f, 1.365007f,  -1.644164f,
+  0.062992f,  1.257980f,  -0.098642f, 1.388472f,
+};
+
+static const float vp9_var_rd_part_nn_weights_32_layer1[8 * LABELS] = {
+  3.016729f, 0.622684f,  -1.021302f, 1.490383f,
+  1.702046f, -2.964618f, 0.689045f,  1.711754f,
+};
+
+static const float vp9_var_rd_part_nn_bias_32_layer1[LABELS] = {
+  -1.28798676f,
+};
+
+static const NN_CONFIG vp9_var_rd_part_nnconfig_32 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      vp9_var_rd_part_nn_weights_32_layer0,
+      vp9_var_rd_part_nn_weights_32_layer1,
+  },
+  {
+      vp9_var_rd_part_nn_bias_32_layer0,
+      vp9_var_rd_part_nn_bias_32_layer1,
+  },
+};
+
+static const float vp9_var_rd_part_nn_weights_16_layer0[FEATURES * 8] = {
+  -0.726813f, -0.026748f, 1.376946f,  1.467961f,  1.961810f,  1.690412f,
+  0.596484f,  -0.261486f, -0.310905f, -0.366311f, -1.300086f, -0.534336f,
+  0.040520f,  -0.032391f, -1.194214f, 2.438063f,  -3.915334f, 1.997270f,
+  0.673696f,  -0.676393f, 1.654886f,  1.553838f,  1.129691f,  1.360201f,
+  0.255001f,  0.336442f,  -0.487759f, -0.634555f, 0.479170f,  -0.110475f,
+  -0.661852f, -0.158872f, -0.350243f, -0.303957f, -0.045018f, 0.586151f,
+  -0.262463f, 0.228079f,  -1.688776f, -1.594502f, -2.261078f, -1.802535f,
+  0.034748f,  -0.028476f, 2.713258f,  0.212446f,  -1.529202f, -2.560178f,
+};
+
+static const float vp9_var_rd_part_nn_bias_16_layer0[8] = {
+  0.495983f,  1.858545f, 0.162974f, 1.992247f,
+  -2.698863f, 0.110020f, 0.550830f, 0.420941f,
+};
+
+static const float vp9_var_rd_part_nn_weights_16_layer1[8 * LABELS] = {
+  1.768409f, -1.394240f, 1.076846f,  -1.762808f,
+  1.517405f, 0.535195f,  -0.426827f, 1.002272f,
+};
+
+static const float vp9_var_rd_part_nn_bias_16_layer1[LABELS] = {
+  -1.65894794f,
+};
+
+static const NN_CONFIG vp9_var_rd_part_nnconfig_16 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      vp9_var_rd_part_nn_weights_16_layer0,
+      vp9_var_rd_part_nn_weights_16_layer1,
+  },
+  {
+      vp9_var_rd_part_nn_bias_16_layer0,
+      vp9_var_rd_part_nn_bias_16_layer1,
+  },
+};
+
+static const float vp9_var_rd_part_nn_weights_8_layer0[FEATURES * 8] = {
+  -0.804900f, -1.214983f, 0.840202f, 0.686566f,  0.155804f,  0.025542f,
+  -1.244635f, -0.368403f, 0.364150f, 1.081073f,  0.552387f,  0.452715f,
+  0.652968f,  -0.293058f, 0.048967f, 0.021240f,  -0.662981f, 0.424700f,
+  0.008293f,  -0.013088f, 0.747007f, -1.453907f, -1.498226f, 1.593252f,
+  -0.239557f, -0.143766f, 0.064311f, 1.320998f,  -0.477411f, 0.026374f,
+  0.730884f,  -0.675124f, 0.965521f, 0.863658f,  0.809186f,  0.812280f,
+  0.513131f,  0.185102f,  0.211354f, 0.793666f,  0.121714f,  -0.015383f,
+  -0.650980f, -0.046581f, 0.911141f, 0.806319f,  0.974773f,  0.815893f,
+};
+
+static const float vp9_var_rd_part_nn_bias_8_layer0[8] = {
+  0.176134f, 0.651308f, 2.007761f,  0.068812f,
+  1.061517f, 1.487161f, -2.308147f, 1.099828f,
+};
+
+static const float vp9_var_rd_part_nn_weights_8_layer1[8 * LABELS] = {
+  0.683032f, 1.326393f,  -1.661539f, 1.438920f,
+  1.118023f, -2.237380f, 1.518468f,  2.010416f,
+};
+
+static const float vp9_var_rd_part_nn_bias_8_layer1[LABELS] = {
+  -1.65423989f,
+};
+
+static const NN_CONFIG vp9_var_rd_part_nnconfig_8 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      vp9_var_rd_part_nn_weights_8_layer0,
+      vp9_var_rd_part_nn_weights_8_layer1,
+  },
+  {
+      vp9_var_rd_part_nn_bias_8_layer0,
+      vp9_var_rd_part_nn_bias_8_layer1,
+  },
+};
+#undef FEATURES
+#undef LABELS
 
 // Partition pruning model(linear).
 static const float vp9_partition_feature_mean[24] = {
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -219,6 +219,7 @@
   sf->less_rectangular_check = 1;
   sf->use_square_partition_only = !boosted;
   sf->prune_ref_frame_for_rect_partitions = 1;
+  sf->ml_var_partition_pruning = 1;
 
   sf->ml_prune_rect_partition_threhold[0] = -1;
   sf->ml_prune_rect_partition_threhold[1] = 350;
@@ -241,6 +242,7 @@
 
   if (speed >= 1) {
     sf->enable_tpl_model = 0;
+    sf->ml_var_partition_pruning = 0;
     sf->ml_prune_rect_partition_threhold[1] = 200;
     sf->ml_prune_rect_partition_threhold[2] = 200;
     sf->ml_prune_rect_partition_threhold[3] = 200;
@@ -939,6 +941,7 @@
   sf->ml_prune_rect_partition_threhold[1] = -1;
   sf->ml_prune_rect_partition_threhold[2] = -1;
   sf->ml_prune_rect_partition_threhold[3] = -1;
+  sf->ml_var_partition_pruning = 0;
 
   // Some speed-up features even for best quality as minimal impact on quality.
   sf->adaptive_rd_thresh = 1;
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -515,6 +515,10 @@
   // Machine-learning based partition search early termination
   int ml_partition_search_early_termination;
 
+  // Machine-learning based partition search pruning using prediction residue
+  // variance.
+  int ml_var_partition_pruning;
+
   // Allow skipping partition search for still image frame
   int allow_partition_search_skip;
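
Net effect of the speed-feature hunks: ml_var_partition_pruning defaults to 0,
is enabled in the good-quality speed 0 settings, and is cleared again for
speed >= 1, matching the commit message's note that the feature is enabled
only for speed 0 with low bit depth. The call site in vp9_encodeframe.c adds a
few more conditions, summarized by this hypothetical helper (illustrative
only, not part of the patch):

  /* Pruning is attempted only for inter frames, when both PARTITION_NONE and
   * PARTITION_SPLIT are still candidates, when the block lies entirely inside
   * the frame, and only for 8-bit input in high-bit-depth builds. */
  static int try_ml_var_partition_pruning(int sf_flag, int frame_is_intra,
                                          int partition_none_allowed,
                                          int do_split, int block_inside_frame,
                                          int is_high_bitdepth) {
    return sf_flag && !frame_is_intra && partition_none_allowed && do_split &&
           block_inside_frame && !is_high_bitdepth;
  }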