shithub: libvpx

ref: 6b7848d4c9016efbfbc9529df008ffde0e15b7cc
parent: 2beb5c9f91e7166c2c9d01c94bf84767815121e4
author: Hui Su <huisu@google.com>
date: Sat Sep 29 10:48:56 EDT 2018

Introduce the ml_var_partition_pruning feature

Add the ml_var_partition_pruning encoder speed feature that
uses a neural net model to prune partition-none and partition-split
search. The model uses prediction residue variance and quantization
step size as input features.

Encoding speed gain for speed 0 (tested over 20 hdres clips):
            QP=30    QP=40
average     17.7%    18.3%
max        24.46%    26.6%

Coding loss:
lowres 0.071%;  midres 0.098%;  hdres 0.163%

Currently it is enabled only for speed 0 with low bit depth. It needs to be
tuned for other settings.

Change-Id: Ifb7417daa6bb6e7c97bb676269ce54ab0dc7b8c8
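
For reference, the six model inputs and the pruning rule added below boil
down to the following standalone sketch (distilled from the new
ml_predict_var_rd_paritioning() code in the diff; the helper names are
illustrative and not part of the patch):

  #include <math.h>

  /* The six model inputs: a quantization term, the log variance of the whole
   * prediction residue block, and each quarter block's residue variance as a
   * ratio of the whole-block variance. */
  static void fill_var_rd_features(int dc_q, unsigned int var,
                                   const unsigned int sub_var[4],
                                   float features[6]) {
    const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
    int i;
    features[0] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f);
    features[1] = logf((float)var + 1.0f);
    for (i = 0; i < 4; ++i)
      features[2 + i] = (var == 0) ? 1.0f : factor * (float)sub_var[i];
  }

  /* A high score prunes the PARTITION_NONE search, a low score prunes the
   * PARTITION_SPLIT search, and anything in between keeps both candidates. */
  static void apply_var_rd_score(float score, float thresh_low,
                                 float thresh_high, int *none, int *split) {
    if (score > thresh_high) *none = 0;
    if (score < thresh_low) *split = 0;
  }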

--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3393,6 +3393,140 @@
 #undef FEATURES
 #undef LABELS
 
+// Use a neural net model to prune partition-none and partition-split search.
+// The model uses prediction residue variance and quantization step size as
+// input features.
+#define FEATURES 6
+static void ml_predict_var_rd_paritioning(VP9_COMP *cpi, MACROBLOCK *x,
+                                          BLOCK_SIZE bsize, int mi_row,
+                                          int mi_col, int *none, int *split) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mi[0];
+  const NN_CONFIG *nn_config = NULL;
+  DECLARE_ALIGNED(16, uint8_t, pred_buf[64 * 64]);
+  int i;
+  float thresh_low = -1.0f;
+  float thresh_high = 0.0f;
+
+  switch (bsize) {
+    case BLOCK_64X64:
+      nn_config = &vp9_var_rd_part_nnconfig_64;
+      thresh_low = -3.0f;
+      thresh_high = 3.0f;
+      break;
+    case BLOCK_32X32:
+      nn_config = &vp9_var_rd_part_nnconfig_32;
+      thresh_low = -3.0;
+      thresh_high = 3.0f;
+      break;
+    case BLOCK_16X16:
+      nn_config = &vp9_var_rd_part_nnconfig_16;
+      thresh_low = -4.0;
+      thresh_high = 4.0f;
+      break;
+    case BLOCK_8X8:
+      nn_config = &vp9_var_rd_part_nnconfig_8;
+      thresh_low = -2.0;
+      thresh_high = 2.0f;
+      break;
+    default: assert(0 && "Unexpected block size."); return;
+  }
+
+  if (!nn_config) return;
+
+  mi->ref_frame[1] = NONE;
+  mi->sb_type = bsize;
+  // Do a simple single motion search to find a prediction for current block.
+  // The variance of the residue will be used as input features.
+  {
+    const MV_REFERENCE_FRAME ref =
+        cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+    YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref);
+    MV ref_mv = { 0, 0 };
+    MV ref_mv_full = { 0, 0 };
+    const int step_param = 1;
+    const MvLimits tmp_mv_limits = x->mv_limits;
+    const SEARCH_METHODS search_method = NSTEP;
+    const int sadpb = x->sadperbit16;
+    MV best_mv = { 0, 0 };
+    int cost_list[5];
+
+    assert(yv12 != NULL);
+    if (!yv12) return;
+    vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+                         &cm->frame_refs[ref - 1].sf);
+    mi->ref_frame[0] = ref;
+    vp9_set_mv_search_range(&x->mv_limits, &ref_mv);
+    vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param,
+                          search_method, sadpb, cond_cost_list(cpi, cost_list),
+                          &ref_mv, &best_mv, 0, 0);
+    best_mv.row *= 8;
+    best_mv.col *= 8;
+    x->mv_limits = tmp_mv_limits;
+    mi->mv[0].as_mv = best_mv;
+
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+    xd->plane[0].dst.buf = pred_buf;
+    xd->plane[0].dst.stride = 64;
+    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+  }
+
+  vpx_clear_system_state();
+
+  {
+    float features[FEATURES] = { 0.0f };
+    const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth);
+    int feature_idx = 0;
+    float score;
+
+    // Generate model input features.
+    features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f);
+    vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
+    // Get the variance of the residue as input features.
+    {
+      const int bs = 4 * num_4x4_blocks_wide_lookup[bsize];
+      const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+      const uint8_t *pred = pred_buf;
+      const uint8_t *src = x->plane[0].src.buf;
+      const int src_stride = x->plane[0].src.stride;
+      const int pred_stride = 64;
+      unsigned int sse;
+      // Variance of whole block.
+      const unsigned int var =
+          cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+      const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+
+      features[feature_idx++] = logf((float)var + 1.0f);
+      for (i = 0; i < 4; ++i) {
+        const int x_idx = (i & 1) * bs / 2;
+        const int y_idx = (i >> 1) * bs / 2;
+        const int src_offset = y_idx * src_stride + x_idx;
+        const int pred_offset = y_idx * pred_stride + x_idx;
+        // Variance of quarter block.
+        const unsigned int sub_var =
+            cpi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+                                    pred + pred_offset, pred_stride, &sse);
+        const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+        features[feature_idx++] = var_ratio;
+      }
+    }
+    assert(feature_idx == FEATURES);
+
+    // Feed the features into the model to get the confidence score.
+    nn_predict(features, nn_config, &score);
+
+    // Higher score means that the model has higher confidence that the split
+    // partition is better than the non-split partition. So if the score is
+    // high enough, we skip the none-split partition search; if the score is
+    // low enough, we skip the split partition search.
+    if (score > thresh_high) *none = 0;
+    if (score < thresh_low) *split = 0;
+  }
+}
+#undef FEATURES
+#undef LABELS
+
 int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col,
                      int orig_rdmult) {
   TplDepFrame *tpl_frame = &cpi->tpl_stats[cpi->twopass.gf_group.index];
@@ -3624,6 +3758,21 @@
 
   pc_tree->partitioning = PARTITION_NONE;
 
+  if (cpi->sf.ml_var_partition_pruning) {
+    int do_ml_var_partition_pruning =
+        !frame_is_intra_only(cm) && partition_none_allowed && do_split &&
+        mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows &&
+        mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      do_ml_var_partition_pruning = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    if (do_ml_var_partition_pruning) {
+      ml_predict_var_rd_paritioning(cpi, x, bsize, mi_row, mi_col,
+                                    &partition_none_allowed, &do_split);
+    }
+  }
+
   // PARTITION_NONE
   if (partition_none_allowed) {
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, ctx,
@@ -3738,6 +3887,9 @@
       }
     }
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  } else {
+    vp9_zero(ctx->pred_mv);
+    ctx->mic.interp_filter = EIGHTTAP;
   }
 
   // store estimated motion vector
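
The confidence score in the new function above comes from nn_predict(), which
is not part of this patch. For the single-hidden-layer models added to
vp9_partition_models.h it amounts to a plain fully connected evaluation: ReLU
on the hidden nodes, a linear output node, and layer-0 weights stored one row
of num_inputs values per hidden node. A simplified standalone sketch (the
function name is illustrative; this is not the library code):

  static float nn_forward_one_hidden_layer(const float *features,
                                           int num_inputs, const float *w0,
                                           const float *b0, int num_hidden,
                                           const float *w1, float b1) {
    float score = b1;
    int node, i;
    for (node = 0; node < num_hidden; ++node) {
      float val = b0[node];  /* hidden node: bias + dot(weights, features) */
      for (i = 0; i < num_inputs; ++i)
        val += w0[node * num_inputs + i] * features[i];
      if (val < 0.0f) val = 0.0f;  /* ReLU activation */
      score += w1[node] * val;     /* accumulate into the linear output */
    }
    return score;
  }

With vp9_var_rd_part_nnconfig_64 below, for example, num_inputs is 6 and
num_hidden is 8.
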
--- a/vp9/encoder/vp9_partition_models.h
+++ b/vp9/encoder/vp9_partition_models.h
@@ -18,7 +18,9 @@
 #define NN_MAX_HIDDEN_LAYERS 10
 #define NN_MAX_NODES_PER_LAYER 128
 
-// Neural net model config.
+// Neural net model config. It defines the layout of a neural net model, such as
+// the number of inputs/outputs, number of layers, the number of nodes in each
+// layer, as well as the weights and bias of each node.
 typedef struct {
   int num_inputs;         // Number of input nodes, i.e. features.
   int num_outputs;        // Number of output nodes.
@@ -963,6 +965,178 @@
 };
 #undef FEATURES
 #endif  // CONFIG_ML_VAR_PARTITION
+
+#define FEATURES 6
+#define LABELS 1
+static const float vp9_var_rd_part_nn_weights_64_layer0[FEATURES * 8] = {
+  -0.100129f, 0.128867f,  -1.375086f, -2.268096f, -1.470368f, -2.296274f,
+  0.034445f,  -0.062993f, -2.151904f, 0.523215f,  1.611269f,  1.530051f,
+  0.418182f,  -1.330239f, 0.828388f,  0.386546f,  -0.026188f, -0.055459f,
+  -0.474437f, 0.861295f,  -2.208743f, -0.652991f, -2.985873f, -1.728956f,
+  0.388052f,  -0.420720f, 2.015495f,  1.280342f,  3.040914f,  1.760749f,
+  -0.009062f, 0.009623f,  1.579270f,  -2.012891f, 1.629662f,  -1.796016f,
+  -0.279782f, -0.288359f, 1.875618f,  1.639855f,  0.903020f,  0.906438f,
+  0.553394f,  -1.621589f, 0.185063f,  0.605207f,  -0.133560f, 0.588689f,
+};
+
+static const float vp9_var_rd_part_nn_bias_64_layer0[8] = {
+  0.659717f, 0.120912f, 0.329894f, -1.586385f,
+  1.715839f, 0.085754f, 2.038774f, 0.268119f,
+};
+
+static const float vp9_var_rd_part_nn_weights_64_layer1[8 * LABELS] = {
+  -3.445586f, 2.375620f, 1.236970f, 0.804030f,
+  -2.448384f, 2.827254f, 2.291478f, 0.790252f,
+};
+
+static const float vp9_var_rd_part_nn_bias_64_layer1[LABELS] = {
+  -1.16608453f,
+};
+
+static const NN_CONFIG vp9_var_rd_part_nnconfig_64 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      vp9_var_rd_part_nn_weights_64_layer0,
+      vp9_var_rd_part_nn_weights_64_layer1,
+  },
+  {
+      vp9_var_rd_part_nn_bias_64_layer0,
+      vp9_var_rd_part_nn_bias_64_layer1,
+  },
+};
+
+static const float vp9_var_rd_part_nn_weights_32_layer0[FEATURES * 8] = {
+  0.022420f,  -0.032201f, 1.228065f,  -2.767655f, 1.928743f,  0.566863f,
+  0.459229f,  0.422048f,  0.833395f,  0.822960f,  -0.232227f, 0.586895f,
+  0.442856f,  -0.018564f, 0.227672f,  -1.291306f, 0.119428f,  -0.776563f,
+  -0.042947f, 0.183129f,  0.592231f,  1.174859f,  -0.503868f, 0.270102f,
+  -0.330537f, -0.036340f, 1.144630f,  1.783710f,  1.216929f,  2.038085f,
+  0.373782f,  -0.430258f, 1.957002f,  1.383908f,  2.012261f,  1.585693f,
+  -0.394399f, -0.337523f, -0.238335f, 0.007819f,  -0.368294f, 0.437875f,
+  -0.318923f, -0.242000f, 2.276263f,  1.501432f,  0.645706f,  0.344774f,
+};
+
+static const float vp9_var_rd_part_nn_bias_32_layer0[8] = {
+  -0.023846f, -1.348117f, 1.365007f,  -1.644164f,
+  0.062992f,  1.257980f,  -0.098642f, 1.388472f,
+};
+
+static const float vp9_var_rd_part_nn_weights_32_layer1[8 * LABELS] = {
+  3.016729f, 0.622684f,  -1.021302f, 1.490383f,
+  1.702046f, -2.964618f, 0.689045f,  1.711754f,
+};
+
+static const float vp9_var_rd_part_nn_bias_32_layer1[LABELS] = {
+  -1.28798676f,
+};
+
+static const NN_CONFIG vp9_var_rd_part_nnconfig_32 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      vp9_var_rd_part_nn_weights_32_layer0,
+      vp9_var_rd_part_nn_weights_32_layer1,
+  },
+  {
+      vp9_var_rd_part_nn_bias_32_layer0,
+      vp9_var_rd_part_nn_bias_32_layer1,
+  },
+};
+
+static const float vp9_var_rd_part_nn_weights_16_layer0[FEATURES * 8] = {
+  -0.726813f, -0.026748f, 1.376946f,  1.467961f,  1.961810f,  1.690412f,
+  0.596484f,  -0.261486f, -0.310905f, -0.366311f, -1.300086f, -0.534336f,
+  0.040520f,  -0.032391f, -1.194214f, 2.438063f,  -3.915334f, 1.997270f,
+  0.673696f,  -0.676393f, 1.654886f,  1.553838f,  1.129691f,  1.360201f,
+  0.255001f,  0.336442f,  -0.487759f, -0.634555f, 0.479170f,  -0.110475f,
+  -0.661852f, -0.158872f, -0.350243f, -0.303957f, -0.045018f, 0.586151f,
+  -0.262463f, 0.228079f,  -1.688776f, -1.594502f, -2.261078f, -1.802535f,
+  0.034748f,  -0.028476f, 2.713258f,  0.212446f,  -1.529202f, -2.560178f,
+};
+
+static const float vp9_var_rd_part_nn_bias_16_layer0[8] = {
+  0.495983f,  1.858545f, 0.162974f, 1.992247f,
+  -2.698863f, 0.110020f, 0.550830f, 0.420941f,
+};
+
+static const float vp9_var_rd_part_nn_weights_16_layer1[8 * LABELS] = {
+  1.768409f, -1.394240f, 1.076846f,  -1.762808f,
+  1.517405f, 0.535195f,  -0.426827f, 1.002272f,
+};
+
+static const float vp9_var_rd_part_nn_bias_16_layer1[LABELS] = {
+  -1.65894794f,
+};
+
+static const NN_CONFIG vp9_var_rd_part_nnconfig_16 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      vp9_var_rd_part_nn_weights_16_layer0,
+      vp9_var_rd_part_nn_weights_16_layer1,
+  },
+  {
+      vp9_var_rd_part_nn_bias_16_layer0,
+      vp9_var_rd_part_nn_bias_16_layer1,
+  },
+};
+
+static const float vp9_var_rd_part_nn_weights_8_layer0[FEATURES * 8] = {
+  -0.804900f, -1.214983f, 0.840202f, 0.686566f,  0.155804f,  0.025542f,
+  -1.244635f, -0.368403f, 0.364150f, 1.081073f,  0.552387f,  0.452715f,
+  0.652968f,  -0.293058f, 0.048967f, 0.021240f,  -0.662981f, 0.424700f,
+  0.008293f,  -0.013088f, 0.747007f, -1.453907f, -1.498226f, 1.593252f,
+  -0.239557f, -0.143766f, 0.064311f, 1.320998f,  -0.477411f, 0.026374f,
+  0.730884f,  -0.675124f, 0.965521f, 0.863658f,  0.809186f,  0.812280f,
+  0.513131f,  0.185102f,  0.211354f, 0.793666f,  0.121714f,  -0.015383f,
+  -0.650980f, -0.046581f, 0.911141f, 0.806319f,  0.974773f,  0.815893f,
+};
+
+static const float vp9_var_rd_part_nn_bias_8_layer0[8] = {
+  0.176134f, 0.651308f, 2.007761f,  0.068812f,
+  1.061517f, 1.487161f, -2.308147f, 1.099828f,
+};
+
+static const float vp9_var_rd_part_nn_weights_8_layer1[8 * LABELS] = {
+  0.683032f, 1.326393f,  -1.661539f, 1.438920f,
+  1.118023f, -2.237380f, 1.518468f,  2.010416f,
+};
+
+static const float vp9_var_rd_part_nn_bias_8_layer1[LABELS] = {
+  -1.65423989f,
+};
+
+static const NN_CONFIG vp9_var_rd_part_nnconfig_8 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      vp9_var_rd_part_nn_weights_8_layer0,
+      vp9_var_rd_part_nn_weights_8_layer1,
+  },
+  {
+      vp9_var_rd_part_nn_bias_8_layer0,
+      vp9_var_rd_part_nn_bias_8_layer1,
+  },
+};
+#undef FEATURES
+#undef LABELS
 
 // Partition pruning model(linear).
 static const float vp9_partition_feature_mean[24] = {
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -219,6 +219,7 @@
   sf->less_rectangular_check = 1;
   sf->use_square_partition_only = !boosted;
   sf->prune_ref_frame_for_rect_partitions = 1;
+  sf->ml_var_partition_pruning = 1;
 
   sf->ml_prune_rect_partition_threhold[0] = -1;
   sf->ml_prune_rect_partition_threhold[1] = 350;
@@ -241,6 +242,7 @@
 
   if (speed >= 1) {
     sf->enable_tpl_model = 0;
+    sf->ml_var_partition_pruning = 0;
     sf->ml_prune_rect_partition_threhold[1] = 200;
     sf->ml_prune_rect_partition_threhold[2] = 200;
     sf->ml_prune_rect_partition_threhold[3] = 200;
@@ -939,6 +941,7 @@
   sf->ml_prune_rect_partition_threhold[1] = -1;
   sf->ml_prune_rect_partition_threhold[2] = -1;
   sf->ml_prune_rect_partition_threhold[3] = -1;
+  sf->ml_var_partition_pruning = 0;
 
   // Some speed-up features even for best quality as minimal impact on quality.
   sf->adaptive_rd_thresh = 1;
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -515,6 +515,10 @@
   // Machine-learning based partition search early termination
   int ml_partition_search_early_termination;
 
+  // Machine-learning based partition search pruning using prediction residue
+  // variance.
+  int ml_var_partition_pruning;
+
   // Allow skipping partition search for still image frame
   int allow_partition_search_skip;
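
Net effect of the speed-feature hunks: ml_var_partition_pruning defaults to 0,
is enabled in the good-quality speed 0 settings, and is cleared again for
speed >= 1, matching the commit message's note that the feature is enabled
only for speed 0 with low bit depth. The call site in vp9_encodeframe.c adds a
few more conditions, summarized by this hypothetical helper (illustrative
only, not part of the patch):

  /* Pruning is attempted only for inter frames, when both PARTITION_NONE and
   * PARTITION_SPLIT are still candidates, when the block lies entirely inside
   * the frame, and only for 8-bit input in high-bit-depth builds. */
  static int try_ml_var_partition_pruning(int sf_flag, int frame_is_intra,
                                          int partition_none_allowed,
                                          int do_split, int block_inside_frame,
                                          int is_high_bitdepth) {
    return sf_flag && !frame_is_intra && partition_none_allowed && do_split &&
           block_inside_frame && !is_high_bitdepth;
  }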