shithub: libvpx

Download patch

ref: b106abe570a5dcef13805593873d5c0e1350453b
parent: 259e835b1bb365a0fe179f9f394192261069ec88
parent: 91f01a2060619183a35eb6ed3eb15733bc6d1e18
author: Yunqing Wang <yunqingwang@google.com>
date: Tue Feb 7 14:55:40 EST 2017

Merge "Row based multi-threading of ARNR filtering stage"

--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1581,6 +1581,11 @@
        cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) &&
       cpi->oxcf.new_mt && !cpi->use_svc)
     cpi->new_mt = 1;
+
+  if (cpi->oxcf.mode == GOOD && cpi->oxcf.speed < 5 &&
+      (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) && cpi->oxcf.new_mt &&
+      !cpi->use_svc)
+    cpi->new_mt = 1;
 }
 
 #ifndef M_LOG2_E
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -13,6 +13,7 @@
 #include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_multi_thread.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
 static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
@@ -463,4 +464,81 @@
     TileDataEnc *this_tile = &cpi->tile_data[i];
     accumulate_fp_tile_stat(first_tile_col, this_tile);
   }
+}
+
+static int temporal_filter_worker_hook(EncWorkerData *const thread_data,
+                                       MultiThreadHandle *multi_thread_ctxt) {
+  VP9_COMP *const cpi = thread_data->cpi;
+  const VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  int tile_row, tile_col;
+  int mb_col_start, mb_col_end;
+  TileDataEnc *this_tile;
+  int end_of_frame;  // Loop exit flag, assigned from the tile status query.
+  int thread_id = thread_data->thread_id;
+  int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
+  JobNode *proc_job = NULL;
+  int mb_row;
+
+  end_of_frame = 0;
+  while (0 == end_of_frame) {  // One row-filter call per dequeued job.
+    // Fetch the next job queued for the tile this thread currently serves.
+    proc_job =
+        (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id);
+    if (NULL == proc_job) {
+      // Nothing queued: query tile status (it may also change cur_tile_id).
+      end_of_frame = vp9_get_tiles_proc_status(
+          multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id,
+          tile_cols);
+    } else {
+      tile_col = proc_job->tile_col_id;
+      tile_row = proc_job->tile_row_id;
+      this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+      mb_col_start = (this_tile->tile_info.mi_col_start) >> 1;
+      mb_col_end = (this_tile->tile_info.mi_col_end + 1) >> 1;  // rounds up
+      mb_row = proc_job->vert_unit_row_num;  // the job's row index
+
+      vp9_temporal_filter_iterate_row_c(cpi, thread_data->td, mb_row,
+                                        mb_col_start, mb_col_end);
+    }
+  }
+  return 0;  // Always 0; the hook reports no failure.
+}
+
+void vp9_temporal_filter_row_mt(VP9_COMP *cpi) {  // ARNR filter, row-based MT
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  int num_workers = cpi->num_workers ? cpi->num_workers : 1;  // never 0
+  int i;
+
+  if (multi_thread_ctxt->allocated_tile_cols < tile_cols ||
+      multi_thread_ctxt->allocated_tile_rows < tile_rows ||
+      multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) {
+    vp9_row_mt_mem_dealloc(cpi);  // current allocation is too small
+    vp9_init_tile_data(cpi);
+    vp9_row_mt_mem_alloc(cpi);
+  } else {
+    vp9_init_tile_data(cpi);
+  }
+
+  create_enc_workers(cpi, num_workers);
+  // NOTE(review): passes cpi->num_workers, not the local num_workers; confirm.
+  vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers);
+
+  vp9_prepare_job_queue(cpi, ARNR_JOB);
+
+  for (i = 0; i < num_workers; i++) {
+    EncWorkerData *thread_data;
+    thread_data = &cpi->tile_thr_data[i];
+
+    // Copy cpi->td.mb into each worker's ThreadData, except cpi->td itself.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+    }
+  }
+
+  launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook,
+                     multi_thread_ctxt, num_workers);
+}
--- a/vp9/encoder/vp9_ethread.h
+++ b/vp9/encoder/vp9_ethread.h
@@ -61,6 +61,8 @@
 // Deallocate row based multi-threading synchronization related mutex and data.
 void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync);
 
+void vp9_temporal_filter_row_mt(struct VP9_COMP *cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -2381,9 +2381,6 @@
     cost_list[4] = INT_MAX;
   }
 
-  // Keep track of number of searches (this frame in this thread).
-  ++(*x->m_search_count_ptr);
-
   switch (method) {
     case FAST_DIAMOND:
       var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
@@ -2409,6 +2406,9 @@
       var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
                                MAX_MVSEARCH_STEPS - 1 - step_param, 1,
                                cost_list, fn_ptr, ref_mv, tmp_mv);
+
+      // Keep track of number of searches (this frame in this thread).
+      ++(*x->m_search_count_ptr);
 
       // Should we allow a follow on exhaustive search?
       if (is_exhaustive_allowed(cpi, x)) {
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -16,6 +16,7 @@
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mcomp.h"
@@ -262,9 +263,9 @@
   return bestsme;
 }
 
-static void temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
-                                          int mb_row, int mb_col_start,
-                                          int mb_col_end) {
+void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
+                                       int mb_row, int mb_col_start,
+                                       int mb_col_end) {
   ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data;
   YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames;
   int frame_count = arnr_filter_data->frame_count;
@@ -571,8 +572,8 @@
   int mb_row;
 
   for (mb_row = mb_row_start; mb_row < mb_row_end; mb_row++) {
-    temporal_filter_iterate_row_c(cpi, &cpi->td, mb_row, mb_col_start,
-                                  mb_col_end);
+    vp9_temporal_filter_iterate_row_c(cpi, &cpi->td, mb_row, mb_col_start,
+                                      mb_col_end);
   }
 }
 
@@ -765,5 +766,8 @@
   set_error_per_bit(&cpi->td.mb, rdmult);
   vp9_initialize_me_consts(cpi, &cpi->td.mb, ARNR_FILT_QINDEX);
 
-  temporal_filter_iterate_c(cpi);
+  if (!cpi->new_mt)
+    temporal_filter_iterate_c(cpi);
+  else
+    vp9_temporal_filter_row_mt(cpi);
 }
--- a/vp9/encoder/vp9_temporal_filter.h
+++ b/vp9/encoder/vp9_temporal_filter.h
@@ -20,6 +20,10 @@
 void vp9_temporal_filter_init(void);
 void vp9_temporal_filter(VP9_COMP *cpi, int distance);
 
+void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
+                                       int mb_row, int mb_col_start,
+                                       int mb_col_end);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif