shithub: libvpx

Download patch

ref: 01900edc40e30891fd00b4066209a686894dd1c5
parent: fe2fd37bb2121cbbb95d826089d88357e70c87a8
parent: 01483677e567dcb58077540449de7999ec05ffcb
author: James Zern <jzern@google.com>
date: Fri Oct 24 07:43:51 EDT 2014

Merge changes I8a9c9019,Ic7b2faa3,I44d42a50,I3f3a3924,I10747b32,I31b49c9e

* changes:
  add vp9_loop_filter_data_reset
  move LFWorkerData allocation to VP9LfSync
  vp9_loop_filter_frame_mt: remove pbi dependency
  vp9_loop_filter_frame_mt: pass planes directly
  vp9_loop_filter_frame_mt: pass VP9LfSync directly
  vp9: store TileWorkerData allocations separately

--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -1625,6 +1625,17 @@
                        y_only);
 }
 
+void vp9_loop_filter_data_reset(
+    LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+    struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]) {
+  lf_data->frame_buffer = frame_buffer;
+  lf_data->cm = cm;
+  lf_data->start = 0;
+  lf_data->stop = 0;
+  lf_data->y_only = 0;
+  vpx_memcpy(lf_data->planes, planes, sizeof(lf_data->planes));
+}
+
 int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
   (void)unused;
   vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -124,10 +124,11 @@
   int start;
   int stop;
   int y_only;
-
-  struct VP9LfSyncData *lf_sync;
-  int num_lf_workers;
 } LFWorkerData;
+
+void vp9_loop_filter_data_reset(
+    LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+    struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]);
 
 // Operates on the rows described by 'lf_data'.
 int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -902,11 +902,8 @@
     LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
     // Be sure to sync as we might be resuming after a failed frame decode.
     winterface->sync(&pbi->lf_worker);
-    lf_data->frame_buffer = get_frame_new_buffer(cm);
-    lf_data->cm = cm;
-    vp9_copy(lf_data->planes, pbi->mb.plane);
-    lf_data->stop = 0;
-    lf_data->y_only = 0;
+    vp9_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm,
+                               pbi->mb.plane);
     vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
   }
 
@@ -1065,14 +1062,19 @@
     // use num_threads - 1 workers.
     CHECK_MEM_ERROR(cm, pbi->tile_workers,
                     vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
+    // Ensure tile data offsets will be properly aligned. This may fail on
+    // platforms without DECLARE_ALIGNED().
+    assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
+    CHECK_MEM_ERROR(cm, pbi->tile_worker_data,
+                    vpx_memalign(32, num_threads *
+                                 sizeof(*pbi->tile_worker_data)));
+    CHECK_MEM_ERROR(cm, pbi->tile_worker_info,
+                    vpx_malloc(num_threads * sizeof(*pbi->tile_worker_info)));
     for (i = 0; i < num_threads; ++i) {
       VP9Worker *const worker = &pbi->tile_workers[i];
       ++pbi->num_tile_workers;
 
       winterface->init(worker);
-      CHECK_MEM_ERROR(cm, worker->data1,
-                      vpx_memalign(32, sizeof(TileWorkerData)));
-      CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
       if (i < num_threads - 1 && !winterface->reset(worker)) {
         vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                            "Tile decoder thread creation failed");
@@ -1082,8 +1084,11 @@
 
   // Reset tile decoding hook
   for (n = 0; n < num_workers; ++n) {
-    winterface->sync(&pbi->tile_workers[n]);
-    pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook;
+    VP9Worker *const worker = &pbi->tile_workers[n];
+    winterface->sync(worker);
+    worker->hook = (VP9WorkerHook)tile_worker_hook;
+    worker->data1 = &pbi->tile_worker_data[n];
+    worker->data2 = &pbi->tile_worker_info[n];
   }
 
   // Note: this memset assumes above_context[0], [1] and [2]
@@ -1555,7 +1560,9 @@
     if (!xd->corrupted) {
       // If multiple threads are used to decode tiles, then we use those threads
       // to do parallel loopfiltering.
-      vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
+      vp9_loop_filter_frame_mt(&pbi->lf_row_sync, new_fb, pbi->mb.plane, cm,
+                               pbi->tile_workers, pbi->num_tile_workers,
+                               cm->lf.filter_level, 0);
     }
   } else {
     *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -106,9 +106,9 @@
   for (i = 0; i < pbi->num_tile_workers; ++i) {
     VP9Worker *const worker = &pbi->tile_workers[i];
     vp9_get_worker_interface()->end(worker);
-    vpx_free(worker->data1);
-    vpx_free(worker->data2);
   }
+  vpx_free(pbi->tile_worker_data);
+  vpx_free(pbi->tile_worker_info);
   vpx_free(pbi->tile_workers);
 
   if (pbi->num_tile_workers > 0) {
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -46,6 +46,8 @@
 
   VP9Worker lf_worker;
   VP9Worker *tile_workers;
+  TileWorkerData *tile_worker_data;
+  TileInfo *tile_worker_info;
   int num_tile_workers;
 
   TileData *tile_data;
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -92,12 +92,12 @@
                                 VP9_COMMON *const cm,
                                 struct macroblockd_plane planes[MAX_MB_PLANE],
                                 int start, int stop, int y_only,
-                                VP9LfSync *const lf_sync, int num_lf_workers) {
+                                VP9LfSync *const lf_sync) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   int r, c;  // SB row and col
   const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
 
-  for (r = start; r < stop; r += num_lf_workers) {
+  for (r = start; r < stop; r += lf_sync->num_workers) {
     const int mi_row = r << MI_BLOCK_SIZE_LOG2;
     MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;
 
@@ -121,35 +121,35 @@
 }
 
 // Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(TileWorkerData *const tile_data,
-                                  void *unused) {
-  LFWorkerData *const lf_data = &tile_data->lfdata;
-  (void)unused;
+static int loop_filter_row_worker(VP9LfSync *const lf_sync,
+                                  LFWorkerData *const lf_data) {
   loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
-                      lf_data->start, lf_data->stop, lf_data->y_only,
-                      lf_data->lf_sync, lf_data->num_lf_workers);
+                      lf_data->start, lf_data->stop, lf_data->y_only, lf_sync);
   return 1;
 }
 
 // VP9 decoder: Implement multi-threaded loopfilter that uses the tile
 // threads.
-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
-                              VP9Decoder *pbi, VP9_COMMON *cm,
+void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
+                              YV12_BUFFER_CONFIG *frame,
+                              struct macroblockd_plane planes[MAX_MB_PLANE],
+                              VP9_COMMON *cm,
+                              VP9Worker *workers, int nworkers,
                               int frame_filter_level,
                               int y_only) {
-  VP9LfSync *const lf_sync = &pbi->lf_row_sync;
   const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
   // Number of superblock rows and cols
   const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
   const int tile_cols = 1 << cm->log2_tile_cols;
-  const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
+  const int num_workers = MIN(nworkers, tile_cols);
   int i;
 
   if (!frame_filter_level) return;
 
-  if (!lf_sync->sync_range || cm->last_height != cm->height) {
+  if (!lf_sync->sync_range || cm->last_height != cm->height ||
+      num_workers > lf_sync->num_workers) {
     vp9_loop_filter_dealloc(lf_sync);
-    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width);
+    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
   }
 
   vp9_loop_filter_frame_init(cm, frame_filter_level);
@@ -158,33 +158,27 @@
   vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
 
   // Set up loopfilter thread data.
-  // The decoder is using num_workers instead of pbi->num_tile_workers
-  // because it has been observed that using more threads on the
-  // loopfilter, than there are tile columns in the frame will hurt
-  // performance on Android. This is because the system will only
-  // schedule the tile decode workers on cores equal to the number
-  // of tile columns. Then if the decoder tries to use more threads for the
-  // loopfilter, it will hurt performance because of contention. If the
-  // multithreading code changes in the future then the number of workers
-  // used by the loopfilter should be revisited.
+  // The decoder caps num_workers at the number of tile columns because it has
+  // been observed that using more loopfilter threads than there are tile
+  // columns will hurt performance on Android. This is because the system will
+  // only schedule the tile decode workers on cores equal to the number of tile
+  // columns. Then if the decoder tries to use more threads for the loopfilter,
+  // it will hurt performance because of contention. If the multithreading code
+  // changes in the future then the loopfilter worker count should be revisited.
   for (i = 0; i < num_workers; ++i) {
-    VP9Worker *const worker = &pbi->tile_workers[i];
-    TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
-    LFWorkerData *const lf_data = &tile_data->lfdata;
+    VP9Worker *const worker = &workers[i];
+    LFWorkerData *const lf_data = &lf_sync->lfdata[i];
 
     worker->hook = (VP9WorkerHook)loop_filter_row_worker;
+    worker->data1 = lf_sync;
+    worker->data2 = lf_data;
 
     // Loopfilter data
-    lf_data->frame_buffer = frame;
-    lf_data->cm = cm;
-    vp9_copy(lf_data->planes, pbi->mb.plane);
+    vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
     lf_data->start = i;
     lf_data->stop = sb_rows;
-    lf_data->y_only = y_only;   // always do all planes in decoder
+    lf_data->y_only = y_only;
 
-    lf_data->lf_sync = lf_sync;
-    lf_data->num_lf_workers = num_workers;
-
     // Start loopfiltering
     if (i == num_workers - 1) {
       winterface->execute(worker);
@@ -195,7 +189,7 @@
 
   // Wait till all rows are finished
   for (i = 0; i < num_workers; ++i) {
-    winterface->sync(&pbi->tile_workers[i]);
+    winterface->sync(&workers[i]);
   }
 }
 
@@ -215,7 +209,7 @@
 
 // Allocate memory for lf row synchronization
 void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
-                           int width) {
+                           int width, int num_workers) {
   lf_sync->rows = rows;
 #if CONFIG_MULTITHREAD
   {
@@ -239,6 +233,10 @@
   }
 #endif  // CONFIG_MULTITHREAD
 
+  CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+                  vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
+  lf_sync->num_workers = num_workers;
+
   CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
                   vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
 
@@ -265,6 +263,7 @@
       vpx_free(lf_sync->cond_);
     }
 #endif  // CONFIG_MULTITHREAD
+    vpx_free(lf_sync->lfdata);
     vpx_free(lf_sync->cur_sb_col);
     // clear the structure as the source of this call may be a resize in which
     // case this call will be followed by an _alloc() which may fail.
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h
@@ -22,9 +22,6 @@
   struct VP9Common *cm;
   vp9_reader bit_reader;
   DECLARE_ALIGNED(16, struct macroblockd, xd);
-
-  // Row-based parallel loopfilter data
-  LFWorkerData lfdata;
 } TileWorkerData;
 
 // Loopfilter row synchronization
@@ -39,19 +36,25 @@
   // determined by testing. Currently, it is chosen to be a power-of-2 number.
   int sync_range;
   int rows;
+
+  // Row-based parallel loopfilter data
+  LFWorkerData *lfdata;
+  int num_workers;
 } VP9LfSync;
 
 // Allocate memory for loopfilter row synchronization.
 void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
-                           int width);
+                           int width, int num_workers);
 
 // Deallocate loopfilter synchronization related mutex and data.
 void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
 
 // Multi-threaded loopfilter that uses the tile threads.
-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
-                              struct VP9Decoder *pbi,
+void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
+                              YV12_BUFFER_CONFIG *frame,
+                              struct macroblockd_plane planes[MAX_MB_PLANE],
                               struct VP9Common *cm,
+                              VP9Worker *workers, int num_workers,
                               int frame_filter_level,
                               int y_only);