shithub: libvpx

--- a/vp9/common/vp9_onyxc_int.h

+++ b/vp9/common/vp9_onyxc_int.h

@@ -256,6 +256,8 @@

   PARTITION_CONTEXT *above_seg_context;

   ENTROPY_CONTEXT *above_context;

   int above_context_alloc_cols;

+  int lf_row;

 } VP9_COMMON;

 static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) {

--- a/vp9/common/vp9_thread_common.c

+++ b/vp9/common/vp9_thread_common.c

@@ -229,6 +229,26 @@

                       workers, num_workers, lf_sync);

+void vp9_lpf_mt_init(VP9LfSync *lf_sync, VP9_COMMON *cm, int frame_filter_level,

+                     int num_workers) {

+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;

+  if (!frame_filter_level) return;

+  if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||

+      num_workers > lf_sync->num_workers) {

+    vp9_loop_filter_dealloc(lf_sync);

+    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);

+  }

+  // Initialize cur_sb_col to -1 for all SB rows.

+  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

+  memset(lf_sync->num_tiles_done, 0,

+         sizeof(*lf_sync->num_tiles_done) * sb_rows);

+  cm->lf_row = 0;

+}

 // Set up nsync by width.

 static INLINE int get_sync_range(int width) {

   // nsync numbers are picked by testing. For example, for 4k

@@ -266,6 +286,25 @@

         pthread_cond_init(&lf_sync->cond_[i], NULL);

+    pthread_mutex_init(&lf_sync->lf_mutex, NULL);

+    CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex,

+                    vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows));

+    if (lf_sync->recon_done_mutex) {

+      int i;

+      for (i = 0; i < rows; ++i) {

+        pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL);

+      }

+    }

+    CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond,

+                    vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows));

+    if (lf_sync->recon_done_cond) {

+      int i;

+      for (i = 0; i < rows; ++i) {

+        pthread_cond_init(&lf_sync->recon_done_cond[i], NULL);

+      }

+    }

 #endif  // CONFIG_MULTITHREAD

@@ -276,6 +315,11 @@

   CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,

                   vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));

+  CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done,

+                  vpx_malloc(sizeof(*lf_sync->num_tiles_done) *

+                                 mi_cols_aligned_to_sb(cm->mi_rows) >>

+                             MI_BLOCK_SIZE_LOG2));

   // Set up nsync.

   lf_sync->sync_range = get_sync_range(width);

@@ -298,13 +342,116 @@

       vpx_free(lf_sync->cond_);

+    if (lf_sync->recon_done_mutex != NULL) {

+      int i;

+      for (i = 0; i < lf_sync->rows; ++i) {

+        pthread_mutex_destroy(&lf_sync->recon_done_mutex[i]);

+      }

+      vpx_free(lf_sync->recon_done_mutex);

+    }

+    pthread_mutex_destroy(&lf_sync->lf_mutex);

+    if (lf_sync->recon_done_cond != NULL) {

+      int i;

+      for (i = 0; i < lf_sync->rows; ++i) {

+        pthread_cond_destroy(&lf_sync->recon_done_cond[i]);

+      }

+      vpx_free(lf_sync->recon_done_cond);

+    }

 #endif  // CONFIG_MULTITHREAD

     vpx_free(lf_sync->lfdata);

     vpx_free(lf_sync->cur_sb_col);

+    vpx_free(lf_sync->num_tiles_done);

     // clear the structure as the source of this call may be a resize in which

     // case this call will be followed by an _alloc() which may fail.

     vp9_zero(*lf_sync);

+}

+static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) {

+  int return_val = -1;

+  int cur_row;

+  const int max_rows = cm->mi_rows;

+#if CONFIG_MULTITHREAD

+  const int tile_cols = 1 << cm->log2_tile_cols;

+  pthread_mutex_lock(&lf_sync->lf_mutex);

+  if (cm->lf_row < max_rows) {

+    cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2;

+    return_val = cm->lf_row;

+    cm->lf_row += MI_BLOCK_SIZE;

+    if (cm->lf_row < max_rows) {

+      /* If this is not the last row, make sure the next row is also decoded.

+       * This is because the intra predict has to happen before loop filter */

+      cur_row += 1;

+    }

+  }

+  pthread_mutex_unlock(&lf_sync->lf_mutex);

+  if (return_val == -1) return return_val;

+  pthread_mutex_lock(&lf_sync->recon_done_mutex[cur_row]);

+  if (lf_sync->num_tiles_done[cur_row] < tile_cols) {

+    pthread_cond_wait(&lf_sync->recon_done_cond[cur_row],

+                      &lf_sync->recon_done_mutex[cur_row]);

+  }

+  pthread_mutex_unlock(&lf_sync->recon_done_mutex[cur_row]);

+#else

+  (void)lf_sync;

+  if (cm->lf_row < max_rows) {

+    cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2;

+    return_val = cm->lf_row;

+    cm->lf_row += MI_BLOCK_SIZE;

+    if (cm->lf_row < max_rows) {

+      /* If this is not the last row, make sure the next row is also decoded.

+       * This is because the intra predict has to happen before loop filter */

+      cur_row += 1;

+    }

+  }

+#endif  // CONFIG_MULTITHREAD

+  return return_val;

+}

+void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync,

+                         MACROBLOCKD *xd) {

+  int mi_row;

+  VP9_COMMON *cm = lf_data->cm;

+  while (!xd->corrupted && (mi_row = get_next_row(cm, lf_sync)) != -1 &&

+         mi_row < cm->mi_rows) {

+    lf_data->start = mi_row;

+    lf_data->stop = mi_row + MI_BLOCK_SIZE;

+    thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,

+                            lf_data->start, lf_data->stop, lf_data->y_only,

+                            lf_sync);

+  }

+}

+void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row) {

+#if CONFIG_MULTITHREAD

+  pthread_mutex_lock(&lf_sync->recon_done_mutex[row]);

+  lf_sync->num_tiles_done[row] += 1;

+  if (num_tiles == lf_sync->num_tiles_done[row]) {

+    if (is_last_row) {

+      /* The last 2 rows wait on the last row to be done.

+       * So, we have to broadcast the signal in this case.

+       */

+      pthread_cond_broadcast(&lf_sync->recon_done_cond[row]);

+    } else {

+      pthread_cond_signal(&lf_sync->recon_done_cond[row]);

+    }

+  }

+  pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]);

+#else

+  (void)lf_sync;

+  (void)num_tiles;

+  (void)row;

+  (void)is_last_row;

+#endif  // CONFIG_MULTITHREAD

 // Accumulate frame counts.

--- a/vp9/common/vp9_thread_common.h

+++ b/vp9/common/vp9_thread_common.h

@@ -37,6 +37,13 @@

   // Row-based parallel loopfilter data

   LFWorkerData *lfdata;

   int num_workers;

+#if CONFIG_MULTITHREAD

+  pthread_mutex_t lf_mutex;

+  pthread_mutex_t *recon_done_mutex;

+  pthread_cond_t *recon_done_cond;

+#endif

+  int *num_tiles_done;

 } VP9LfSync;

 // Allocate memory for loopfilter row synchronization.

@@ -52,6 +59,17 @@

                               int frame_filter_level, int y_only,

                               int partial_frame, VPxWorker *workers,

                               int num_workers, VP9LfSync *lf_sync);

+// Multi-threaded loopfilter initialisations

+void vp9_lpf_mt_init(VP9LfSync *lf_sync, struct VP9Common *cm,

+                     int frame_filter_level, int num_workers);

+void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync,

+                         MACROBLOCKD *xd);

+void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row);

+void vp9_set_last_decoded_row(struct VP9Common *cm, int tile_col, int mi_row);

 void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum,

                                  const struct FRAME_COUNTS *counts, int is_dec);

--- a/vp9/decoder/vp9_decodeframe.c

+++ b/vp9/decoder/vp9_decodeframe.c

@@ -1487,6 +1487,11 @@

   TileInfo *volatile tile = &tile_data->xd.tile;

   const int final_col = (1 << pbi->common.log2_tile_cols) - 1;

   const uint8_t *volatile bit_reader_end = NULL;

+  VP9_COMMON *cm = &pbi->common;

+  LFWorkerData *lf_data = tile_data->lf_data;

+  VP9LfSync *lf_sync = tile_data->lf_sync;

   volatile int n = tile_data->buf_start;

   tile_data->error_info.setjmp = 1;

@@ -1519,6 +1524,13 @@

            mi_col += MI_BLOCK_SIZE) {

         decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);

+      if (cm->lf.filter_level && !cm->skip_loop_filter) {

+        const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows);

+        const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2);

+        const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2);

+        vp9_set_row(lf_sync, 1 << cm->log2_tile_cols,

+                    mi_row >> MI_BLOCK_SIZE_LOG2, is_last_row);

+      }

     if (buf->col == final_col) {

@@ -1526,6 +1538,11 @@

   } while (!tile_data->xd.corrupted && ++n <= tile_data->buf_end);

+  if (!tile_data->xd.corrupted && cm->lf.filter_level &&

+      !cm->skip_loop_filter) {

+    vp9_loopfilter_rows(lf_data, lf_sync, &tile_data->xd);

+  }

   tile_data->data_end = bit_reader_end;

   return !tile_data->xd.corrupted;

@@ -1542,6 +1559,8 @@

   VP9_COMMON *const cm = &pbi->common;

   const VPxWorkerInterface *const winterface = vpx_get_worker_interface();

   const uint8_t *bit_reader_end = NULL;

+  VP9LfSync *lf_row_sync = &pbi->lf_row_sync;

+  YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);

   const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);

   const int tile_cols = 1 << cm->log2_tile_cols;

   const int tile_rows = 1 << cm->log2_tile_rows;

@@ -1568,6 +1587,12 @@

+  // Initialize LPF

+  if (cm->lf.filter_level && !cm->skip_loop_filter) {

+    vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level,

+                    pbi->num_tile_workers);

+  }

   // Reset tile decoding hook

   for (n = 0; n < num_workers; ++n) {

     VPxWorker *const worker = &pbi->tile_workers[n];

@@ -1574,6 +1599,14 @@

     TileWorkerData *const tile_data =

         &pbi->tile_worker_data[n + pbi->total_tiles];

     winterface->sync(worker);

+    if (cm->lf.filter_level && !cm->skip_loop_filter) {

+      tile_data->lf_sync = lf_row_sync;

+      tile_data->lf_data = &tile_data->lf_sync->lfdata[n];

+      vp9_loop_filter_data_reset(tile_data->lf_data, new_fb, cm, pbi->mb.plane);

+      tile_data->lf_data->y_only = 0;

+    }

     tile_data->xd = pbi->mb;

     tile_data->xd.counts =

         cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts;

@@ -2095,15 +2128,7 @@

   if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {

     // Multi-threaded tile decoder

     *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);

-    if (!xd->corrupted) {

-      if (!cm->skip_loop_filter) {

-        // If multiple threads are used to decode tiles, then we use those

-        // threads to do parallel loopfiltering.

-        vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level,

-                                 0, 0, pbi->tile_workers, pbi->num_tile_workers,

-                                 &pbi->lf_row_sync);

-      }

-    } else {

+    if (xd->corrupted) {

       vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,

                          "Decode failed. Frame data is corrupted.");

--- a/vp9/decoder/vp9_decoder.h

+++ b/vp9/decoder/vp9_decoder.h

@@ -37,6 +37,8 @@

   int buf_start, buf_end;  // pbi->tile_buffers to decode, inclusive

   vpx_reader bit_reader;

   FRAME_COUNTS counts;

+  LFWorkerData *lf_data;

+  VP9LfSync *lf_sync;

   DECLARE_ALIGNED(16, MACROBLOCKD, xd);

   /* dqcoeff are shared by all the planes. So planes must be decoded serially */

   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);

--- a/vpx_util/vpx_thread.h

+++ b/vpx_util/vpx_thread.h

@@ -159,6 +159,23 @@

   return 0;

+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {

+  int ok = 1;

+#ifdef USE_WINDOWS_CONDITION_VARIABLE

+  WakeAllConditionVariable(condition);

+#else

+  while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {

+    // a thread is waiting in pthread_cond_wait: allow it to be notified

+    ok &= SetEvent(condition->signal_event_);

+    // wait until the event is consumed so the signaler cannot consume

+    // the event via its own pthread_cond_wait.

+    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=

+           WAIT_OBJECT_0);

+  }

+#endif

+  return !ok;

+}

 static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {

   int ok = 1;

 #ifdef USE_WINDOWS_CONDITION_VARIABLE

--

⑨