shithub: libvpx

--- a/vp9/common/vp9_loopfilter_thread.c

+++ /dev/null

@@ -1,301 +1,0 @@

-/*

- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "./vpx_config.h"

-#include "vpx_mem/vpx_mem.h"

-#include "vp9/common/vp9_loopfilter_thread.h"

-#include "vp9/common/vp9_reconinter.h"

-#if CONFIG_MULTITHREAD

-static INLINE void mutex_lock(pthread_mutex_t *const mutex) {

-  const int kMaxTryLocks = 4000;

-  int locked = 0;

-  int i;

-  for (i = 0; i < kMaxTryLocks; ++i) {

-    if (!pthread_mutex_trylock(mutex)) {

-      locked = 1;

-      break;

-    }

-  }

-  if (!locked)

-    pthread_mutex_lock(mutex);

-}

-#endif  // CONFIG_MULTITHREAD

-static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {

-#if CONFIG_MULTITHREAD

-  const int nsync = lf_sync->sync_range;

-  if (r && !(c & (nsync - 1))) {

-    pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];

-    mutex_lock(mutex);

-    while (c > lf_sync->cur_sb_col[r - 1] - nsync) {

-      pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);

-    }

-    pthread_mutex_unlock(mutex);

-  }

-#else

-  (void)lf_sync;

-  (void)r;

-  (void)c;

-#endif  // CONFIG_MULTITHREAD

-}

-static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,

-                              const int sb_cols) {

-#if CONFIG_MULTITHREAD

-  const int nsync = lf_sync->sync_range;

-  int cur;

-  // Only signal when there are enough filtered SB for next row to run.

-  int sig = 1;

-  if (c < sb_cols - 1) {

-    cur = c;

-    if (c % nsync)

-      sig = 0;

-  } else {

-    cur = sb_cols + nsync;

-  }

-  if (sig) {

-    mutex_lock(&lf_sync->mutex_[r]);

-    lf_sync->cur_sb_col[r] = cur;

-    pthread_cond_signal(&lf_sync->cond_[r]);

-    pthread_mutex_unlock(&lf_sync->mutex_[r]);

-  }

-#else

-  (void)lf_sync;

-  (void)r;

-  (void)c;

-  (void)sb_cols;

-#endif  // CONFIG_MULTITHREAD

-}

-// Implement row loopfiltering for each thread.

-static INLINE

-void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,

-                             VP9_COMMON *const cm,

-                             struct macroblockd_plane planes[MAX_MB_PLANE],

-                             int start, int stop, int y_only,

-                             VP9LfSync *const lf_sync) {

-  const int num_planes = y_only ? 1 : MAX_MB_PLANE;

-  const int use_420 = y_only || (planes[1].subsampling_y == 1 &&

-                                 planes[1].subsampling_x == 1);

-  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;

-  int mi_row, mi_col;

-  for (mi_row = start; mi_row < stop;

-       mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {

-    MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;

-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {

-      const int r = mi_row >> MI_BLOCK_SIZE_LOG2;

-      const int c = mi_col >> MI_BLOCK_SIZE_LOG2;

-      LOOP_FILTER_MASK lfm;

-      int plane;

-      sync_read(lf_sync, r, c);

-      vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);

-      // TODO(JBB): Make setup_mask work for non 420.

-      if (use_420)

-        vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,

-                       &lfm);

-      for (plane = 0; plane < num_planes; ++plane) {

-        if (use_420)

-          vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);

-        else

-          vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,

-                                        mi_row, mi_col);

-      }

-      sync_write(lf_sync, r, c, sb_cols);

-    }

-  }

-}

-// Row-based multi-threaded loopfilter hook

-static int loop_filter_row_worker(VP9LfSync *const lf_sync,

-                                  LFWorkerData *const lf_data) {

-  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,

-                          lf_data->start, lf_data->stop, lf_data->y_only,

-                          lf_sync);

-  return 1;

-}

-static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,

-                                VP9_COMMON *cm,

-                                struct macroblockd_plane planes[MAX_MB_PLANE],

-                                int start, int stop, int y_only,

-                                VP9Worker *workers, int nworkers,

-                                VP9LfSync *lf_sync) {

-  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();

-  // Number of superblock rows and cols

-  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;

-  // Decoder may allocate more threads than number of tiles based on user's

-  // input.

-  const int tile_cols = 1 << cm->log2_tile_cols;

-  const int num_workers = MIN(nworkers, tile_cols);

-  int i;

-  if (!lf_sync->sync_range || cm->last_height != cm->height ||

-      num_workers > lf_sync->num_workers) {

-    vp9_loop_filter_dealloc(lf_sync);

-    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);

-  }

-  // Initialize cur_sb_col to -1 for all SB rows.

-  vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

-  // Set up loopfilter thread data.

-  // The decoder is capping num_workers because it has been observed that using

-  // more threads on the loopfilter than there are cores will hurt performance

-  // on Android. This is because the system will only schedule the tile decode

-  // workers on cores equal to the number of tile columns. Then if the decoder

-  // tries to use more threads for the loopfilter, it will hurt performance

-  // because of contention. If the multithreading code changes in the future

-  // then the number of workers used by the loopfilter should be revisited.

-  for (i = 0; i < num_workers; ++i) {

-    VP9Worker *const worker = &workers[i];

-    LFWorkerData *const lf_data = &lf_sync->lfdata[i];

-    worker->hook = (VP9WorkerHook)loop_filter_row_worker;

-    worker->data1 = lf_sync;

-    worker->data2 = lf_data;

-    // Loopfilter data

-    vp9_loop_filter_data_reset(lf_data, frame, cm, planes);

-    lf_data->start = start + i * MI_BLOCK_SIZE;

-    lf_data->stop = stop;

-    lf_data->y_only = y_only;

-    // Start loopfiltering

-    if (i == num_workers - 1) {

-      winterface->execute(worker);

-    } else {

-      winterface->launch(worker);

-    }

-  }

-  // Wait till all rows are finished

-  for (i = 0; i < num_workers; ++i) {

-    winterface->sync(&workers[i]);

-  }

-}

-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,

-                              VP9_COMMON *cm,

-                              struct macroblockd_plane planes[MAX_MB_PLANE],

-                              int frame_filter_level,

-                              int y_only, int partial_frame,

-                              VP9Worker *workers, int num_workers,

-                              VP9LfSync *lf_sync) {

-  int start_mi_row, end_mi_row, mi_rows_to_filter;

-  if (!frame_filter_level) return;

-  start_mi_row = 0;

-  mi_rows_to_filter = cm->mi_rows;

-  if (partial_frame && cm->mi_rows > 8) {

-    start_mi_row = cm->mi_rows >> 1;

-    start_mi_row &= 0xfffffff8;

-    mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);

-  }

-  end_mi_row = start_mi_row + mi_rows_to_filter;

-  vp9_loop_filter_frame_init(cm, frame_filter_level);

-  loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row,

-                      y_only, workers, num_workers, lf_sync);

-}

-// Set up nsync by width.

-static INLINE int get_sync_range(int width) {

-  // nsync numbers are picked by testing. For example, for 4k

-  // video, using 4 gives best performance.

-  if (width < 640)

-    return 1;

-  else if (width <= 1280)

-    return 2;

-  else if (width <= 4096)

-    return 4;

-  else

-    return 8;

-}

-// Allocate memory for lf row synchronization

-void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,

-                           int width, int num_workers) {

-  lf_sync->rows = rows;

-#if CONFIG_MULTITHREAD

-  {

-    int i;

-    CHECK_MEM_ERROR(cm, lf_sync->mutex_,

-                    vpx_malloc(sizeof(*lf_sync->mutex_) * rows));

-    if (lf_sync->mutex_) {

-      for (i = 0; i < rows; ++i) {

-        pthread_mutex_init(&lf_sync->mutex_[i], NULL);

-      }

-    }

-    CHECK_MEM_ERROR(cm, lf_sync->cond_,

-                    vpx_malloc(sizeof(*lf_sync->cond_) * rows));

-    if (lf_sync->cond_) {

-      for (i = 0; i < rows; ++i) {

-        pthread_cond_init(&lf_sync->cond_[i], NULL);

-      }

-    }

-  }

-#endif  // CONFIG_MULTITHREAD

-  CHECK_MEM_ERROR(cm, lf_sync->lfdata,

-                  vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));

-  lf_sync->num_workers = num_workers;

-  CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,

-                  vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));

-  // Set up nsync.

-  lf_sync->sync_range = get_sync_range(width);

-}

-// Deallocate lf synchronization related mutex and data

-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {

-  if (lf_sync != NULL) {

-#if CONFIG_MULTITHREAD

-    int i;

-    if (lf_sync->mutex_ != NULL) {

-      for (i = 0; i < lf_sync->rows; ++i) {

-        pthread_mutex_destroy(&lf_sync->mutex_[i]);

-      }

-      vpx_free(lf_sync->mutex_);

-    }

-    if (lf_sync->cond_ != NULL) {

-      for (i = 0; i < lf_sync->rows; ++i) {

-        pthread_cond_destroy(&lf_sync->cond_[i]);

-      }

-      vpx_free(lf_sync->cond_);

-    }

-#endif  // CONFIG_MULTITHREAD

-    vpx_free(lf_sync->lfdata);

-    vpx_free(lf_sync->cur_sb_col);

-    // clear the structure as the source of this call may be a resize in which

-    // case this call will be followed by an _alloc() which may fail.

-    vp9_zero(*lf_sync);

-  }

-}

--- a/vp9/common/vp9_loopfilter_thread.h

+++ /dev/null

@@ -1,53 +1,0 @@

-/*

- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#ifndef VP9_COMMON_VP9_LOOPFILTER_THREAD_H_

-#define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_

-#include "./vpx_config.h"

-#include "vp9/common/vp9_loopfilter.h"

-#include "vp9/common/vp9_thread.h"

-struct VP9Common;

-// Loopfilter row synchronization

-typedef struct VP9LfSyncData {

-#if CONFIG_MULTITHREAD

-  pthread_mutex_t *mutex_;

-  pthread_cond_t *cond_;

-#endif

-  // Allocate memory to store the loop-filtered superblock index in each row.

-  int *cur_sb_col;

-  // The optimal sync_range for different resolution and platform should be

-  // determined by testing. Currently, it is chosen to be a power-of-2 number.

-  int sync_range;

-  int rows;

-  // Row-based parallel loopfilter data

-  LFWorkerData *lfdata;

-  int num_workers;

-} VP9LfSync;

-// Allocate memory for loopfilter row synchronization.

-void vp9_loop_filter_alloc(VP9LfSync *lf_sync, struct VP9Common *cm, int rows,

-                           int width, int num_workers);

-// Deallocate loopfilter synchronization related mutex and data.

-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);

-// Multi-threaded loopfilter that uses the tile threads.

-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,

-                              struct VP9Common *cm,

-                              struct macroblockd_plane planes[MAX_MB_PLANE],

-                              int frame_filter_level,

-                              int y_only, int partial_frame,

-                              VP9Worker *workers, int num_workers,

-                              VP9LfSync *lf_sync);

-#endif  // VP9_COMMON_VP9_LOOPFILTER_THREAD_H_

--- /dev/null

+++ b/vp9/common/vp9_thread_common.c

@@ -1,0 +1,301 @@

+/*

+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_config.h"

+#include "vpx_mem/vpx_mem.h"

+#include "vp9/common/vp9_thread_common.h"

+#include "vp9/common/vp9_reconinter.h"

+#if CONFIG_MULTITHREAD

+static INLINE void mutex_lock(pthread_mutex_t *const mutex) {

+  const int kMaxTryLocks = 4000;

+  int locked = 0;

+  int i;

+  for (i = 0; i < kMaxTryLocks; ++i) {

+    if (!pthread_mutex_trylock(mutex)) {

+      locked = 1;

+      break;

+    }

+  }

+  if (!locked)

+    pthread_mutex_lock(mutex);

+}

+#endif  // CONFIG_MULTITHREAD

+static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {

+#if CONFIG_MULTITHREAD

+  const int nsync = lf_sync->sync_range;

+  if (r && !(c & (nsync - 1))) {

+    pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];

+    mutex_lock(mutex);

+    while (c > lf_sync->cur_sb_col[r - 1] - nsync) {

+      pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);

+    }

+    pthread_mutex_unlock(mutex);

+  }

+#else

+  (void)lf_sync;

+  (void)r;

+  (void)c;

+#endif  // CONFIG_MULTITHREAD

+}

+static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,

+                              const int sb_cols) {

+#if CONFIG_MULTITHREAD

+  const int nsync = lf_sync->sync_range;

+  int cur;

+  // Only signal when there are enough filtered SB for next row to run.

+  int sig = 1;

+  if (c < sb_cols - 1) {

+    cur = c;

+    if (c % nsync)

+      sig = 0;

+  } else {

+    cur = sb_cols + nsync;

+  }

+  if (sig) {

+    mutex_lock(&lf_sync->mutex_[r]);

+    lf_sync->cur_sb_col[r] = cur;

+    pthread_cond_signal(&lf_sync->cond_[r]);

+    pthread_mutex_unlock(&lf_sync->mutex_[r]);

+  }

+#else

+  (void)lf_sync;

+  (void)r;

+  (void)c;

+  (void)sb_cols;

+#endif  // CONFIG_MULTITHREAD

+}

+// Implement row loopfiltering for each thread.

+static INLINE

+void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,

+                             VP9_COMMON *const cm,

+                             struct macroblockd_plane planes[MAX_MB_PLANE],

+                             int start, int stop, int y_only,

+                             VP9LfSync *const lf_sync) {

+  const int num_planes = y_only ? 1 : MAX_MB_PLANE;

+  const int use_420 = y_only || (planes[1].subsampling_y == 1 &&

+                                 planes[1].subsampling_x == 1);

+  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;

+  int mi_row, mi_col;

+  for (mi_row = start; mi_row < stop;

+       mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {

+    MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;

+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {

+      const int r = mi_row >> MI_BLOCK_SIZE_LOG2;

+      const int c = mi_col >> MI_BLOCK_SIZE_LOG2;

+      LOOP_FILTER_MASK lfm;

+      int plane;

+      sync_read(lf_sync, r, c);

+      vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);

+      // TODO(JBB): Make setup_mask work for non 420.

+      if (use_420)

+        vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,

+                       &lfm);

+      for (plane = 0; plane < num_planes; ++plane) {

+        if (use_420)

+          vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);

+        else

+          vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,

+                                        mi_row, mi_col);

+      }

+      sync_write(lf_sync, r, c, sb_cols);

+    }

+  }

+}

+// Row-based multi-threaded loopfilter hook

+static int loop_filter_row_worker(VP9LfSync *const lf_sync,

+                                  LFWorkerData *const lf_data) {

+  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,

+                          lf_data->start, lf_data->stop, lf_data->y_only,

+                          lf_sync);

+  return 1;

+}

+static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,

+                                VP9_COMMON *cm,

+                                struct macroblockd_plane planes[MAX_MB_PLANE],

+                                int start, int stop, int y_only,

+                                VP9Worker *workers, int nworkers,

+                                VP9LfSync *lf_sync) {

+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();

+  // Number of superblock rows and cols

+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;

+  // Decoder may allocate more threads than number of tiles based on user's

+  // input.

+  const int tile_cols = 1 << cm->log2_tile_cols;

+  const int num_workers = MIN(nworkers, tile_cols);

+  int i;

+  if (!lf_sync->sync_range || cm->last_height != cm->height ||

+      num_workers > lf_sync->num_workers) {

+    vp9_loop_filter_dealloc(lf_sync);

+    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);

+  }

+  // Initialize cur_sb_col to -1 for all SB rows.

+  vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

+  // Set up loopfilter thread data.

+  // The decoder is capping num_workers because it has been observed that using

+  // more threads on the loopfilter than there are cores will hurt performance

+  // on Android. This is because the system will only schedule the tile decode

+  // workers on cores equal to the number of tile columns. Then if the decoder

+  // tries to use more threads for the loopfilter, it will hurt performance

+  // because of contention. If the multithreading code changes in the future

+  // then the number of workers used by the loopfilter should be revisited.

+  for (i = 0; i < num_workers; ++i) {

+    VP9Worker *const worker = &workers[i];

+    LFWorkerData *const lf_data = &lf_sync->lfdata[i];

+    worker->hook = (VP9WorkerHook)loop_filter_row_worker;

+    worker->data1 = lf_sync;

+    worker->data2 = lf_data;

+    // Loopfilter data

+    vp9_loop_filter_data_reset(lf_data, frame, cm, planes);

+    lf_data->start = start + i * MI_BLOCK_SIZE;

+    lf_data->stop = stop;

+    lf_data->y_only = y_only;

+    // Start loopfiltering

+    if (i == num_workers - 1) {

+      winterface->execute(worker);

+    } else {

+      winterface->launch(worker);

+    }

+  }

+  // Wait till all rows are finished

+  for (i = 0; i < num_workers; ++i) {

+    winterface->sync(&workers[i]);

+  }

+}

+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,

+                              VP9_COMMON *cm,

+                              struct macroblockd_plane planes[MAX_MB_PLANE],

+                              int frame_filter_level,

+                              int y_only, int partial_frame,

+                              VP9Worker *workers, int num_workers,

+                              VP9LfSync *lf_sync) {

+  int start_mi_row, end_mi_row, mi_rows_to_filter;

+  if (!frame_filter_level) return;

+  start_mi_row = 0;

+  mi_rows_to_filter = cm->mi_rows;

+  if (partial_frame && cm->mi_rows > 8) {

+    start_mi_row = cm->mi_rows >> 1;

+    start_mi_row &= 0xfffffff8;

+    mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);

+  }

+  end_mi_row = start_mi_row + mi_rows_to_filter;

+  vp9_loop_filter_frame_init(cm, frame_filter_level);

+  loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row,

+                      y_only, workers, num_workers, lf_sync);

+}

+// Set up nsync by width.

+static INLINE int get_sync_range(int width) {

+  // nsync numbers are picked by testing. For example, for 4k

+  // video, using 4 gives best performance.

+  if (width < 640)

+    return 1;

+  else if (width <= 1280)

+    return 2;

+  else if (width <= 4096)

+    return 4;

+  else

+    return 8;

+}

+// Allocate memory for lf row synchronization

+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,

+                           int width, int num_workers) {

+  lf_sync->rows = rows;

+#if CONFIG_MULTITHREAD

+  {

+    int i;

+    CHECK_MEM_ERROR(cm, lf_sync->mutex_,

+                    vpx_malloc(sizeof(*lf_sync->mutex_) * rows));

+    if (lf_sync->mutex_) {

+      for (i = 0; i < rows; ++i) {

+        pthread_mutex_init(&lf_sync->mutex_[i], NULL);

+      }

+    }

+    CHECK_MEM_ERROR(cm, lf_sync->cond_,

+                    vpx_malloc(sizeof(*lf_sync->cond_) * rows));

+    if (lf_sync->cond_) {

+      for (i = 0; i < rows; ++i) {

+        pthread_cond_init(&lf_sync->cond_[i], NULL);

+      }

+    }

+  }

+#endif  // CONFIG_MULTITHREAD

+  CHECK_MEM_ERROR(cm, lf_sync->lfdata,

+                  vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));

+  lf_sync->num_workers = num_workers;

+  CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,

+                  vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));

+  // Set up nsync.

+  lf_sync->sync_range = get_sync_range(width);

+}

+// Deallocate lf synchronization related mutex and data

+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {

+  if (lf_sync != NULL) {

+#if CONFIG_MULTITHREAD

+    int i;

+    if (lf_sync->mutex_ != NULL) {

+      for (i = 0; i < lf_sync->rows; ++i) {

+        pthread_mutex_destroy(&lf_sync->mutex_[i]);

+      }

+      vpx_free(lf_sync->mutex_);

+    }

+    if (lf_sync->cond_ != NULL) {

+      for (i = 0; i < lf_sync->rows; ++i) {

+        pthread_cond_destroy(&lf_sync->cond_[i]);

+      }

+      vpx_free(lf_sync->cond_);

+    }

+#endif  // CONFIG_MULTITHREAD

+    vpx_free(lf_sync->lfdata);

+    vpx_free(lf_sync->cur_sb_col);

+    // clear the structure as the source of this call may be a resize in which

+    // case this call will be followed by an _alloc() which may fail.

+    vp9_zero(*lf_sync);

+  }

+}

--- /dev/null

+++ b/vp9/common/vp9_thread_common.h

@@ -1,0 +1,53 @@

+/*

+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VP9_COMMON_VP9_LOOPFILTER_THREAD_H_

+#define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_

+#include "./vpx_config.h"

+#include "vp9/common/vp9_loopfilter.h"

+#include "vp9/common/vp9_thread.h"

+struct VP9Common;

+// Loopfilter row synchronization

+typedef struct VP9LfSyncData {

+#if CONFIG_MULTITHREAD

+  pthread_mutex_t *mutex_;

+  pthread_cond_t *cond_;

+#endif

+  // Allocate memory to store the loop-filtered superblock index in each row.

+  int *cur_sb_col;

+  // The optimal sync_range for different resolution and platform should be

+  // determined by testing. Currently, it is chosen to be a power-of-2 number.

+  int sync_range;

+  int rows;

+  // Row-based parallel loopfilter data

+  LFWorkerData *lfdata;

+  int num_workers;

+} VP9LfSync;

+// Allocate memory for loopfilter row synchronization.

+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, struct VP9Common *cm, int rows,

+                           int width, int num_workers);

+// Deallocate loopfilter synchronization related mutex and data.

+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);

+// Multi-threaded loopfilter that uses the tile threads.

+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,

+                              struct VP9Common *cm,

+                              struct macroblockd_plane planes[MAX_MB_PLANE],

+                              int frame_filter_level,

+                              int y_only, int partial_frame,

+                              VP9Worker *workers, int num_workers,

+                              VP9LfSync *lf_sync);

+#endif  // VP9_COMMON_VP9_LOOPFILTER_THREAD_H_

--- a/vp9/decoder/vp9_decodeframe.c

+++ b/vp9/decoder/vp9_decodeframe.c

@@ -23,7 +23,7 @@

 #include "vp9/common/vp9_entropy.h"

 #include "vp9/common/vp9_entropymode.h"

 #include "vp9/common/vp9_idct.h"

-#include "vp9/common/vp9_loopfilter_thread.h"

+#include "vp9/common/vp9_thread_common.h"

 #include "vp9/common/vp9_pred_common.h"

 #include "vp9/common/vp9_quant_common.h"

 #include "vp9/common/vp9_reconintra.h"

--- a/vp9/decoder/vp9_decoder.h

+++ b/vp9/decoder/vp9_decoder.h

@@ -15,7 +15,7 @@

 #include "vpx/vpx_codec.h"

 #include "vpx_scale/yv12config.h"

-#include "vp9/common/vp9_loopfilter_thread.h"

+#include "vp9/common/vp9_thread_common.h"

 #include "vp9/common/vp9_onyxc_int.h"

 #include "vp9/common/vp9_ppflags.h"

 #include "vp9/common/vp9_thread.h"

--- a/vp9/encoder/vp9_encoder.h

+++ b/vp9/encoder/vp9_encoder.h

@@ -19,7 +19,7 @@

 #include "vp9/common/vp9_ppflags.h"

 #include "vp9/common/vp9_entropymode.h"

-#include "vp9/common/vp9_loopfilter_thread.h"

+#include "vp9/common/vp9_thread_common.h"

 #include "vp9/common/vp9_onyxc_int.h"

 #include "vp9/common/vp9_thread.h"

--- a/vp9/vp9_common.mk

+++ b/vp9/vp9_common.mk

@@ -33,7 +33,7 @@

 VP9_COMMON_SRCS-yes += common/vp9_enums.h

 VP9_COMMON_SRCS-yes += common/vp9_idct.h

 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h

-VP9_COMMON_SRCS-yes += common/vp9_loopfilter_thread.h

+VP9_COMMON_SRCS-yes += common/vp9_thread_common.h

 VP9_COMMON_SRCS-yes += common/vp9_mv.h

 VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h

 VP9_COMMON_SRCS-yes += common/vp9_pred_common.h

@@ -57,7 +57,7 @@

 VP9_COMMON_SRCS-yes += common/vp9_tile_common.c

 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c

 VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c

-VP9_COMMON_SRCS-yes += common/vp9_loopfilter_thread.c

+VP9_COMMON_SRCS-yes += common/vp9_thread_common.c

 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c

 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h

 VP9_COMMON_SRCS-yes += common/vp9_quant_common.c

--

⑨