ref: 71061e9332c05324007e7f6c900285273793366d
parent: eeb288d568fde3512e4362d73e4d684af3bcf87c
author: Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
date: Fri Feb 10 11:25:50 EST 2017
Row based multi-threading of encoding stage (Yunqing Wang) This patch implements the row-based multi-threading within tiles in the encoding pass, and substantially speeds up the multi-threaded encoder in VP9. Speed tests at speed 1 on STDHD (using 4 tiles) set show that the average speedups of the encoding pass (second pass in the 2-pass encoding) is 7% while using 2 threads, 16% while using 4 threads, 85% while using 8 threads, and 116% while using 16 threads. Change-Id: I12e41dbc171951958af9e6d098efd6e2c82827de
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -925,10 +925,11 @@
static int encode_tile_worker(VP9_COMP *cpi, VP9BitstreamWorkerData *data) {
MACROBLOCKD *const xd = &data->xd;
+ const int tile_row = 0;
vpx_start_encode(&data->bit_writer, data->dest);
write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info,
- &data->bit_writer, 0, data->tile_idx, &data->max_mv_magnitude,
- data->interp_filter_selected);
+ &data->bit_writer, tile_row, data->tile_idx,
+ &data->max_mv_magnitude, data->interp_filter_selected);
vpx_stop_encode(&data->bit_writer);
return 1;
}
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -11,6 +11,8 @@
#ifndef VP9_ENCODER_VP9_BLOCK_H_
#define VP9_ENCODER_VP9_BLOCK_H_
+#include "vpx_util/vpx_thread.h"
+
#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_entropy.h"
@@ -88,6 +90,9 @@
int mb_energy;
int *m_search_count_ptr;
int *ex_search_count_ptr;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *search_count_mutex;
+#endif
// These are set to their default values at the beginning, and then adjusted
// further in the encoding process.
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3095,6 +3095,10 @@
const int mi_col_start = tile_info->mi_col_start;
const int mi_col_end = tile_info->mi_col_end;
int mi_col;
+ const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+ const int num_sb_cols =
+ get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2);
+ int sb_col_in_tile;
// Initialize the left context for the new SB row
memset(&xd->left_context, 0, sizeof(xd->left_context));
@@ -3101,7 +3105,8 @@
memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
// Code each SB in the row
- for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) {
+ for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end;
+ mi_col += MI_BLOCK_SIZE, sb_col_in_tile++) {
const struct segmentation *const seg = &cm->seg;
int dummy_rate;
int64_t dummy_dist;
@@ -3112,6 +3117,9 @@
const int idx_str = cm->mi_stride * mi_row + mi_col;
MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+ (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row,
+ sb_col_in_tile - 1);
+
if (sf->adaptive_pred_interp_filter) {
for (i = 0; i < 64; ++i) td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
@@ -3163,6 +3171,8 @@
rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rdc, INT64_MAX, td->pc_root);
}
+ (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
+ sb_col_in_tile, num_sb_cols);
}
}
@@ -4109,13 +4119,17 @@
tile_data->mode_map[i][j] = j;
}
}
+#if CONFIG_MULTITHREAD
+ tile_data->search_count_mutex = NULL;
+ tile_data->enc_row_mt_mutex = NULL;
+#endif
}
}
for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- TileInfo *tile_info =
- &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ TileInfo *tile_info = &this_tile->tile_info;
vp9_tile_init(tile_info, cm, tile_row, tile_col);
cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
@@ -4125,6 +4139,10 @@
cpi->tplist[tile_row][tile_col] = tplist + tplist_count;
tplist = cpi->tplist[tile_row][tile_col];
tplist_count = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2);
+
+ // Set up pointers to per thread motion search counters.
+ this_tile->m_search_count = 0; // Count of motion search hits.
+ this_tile->ex_search_count = 0; // Exhaustive mesh search hits.
}
}
}
@@ -4170,10 +4188,11 @@
int mi_row;
// Set up pointers to per thread motion search counters.
- this_tile->m_search_count = 0; // Count of motion search hits.
- this_tile->ex_search_count = 0; // Exhaustive mesh search hits.
td->mb.m_search_count_ptr = &this_tile->m_search_count;
td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
+#if CONFIG_MULTITHREAD
+ td->mb.search_count_mutex = this_tile->search_count_mutex;
+#endif
for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE)
vp9_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
@@ -4289,11 +4308,20 @@
}
#endif
- // If allowed, encoding tiles in parallel with one thread handling one tile.
- if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
- vp9_encode_tiles_mt(cpi);
- else
- encode_tiles(cpi);
+ if (!cpi->new_mt) {
+ cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy;
+ cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy;
+ // If allowed, encoding tiles in parallel with one thread handling one
+ // tile when row based multi-threading is disabled.
+ if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
+ vp9_encode_tiles_mt(cpi);
+ else
+ encode_tiles(cpi);
+ } else {
+ cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read;
+ cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write;
+ vp9_encode_tiles_row_mt(cpi);
+ }
vpx_usec_timer_mark(&emr_timer);
cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -39,6 +39,9 @@
void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td, int tile_row,
int tile_col);
+void vp9_encode_sb_row(struct VP9_COMP *cpi, struct ThreadData *td,
+ int tile_row, int tile_col, int mi_row);
+
void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q);
#ifdef __cplusplus
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1575,17 +1575,7 @@
highbd_set_var_fns(cpi);
#endif
- // Enable multi-threading for first pass.
- cpi->new_mt = 0;
- if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) &&
- cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) &&
- cpi->oxcf.new_mt && !cpi->use_svc)
- cpi->new_mt = 1;
-
- if (cpi->oxcf.mode == GOOD && cpi->oxcf.speed < 5 &&
- (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) && cpi->oxcf.new_mt &&
- !cpi->use_svc)
- cpi->new_mt = 1;
+ vp9_set_new_mt(cpi);
}
#ifndef M_LOG2_E
@@ -5212,4 +5202,18 @@
if (flags & VP8_EFLAG_NO_UPD_ENTROPY) {
vp9_update_entropy(cpi, 0);
}
+}
+
+void vp9_set_new_mt(VP9_COMP *cpi) {
+ // Enable row based multi-threading for supported modes of encoding
+ cpi->new_mt = 0;
+ if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) &&
+ cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) &&
+ cpi->oxcf.new_mt && !cpi->use_svc)
+ cpi->new_mt = 1;
+
+ if (cpi->oxcf.mode == GOOD && cpi->oxcf.speed < 5 &&
+ (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) && cpi->oxcf.new_mt &&
+ !cpi->use_svc)
+ cpi->new_mt = 1;
}
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -276,6 +276,10 @@
int ex_search_count;
FIRSTPASS_DATA fp_data;
VP9RowMTSync row_mt_sync;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *search_count_mutex;
+ pthread_mutex_t *enc_row_mt_mutex;
+#endif
} TileDataEnc;
typedef struct RowMTInfo {
@@ -896,6 +900,8 @@
VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
void vp9_new_framerate(VP9_COMP *cpi, double framerate);
+
+void vp9_set_new_mt(VP9_COMP *cpi);
#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -341,7 +341,7 @@
#if CONFIG_MULTITHREAD
const int nsync = row_mt_sync->sync_range;
int cur;
- // Only signal when there are enough filtered SB for next row to run.
+ // Only signal when there are enough encoded blocks for next row to run.
int sig = 1;
if (c < cols - 1) {
@@ -541,4 +541,101 @@
launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook,
multi_thread_ctxt, num_workers);
+}
+
+static int enc_row_mt_worker_hook(EncWorkerData *const thread_data,
+ MultiThreadHandle *multi_thread_ctxt) {
+ VP9_COMP *const cpi = thread_data->cpi;
+ const VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ int tile_row, tile_col;
+ TileDataEnc *this_tile;
+ int end_of_frame;
+ int thread_id = thread_data->thread_id;
+ int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
+ JobNode *proc_job = NULL;
+ int mi_row;
+
+ end_of_frame = 0;
+ while (0 == end_of_frame) {
+ // Get the next job in the queue
+ proc_job =
+ (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id);
+ if (NULL == proc_job) {
+ // Query for the status of other tiles
+ end_of_frame = vp9_get_tiles_proc_status(
+ multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id,
+ tile_cols);
+ } else {
+ tile_col = proc_job->tile_col_id;
+ tile_row = proc_job->tile_row_id;
+ mi_row = proc_job->vert_unit_row_num * MI_BLOCK_SIZE;
+
+ this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ thread_data->td->mb.m_search_count_ptr = &this_tile->m_search_count;
+ thread_data->td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
+#if CONFIG_MULTITHREAD
+ thread_data->td->mb.search_count_mutex = this_tile->search_count_mutex;
+#endif
+
+ vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row);
+ }
+ }
+ return 0;
+}
+
+void vp9_encode_tiles_row_mt(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+ int num_workers = VPXMAX(cpi->oxcf.max_threads, 1);
+ int i;
+
+ if (multi_thread_ctxt->allocated_tile_cols < tile_cols ||
+ multi_thread_ctxt->allocated_tile_rows < tile_rows ||
+ multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) {
+ vp9_row_mt_mem_dealloc(cpi);
+ vp9_init_tile_data(cpi);
+ vp9_row_mt_mem_alloc(cpi);
+ } else {
+ vp9_init_tile_data(cpi);
+ }
+
+ create_enc_workers(cpi, num_workers);
+
+ vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers);
+
+ vp9_prepare_job_queue(cpi, ENCODE_JOB);
+
+ vp9_multi_thread_tile_init(cpi);
+
+ for (i = 0; i < num_workers; i++) {
+ EncWorkerData *thread_data;
+ thread_data = &cpi->tile_thr_data[i];
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ thread_data->td->rd_counts = cpi->td.rd_counts;
+ }
+ if (thread_data->td->counts != &cpi->common.counts) {
+ memcpy(thread_data->td->counts, &cpi->common.counts,
+ sizeof(cpi->common.counts));
+ }
+ }
+
+ launch_enc_workers(cpi, (VPxWorkerHook)enc_row_mt_worker_hook,
+ multi_thread_ctxt, num_workers);
+
+ for (i = 0; i < num_workers; i++) {
+ VPxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+
+ // Accumulate counters.
+ if (i < cpi->num_workers - 1) {
+ vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0);
+ accumulate_rd_opt(&cpi->td, thread_data->td);
+ }
+ }
}
--- a/vp9/encoder/vp9_ethread.h
+++ b/vp9/encoder/vp9_ethread.h
@@ -44,6 +44,8 @@
void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
+void vp9_encode_tiles_row_mt(struct VP9_COMP *cpi);
+
void vp9_encode_fp_row_mt(struct VP9_COMP *cpi);
void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c);
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1993,9 +1993,18 @@
int range = sf->mesh_patterns[0].range;
int baseline_interval_divisor;
+#if CONFIG_MULTITHREAD
+ if (NULL != x->search_count_mutex) pthread_mutex_lock(x->search_count_mutex);
+#endif
+
// Keep track of number of exhaustive calls (this frame in this thread).
++(*x->ex_search_count_ptr);
+#if CONFIG_MULTITHREAD
+ if (NULL != x->search_count_mutex)
+ pthread_mutex_unlock(x->search_count_mutex);
+#endif
+
// Trap illegal values for interval and range for this function.
if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) ||
(interval > range))
@@ -2356,13 +2365,27 @@
#define MIN_EX_SEARCH_LIMIT 128
static int is_exhaustive_allowed(VP9_COMP *cpi, MACROBLOCK *x) {
const SPEED_FEATURES *const sf = &cpi->sf;
- const int max_ex =
- VPXMAX(MIN_EX_SEARCH_LIMIT,
- (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+ int is_exhaustive_allowed;
+ int max_ex;
- return sf->allow_exhaustive_searches &&
- (sf->exhaustive_searches_thresh < INT_MAX) &&
- (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref;
+#if CONFIG_MULTITHREAD
+ if (NULL != x->search_count_mutex) pthread_mutex_lock(x->search_count_mutex);
+#endif
+
+ max_ex = VPXMAX(MIN_EX_SEARCH_LIMIT,
+ (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+
+ is_exhaustive_allowed = sf->allow_exhaustive_searches &&
+ (sf->exhaustive_searches_thresh < INT_MAX) &&
+ (*x->ex_search_count_ptr <= max_ex) &&
+ !cpi->rc.is_src_frame_alt_ref;
+
+#if CONFIG_MULTITHREAD
+ if (NULL != x->search_count_mutex)
+ pthread_mutex_unlock(x->search_count_mutex);
+#endif
+
+ return is_exhaustive_allowed;
}
int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
@@ -2407,8 +2430,18 @@
MAX_MVSEARCH_STEPS - 1 - step_param, 1,
cost_list, fn_ptr, ref_mv, tmp_mv);
+#if CONFIG_MULTITHREAD
+ if (NULL != x->search_count_mutex)
+ pthread_mutex_lock(x->search_count_mutex);
+#endif
+
// Keep track of number of searches (this frame in this thread).
++(*x->m_search_count_ptr);
+
+#if CONFIG_MULTITHREAD
+ if (NULL != x->search_count_mutex)
+ pthread_mutex_unlock(x->search_count_mutex);
+#endif
// Should we allow a follow on exhaustive search?
if (is_exhaustive_allowed(cpi, x)) {
--- a/vp9/encoder/vp9_multi_thread.c
+++ b/vp9/encoder/vp9_multi_thread.c
@@ -100,11 +100,32 @@
multi_thread_ctxt->num_tile_vert_sbs[tile_row] =
get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2);
}
+
+#if CONFIG_MULTITHREAD
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+
+ CHECK_MEM_ERROR(cm, this_tile->search_count_mutex,
+ vpx_malloc(sizeof(*this_tile->search_count_mutex)));
+
+ pthread_mutex_init(this_tile->search_count_mutex, NULL);
+
+ CHECK_MEM_ERROR(cm, this_tile->enc_row_mt_mutex,
+ vpx_malloc(sizeof(*this_tile->enc_row_mt_mutex)));
+
+ pthread_mutex_init(this_tile->enc_row_mt_mutex, NULL);
+ }
+ }
+#endif
}
void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) {
MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
int tile_col;
+#if CONFIG_MULTITHREAD
+ int tile_row;
+#endif
// Deallocate memory for job queue
if (multi_thread_ctxt->job_queue) vpx_free(multi_thread_ctxt->job_queue);
@@ -124,6 +145,25 @@
TileDataEnc *this_tile = &cpi->tile_data[tile_col];
vp9_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
}
+
+#if CONFIG_MULTITHREAD
+ for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows;
+ tile_row++) {
+ for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
+ tile_col++) {
+ TileDataEnc *this_tile =
+ &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols +
+ tile_col];
+ pthread_mutex_destroy(this_tile->search_count_mutex);
+ vpx_free(this_tile->search_count_mutex);
+ this_tile->search_count_mutex = NULL;
+
+ pthread_mutex_destroy(this_tile->enc_row_mt_mutex);
+ vpx_free(this_tile->enc_row_mt_mutex);
+ this_tile->enc_row_mt_mutex = NULL;
+ }
+ }
+#endif
}
void vp9_multi_thread_tile_init(VP9_COMP *cpi) {
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1657,7 +1657,10 @@
mode_rd_thresh = mode_rd_thresh << 3;
if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
- rd_thresh_freq_fact[mode_index]))
+#if CONFIG_MULTITHREAD
+ tile_data->enc_row_mt_mutex,
+#endif
+ &rd_thresh_freq_fact[mode_index]))
continue;
if (this_mode == NEWMV) {
@@ -2018,7 +2021,10 @@
continue;
if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
- rd_thresh_freq_fact[mode_index]))
+#if CONFIG_MULTITHREAD
+ tile_data->enc_row_mt_mutex,
+#endif
+ &rd_thresh_freq_fact[mode_index]))
continue;
mi->mode = this_mode;
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -610,7 +610,15 @@
}
void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
- int bsize, int best_mode_index) {
+ int bsize,
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex,
+#endif
+ int best_mode_index) {
+#if CONFIG_MULTITHREAD
+ if (NULL != enc_row_mt_mutex) pthread_mutex_lock(enc_row_mt_mutex);
+#endif
+
if (rd_thresh > 0) {
const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
int mode;
@@ -628,6 +636,10 @@
}
}
}
+
+#if CONFIG_MULTITHREAD
+ if (NULL != enc_row_mt_mutex) pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
}
int vp9_get_intra_cost_penalty(int qindex, int qdelta,
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -164,11 +164,32 @@
void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi);
void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize,
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex,
+#endif
int best_mode_index);
static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
- int thresh_fact) {
- return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex,
+#endif
+ const int *const thresh_fact) {
+ int is_rd_less_than_thresh;
+
+#if CONFIG_MULTITHREAD
+ // Synchronize to ensure data coherency as thresh_freq_fact is maintained at
+ // tile level and not thread-safe with row based multi-threading
+ if (NULL != enc_row_mt_mutex) pthread_mutex_lock(enc_row_mt_mutex);
+#endif
+
+ is_rd_less_than_thresh =
+ best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX;
+
+#if CONFIG_MULTITHREAD
+ if (NULL != enc_row_mt_mutex) pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
+
+ return is_rd_less_than_thresh;
}
static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) {
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -3043,7 +3043,10 @@
const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
int64_t mode_threshold[MAX_MODES];
- int *mode_map = tile_data->mode_map[bsize];
+ int *tile_mode_map = tile_data->mode_map[bsize];
+ int mode_map[MAX_MODES]; // Maintain mode_map information locally to avoid
+ // lock mechanism involved with reads from
+ // tile_mode_map
const int mode_search_skip_flags = sf->mode_search_skip_flags;
int64_t mask_filter = 0;
int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
@@ -3155,10 +3158,19 @@
~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0;
+
+#if CONFIG_MULTITHREAD
+ if (NULL != tile_data->enc_row_mt_mutex)
+ pthread_mutex_lock(tile_data->enc_row_mt_mutex);
+#endif
+
for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
midx = sf->schedule_mode_search ? mode_skip_start : 0;
+
+ memcpy(mode_map, tile_mode_map, sizeof(mode_map));
+
while (midx > 4) {
uint8_t end_pos = 0;
for (i = 5; i < midx; ++i) {
@@ -3172,6 +3184,13 @@
midx = end_pos;
}
+ memcpy(tile_mode_map, mode_map, sizeof(mode_map));
+
+#if CONFIG_MULTITHREAD
+ if (NULL != tile_data->enc_row_mt_mutex)
+ pthread_mutex_unlock(tile_data->enc_row_mt_mutex);
+#endif
+
for (midx = 0; midx < MAX_MODES; ++midx) {
int mode_index = mode_map[midx];
int mode_excluded = 0;
@@ -3573,6 +3592,9 @@
}
if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
+ // If adaptive interp filter is enabled, then the current leaf node of 8x8
+ // data is needed for sub8x8. Hence preserve the context.
+ if (cpi->new_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0];
rd_cost->rate = INT_MAX;
rd_cost->rdcost = INT64_MAX;
return;
@@ -3599,7 +3621,11 @@
if (!cpi->rc.is_src_frame_alt_ref)
vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
- sf->adaptive_rd_thresh, bsize, best_mode_index);
+ sf->adaptive_rd_thresh, bsize,
+#if CONFIG_MULTITHREAD
+ tile_data->enc_row_mt_mutex,
+#endif
+ best_mode_index);
// macroblock modes
*mi = best_mbmode;
@@ -3737,7 +3763,11 @@
(cm->interp_filter == mi->interp_filter));
vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
- cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
+ cpi->sf.adaptive_rd_thresh, bsize,
+#if CONFIG_MULTITHREAD
+ tile_data->enc_row_mt_mutex,
+#endif
+ THR_ZEROMV);
vp9_zero(best_pred_diff);
vp9_zero(best_filter_diff);
@@ -3789,6 +3819,7 @@
int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
int internal_active_edge =
vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi);
+ const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
memset(x->zcoeff_blk[TX_4X4], 0, 4);
@@ -3880,7 +3911,10 @@
if (!internal_active_edge &&
rd_less_than_thresh(best_rd,
rd_opt->threshes[segment_id][bsize][ref_index],
- tile_data->thresh_freq_fact[bsize][ref_index]))
+#if CONFIG_MULTITHREAD
+ tile_data->enc_row_mt_mutex,
+#endif
+ &rd_thresh_freq_fact[ref_index]))
continue;
comp_pred = second_ref_frame > INTRA_FRAME;
@@ -4324,7 +4358,11 @@
!is_inter_block(&best_mbmode));
vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, sf->adaptive_rd_thresh,
- bsize, best_ref_index);
+ bsize,
+#if CONFIG_MULTITHREAD
+ tile_data->enc_row_mt_mutex,
+#endif
+ best_ref_index);
// macroblock modes
*mi = best_mbmode;
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -585,6 +585,15 @@
rd->thresh_mult_sub8x8[i] = INT_MAX;
}
}
+
+ // With row based multi-threading, the following speed features
+ // have to be disabled to guarantee that bitstreams encoded with single thread
+ // and multiple threads match
+ if (cpi->oxcf.ethread_bit_match) {
+ sf->adaptive_rd_thresh = 0;
+ sf->allow_exhaustive_searches = 0;
+ sf->adaptive_pred_interp_filter = 0;
+ }
}
void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
@@ -746,5 +755,14 @@
if (!cpi->oxcf.frame_periodic_boost) {
sf->max_delta_qindex = 0;
+ }
+
+ // With row based multi-threading, the following speed features
+ // have to be disabled to guarantee that bitstreams encoded with single thread
+ // and multiple threads match
+ if (cpi->oxcf.ethread_bit_match) {
+ sf->adaptive_rd_thresh = 0;
+ sf->allow_exhaustive_searches = 0;
+ sf->adaptive_pred_interp_filter = 0;
}
}
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -1459,6 +1459,9 @@
cfg->ss_number_layers > 1 && cfg->ts_number_layers > 1) {
return VPX_CODEC_INVALID_PARAM;
}
+
+ vp9_set_new_mt(ctx->cpi);
+
return VPX_CODEC_OK;
}