shithub: libvpx

ref: 633dbcb458a447e3d22a62ab2e358386da424141
parent: 8d0c8c5e6bb1b410d5c6c7f176a6782717c9ff57
parent: 8b0c11c3588963fa02be0cad36a6a23cdb748cf9
author: Yunqing Wang <yunqingwang@google.com>
date: Wed Jan 25 11:40:32 EST 2017

Merge "Multi-threading of first pass stats collection"

--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -82,9 +82,8 @@
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0);
 
       // For now, new_mt_mode only works for 2-pass encoding.
-      // Enable this once the fp mt patch is checked in.
-      // if (encoding_mode_ == ::libvpx_test::kTwoPassGood)
-      //  encoder->Control(VP9E_SET_NEW_MT, new_mt_mode_);
+      if (encoding_mode_ == ::libvpx_test::kTwoPassGood)
+        encoder->Control(VP9E_SET_NEW_MT, new_mt_mode_);
 
       encoder_initialized_ = true;
     }
@@ -131,7 +130,7 @@
 
     for (j = 0; j < kDbl; ++j) {
       EXPECT_LE(fabs(*frame_stats1 - *frame_stats2),
-                fabs(*frame_stats1) / 1000.0);
+                fabs(*frame_stats1) / 10000.0);
       frame_stats1++;
       frame_stats2++;
     }
@@ -146,7 +145,7 @@
 }
 
 TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 50);
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
 
   first_pass_only_ = 1;
   cfg_.rc_target_bitrate = 1000;
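
Editorial note on the test changes above: the clip is extended from 50 to 60 frames and the per-field tolerance between the single-threaded and multi-threaded first-pass stats is tightened from 1/1000 to 1/10000 of the reference value. A minimal sketch of the comparison the EXPECT_LE loop performs, treating the two FIRSTPASS_STATS records as arrays of doubles (the field count is whatever kDbl is in the test):

    #include <math.h>

    /* Sketch only: returns 1 when every field of the multi-threaded stats is
     * within 1/10000 of the single-threaded reference. */
    static int fp_stats_close(const double *ref, const double *mt, int n_fields) {
      int j;
      for (j = 0; j < n_fields; ++j) {
        if (fabs(ref[j] - mt[j]) > fabs(ref[j]) / 10000.0) return 0;
      }
      return 1;
    }
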
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -131,6 +131,10 @@
   int use_lp32x32fdct;
   int skip_encode;
 
+  // In first pass, intra prediction is done based on source pixels
+  // at tile boundaries
+  int fp_src_pred;
+
   // use fast quantization process
   int quant_fp;
 
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -773,9 +773,10 @@
     }
   }
 
-  vp9_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst,
-                          x->skip_encode ? src_stride : dst_stride, dst,
-                          dst_stride, col, row, plane);
+  vp9_predict_intra_block(
+      xd, bwl, tx_size, mode, (x->skip_encode || x->fp_src_pred) ? src : dst,
+      (x->skip_encode || x->fp_src_pred) ? src_stride : dst_stride, dst,
+      dst_stride, col, row, plane);
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
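
The new fp_src_pred flag extends the existing skip_encode fallback: when first-pass tile columns run on separate threads, the reconstruction of the neighbouring tile may not exist yet, so blocks on a tile boundary predict from source pixels instead. A self-contained sketch of the buffer selection performed by the call above (the helper name is illustrative, not the library API):

    #include <stdint.h>

    /* Choose the reference buffer for intra prediction: source pixels when the
     * block skips encoding or sits on a first-pass tile boundary, otherwise the
     * reconstructed destination. */
    static const uint8_t *intra_pred_ref(int skip_encode, int fp_src_pred,
                                         const uint8_t *src, const uint8_t *dst) {
      return (skip_encode || fp_src_pred) ? src : dst;
    }
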
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -50,6 +50,7 @@
 #include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mbgraph.h"
+#include "vp9/encoder/vp9_multi_thread.h"
 #include "vp9/encoder/vp9_noise_estimate.h"
 #include "vp9/encoder/vp9_picklpf.h"
 #include "vp9/encoder/vp9_ratectrl.h"
@@ -1563,6 +1564,13 @@
 #if CONFIG_VP9_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
 #endif
+
+  // Enable multi-threading for first pass.
+  cpi->new_mt = 0;
+  if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) &&
+       cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) &&
+      cpi->oxcf.new_mt)
+    cpi->new_mt = 1;
 }
 
 #ifndef M_LOG2_E
@@ -1719,6 +1727,12 @@
   }
 #endif
 
+#if ENABLE_MT_BIT_MATCH
+  CHECK_MEM_ERROR(
+      cm, cpi->twopass.fp_mb_float_stats,
+      vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1));
+#endif
+
   cpi->refresh_alt_ref_frame = 0;
   cpi->multi_arf_last_grp_enabled = 0;
 
@@ -2076,6 +2090,7 @@
   }
   vpx_free(cpi->tile_thr_data);
   vpx_free(cpi->workers);
+  vp9_row_mt_mem_dealloc(cpi);
 
   if (cpi->num_workers > 1) {
     vp9_loop_filter_dealloc(&cpi->lf_row_sync);
@@ -2098,6 +2113,11 @@
   }
 #endif
 
+#if ENABLE_MT_BIT_MATCH
+  vpx_free(cpi->twopass.fp_mb_float_stats);
+  cpi->twopass.fp_mb_float_stats = NULL;
+#endif
+
   vp9_remove_common(cm);
   vp9_free_ref_frame_buffers(cm->buffer_pool);
 #if CONFIG_VP9_POSTPROC
@@ -4802,6 +4822,7 @@
     for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX;
   }
 
+  cpi->td.mb.fp_src_pred = 0;
   if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) {
     const int lossless = is_lossless_requested(oxcf);
 #if CONFIG_VP9_HIGHBITDEPTH
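
Two things happen in this file: cpi->new_mt is switched on only for the first pass of two-pass GOOD/BEST encodes at speed < 5 when the caller also sets oxcf.new_mt, and, under ENABLE_MT_BIT_MATCH, a per-macroblock fp_mb_float_stats buffer is allocated and freed. The buffer exists because floating-point addition is not associative, so summing per-thread contributions in completion order would not bit-match the single-threaded encoder; the per-MB values are instead re-summed in raster order (see accumulate_floating_point_stats() in vp9_firstpass.c below). A tiny standalone demonstration of the non-associativity being worked around:

    #include <stdio.h>

    int main(void) {
      /* With doubles, (a + b) + c and a + (b + c) can differ in the low bits,
       * so the order in which threads finish would change the totals. */
      double a = 1e16, b = 1.0, c = 1.0;
      printf("%d\n", (a + b) + c == a + (b + c)); /* prints 0 */
      return 0;
    }
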
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -33,7 +33,9 @@
 #include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/encoder/vp9_context_tree.h"
 #include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_job_queue.h"
 #include "vp9/encoder/vp9_lookahead.h"
 #include "vp9/encoder/vp9_mbgraph.h"
 #include "vp9/encoder/vp9_mcomp.h"
@@ -256,6 +258,8 @@
   int render_width;
   int render_height;
   VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
+
+  int new_mt;
 } VP9EncoderConfig;
 
 static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
@@ -269,8 +273,34 @@
   int mode_map[BLOCK_SIZES][MAX_MODES];
   int m_search_count;
   int ex_search_count;
+  FIRSTPASS_DATA fp_data;
+  VP9RowMTSync row_mt_sync;
 } TileDataEnc;
 
+typedef struct RowMTInfo {
+  JobQueueHandle job_queue_hdl;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t job_mutex;
+#endif
+} RowMTInfo;
+
+typedef struct MultiThreadHandle {
+  int allocated_tile_rows;
+  int allocated_tile_cols;
+  int allocated_vert_unit_rows;
+
+  // Frame level params
+  int num_tile_vert_sbs[MAX_NUM_TILE_ROWS];
+
+  // Job Queue structure and handles
+  JobQueue *job_queue;
+
+  int jobs_per_tile_col;
+
+  RowMTInfo row_mt_info[MAX_NUM_TILE_COLS];
+  int thread_id_to_tile_id[MAX_NUM_THREADS];  // Mapping of threads to tiles
+} MultiThreadHandle;
+
 typedef struct RD_COUNTS {
   vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
   int64_t comp_pred_diff[REFERENCE_MODES];
@@ -629,6 +659,10 @@
 
   int keep_level_stats;
   Vp9LevelInfo level_info;
+  MultiThreadHandle multi_thread_ctxt;
+  void (*row_mt_sync_read_ptr)(VP9RowMTSync *const, int, int);
+  void (*row_mt_sync_write_ptr)(VP9RowMTSync *const, int, int, const int);
+  int new_mt;
 
   // Previous Partition Info
   BLOCK_SIZE *prev_partition;
@@ -806,6 +840,18 @@
 
 static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) {
   return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
+}
+
+static INLINE int get_num_vert_units(TileInfo tile, int shift) {
+  int num_vert_units =
+      (tile.mi_row_end - tile.mi_row_start + (1 << shift) - 1) >> shift;
+  return num_vert_units;
+}
+
+static INLINE int get_num_cols(TileInfo tile, int shift) {
+  int num_cols =
+      (tile.mi_col_end - tile.mi_col_start + (1 << shift) - 1) >> shift;
+  return num_cols;
 }
 
 static INLINE int get_level_index(VP9_LEVEL level) {
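
get_num_vert_units() and get_num_cols() convert a tile's extent in mi units (8x8 pixels) into a rounded-up count of larger units; with shift == 1 the unit is the 16x16 macroblock the first pass works on, giving the number of MB rows (row jobs) and MB columns per tile. A quick numeric check of that rounding, under the same assumption:

    #include <assert.h>

    /* Same round-up division as the two helpers above, written standalone. */
    static int num_units(int mi_start, int mi_end, int shift) {
      return (mi_end - mi_start + (1 << shift) - 1) >> shift;
    }

    int main(void) {
      /* A 1280x720 frame spans 160 mi columns and 90 mi rows. */
      assert(num_units(0, 160, 1) == 80); /* 16x16 MB columns */
      assert(num_units(0, 90, 1) == 45);  /* 16x16 MB rows, i.e. row jobs per tile column */
      return 0;
    }
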
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -11,6 +11,8 @@
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_multi_thread.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
 static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
@@ -64,15 +66,11 @@
   return (1 << log2_tile_cols);
 }
 
-void vp9_encode_tiles_mt(VP9_COMP *cpi) {
+static void create_enc_workers(VP9_COMP *cpi, int num_workers) {
   VP9_COMMON *const cm = &cpi->common;
-  const int tile_cols = 1 << cm->log2_tile_cols;
   const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
-  const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
   int i;
 
-  vp9_init_tile_data(cpi);
-
   // Only run once to create threads and allocate thread data.
   if (cpi->num_workers == 0) {
     int allocated_workers = num_workers;
@@ -123,20 +121,58 @@
         thread_data->cpi = cpi;
         thread_data->td = &cpi->td;
       }
-
       winterface->sync(worker);
     }
   }
+}
 
+static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2,
+                               int num_workers) {
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  int i;
+
   for (i = 0; i < num_workers; i++) {
     VPxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *thread_data;
-
-    worker->hook = (VPxWorkerHook)enc_worker_hook;
+    worker->hook = (VPxWorkerHook)hook;
     worker->data1 = &cpi->tile_thr_data[i];
-    worker->data2 = NULL;
-    thread_data = (EncWorkerData *)worker->data1;
+    worker->data2 = data2;
+  }
 
+  // Encode a frame
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
+    if (i == cpi->num_workers - 1)
+      winterface->execute(worker);
+    else
+      winterface->launch(worker);
+  }
+
+  // Encoding ends.
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    winterface->sync(worker);
+  }
+}
+
+void vp9_encode_tiles_mt(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
+  int i;
+
+  vp9_init_tile_data(cpi);
+
+  create_enc_workers(cpi, num_workers);
+
+  for (i = 0; i < num_workers; i++) {
+    EncWorkerData *thread_data;
+    thread_data = &cpi->tile_thr_data[i];
+
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
       thread_data->td->mb = cpi->td.mb;
@@ -165,34 +201,266 @@
     }
   }
 
-  // Encode a frame
+  launch_enc_workers(cpi, (VPxWorkerHook)enc_worker_hook, NULL, num_workers);
+
   for (i = 0; i < num_workers; i++) {
     VPxWorker *const worker = &cpi->workers[i];
     EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
 
-    // Set the starting tile for each thread.
-    thread_data->start = i;
+    // Accumulate counters.
+    if (i < cpi->num_workers - 1) {
+      vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0);
+      accumulate_rd_opt(&cpi->td, thread_data->td);
+    }
+  }
+}
 
-    if (i == cpi->num_workers - 1)
-      winterface->execute(worker);
-    else
-      winterface->launch(worker);
+static void accumulate_fp_tile_stat(TileDataEnc *tile_data,
+                                    TileDataEnc *tile_data_t) {
+  tile_data->fp_data.intra_factor += tile_data_t->fp_data.intra_factor;
+  tile_data->fp_data.brightness_factor +=
+      tile_data_t->fp_data.brightness_factor;
+  tile_data->fp_data.coded_error += tile_data_t->fp_data.coded_error;
+  tile_data->fp_data.sr_coded_error += tile_data_t->fp_data.sr_coded_error;
+  tile_data->fp_data.frame_noise_energy +=
+      tile_data_t->fp_data.frame_noise_energy;
+  tile_data->fp_data.intra_error += tile_data_t->fp_data.intra_error;
+  tile_data->fp_data.intercount += tile_data_t->fp_data.intercount;
+  tile_data->fp_data.second_ref_count += tile_data_t->fp_data.second_ref_count;
+  tile_data->fp_data.neutral_count += tile_data_t->fp_data.neutral_count;
+  tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count;
+  tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount;
+  tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr;
+  tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs;
+  tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc;
+  tile_data->fp_data.sum_mvc_abs += tile_data_t->fp_data.sum_mvc_abs;
+  tile_data->fp_data.sum_mvrs += tile_data_t->fp_data.sum_mvrs;
+  tile_data->fp_data.sum_mvcs += tile_data_t->fp_data.sum_mvcs;
+  tile_data->fp_data.sum_in_vectors += tile_data_t->fp_data.sum_in_vectors;
+  tile_data->fp_data.intra_smooth_count +=
+      tile_data_t->fp_data.intra_smooth_count;
+  tile_data->fp_data.image_data_start_row =
+      VPXMIN(tile_data->fp_data.image_data_start_row,
+             tile_data_t->fp_data.image_data_start_row) == INVALID_ROW
+          ? VPXMAX(tile_data->fp_data.image_data_start_row,
+                   tile_data_t->fp_data.image_data_start_row)
+          : VPXMIN(tile_data->fp_data.image_data_start_row,
+                   tile_data_t->fp_data.image_data_start_row);
+}
+
+// Allocate memory for row synchronization
+void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm,
+                               int rows) {
+  row_mt_sync->rows = rows;
+#if CONFIG_MULTITHREAD
+  {
+    int i;
+
+    CHECK_MEM_ERROR(cm, row_mt_sync->mutex_,
+                    vpx_malloc(sizeof(*row_mt_sync->mutex_) * rows));
+    if (row_mt_sync->mutex_) {
+      for (i = 0; i < rows; ++i) {
+        pthread_mutex_init(&row_mt_sync->mutex_[i], NULL);
+      }
+    }
+
+    CHECK_MEM_ERROR(cm, row_mt_sync->cond_,
+                    vpx_malloc(sizeof(*row_mt_sync->cond_) * rows));
+    if (row_mt_sync->cond_) {
+      for (i = 0; i < rows; ++i) {
+        pthread_cond_init(&row_mt_sync->cond_[i], NULL);
+      }
+    }
   }
+#endif  // CONFIG_MULTITHREAD
 
-  // Encoding ends.
-  for (i = 0; i < num_workers; i++) {
-    VPxWorker *const worker = &cpi->workers[i];
-    winterface->sync(worker);
+  CHECK_MEM_ERROR(cm, row_mt_sync->cur_col,
+                  vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows));
+
+  // Set up nsync.
+  row_mt_sync->sync_range = 1;
+}
+
+// Deallocate row based multi-threading synchronization related mutex and data
+void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync) {
+  if (row_mt_sync != NULL) {
+#if CONFIG_MULTITHREAD
+    int i;
+
+    if (row_mt_sync->mutex_ != NULL) {
+      for (i = 0; i < row_mt_sync->rows; ++i) {
+        pthread_mutex_destroy(&row_mt_sync->mutex_[i]);
+      }
+      vpx_free(row_mt_sync->mutex_);
+    }
+    if (row_mt_sync->cond_ != NULL) {
+      for (i = 0; i < row_mt_sync->rows; ++i) {
+        pthread_cond_destroy(&row_mt_sync->cond_[i]);
+      }
+      vpx_free(row_mt_sync->cond_);
+    }
+#endif  // CONFIG_MULTITHREAD
+    vpx_free(row_mt_sync->cur_col);
+    // clear the structure as the source of this call may be dynamic change
+    // in tiles in which case this call will be followed by an _alloc()
+    // which may fail.
+    vp9_zero(*row_mt_sync);
   }
+}
 
+void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+  const int nsync = row_mt_sync->sync_range;
+
+  if (r && !(c & (nsync - 1))) {
+    pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1];
+    pthread_mutex_lock(mutex);
+
+    while (c > row_mt_sync->cur_col[r - 1] - nsync) {
+      pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex);
+    }
+    pthread_mutex_unlock(mutex);
+  }
+#else
+  (void)row_mt_sync;
+  (void)r;
+  (void)c;
+#endif  // CONFIG_MULTITHREAD
+}
+
+void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c) {
+  (void)row_mt_sync;
+  (void)r;
+  (void)c;
+  return;
+}
+
+void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c,
+                           const int cols) {
+#if CONFIG_MULTITHREAD
+  const int nsync = row_mt_sync->sync_range;
+  int cur;
+  // Only signal when there are enough filtered SB for next row to run.
+  int sig = 1;
+
+  if (c < cols - 1) {
+    cur = c;
+    if (c % nsync) sig = 0;
+  } else {
+    cur = cols + nsync;
+  }
+
+  if (sig) {
+    pthread_mutex_lock(&row_mt_sync->mutex_[r]);
+
+    row_mt_sync->cur_col[r] = cur;
+
+    pthread_cond_signal(&row_mt_sync->cond_[r]);
+    pthread_mutex_unlock(&row_mt_sync->mutex_[r]);
+  }
+#else
+  (void)row_mt_sync;
+  (void)r;
+  (void)c;
+  (void)cols;
+#endif  // CONFIG_MULTITHREAD
+}
+
+void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
+                                 const int cols) {
+  (void)row_mt_sync;
+  (void)r;
+  (void)c;
+  (void)cols;
+  return;
+}
+
+static int first_pass_worker_hook(EncWorkerData *const thread_data,
+                                  MultiThreadHandle *multi_thread_ctxt) {
+  VP9_COMP *const cpi = thread_data->cpi;
+  const VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  int tile_row, tile_col;
+  TileDataEnc *this_tile;
+  int end_of_frame;
+  int thread_id = thread_data->thread_id;
+  int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
+  JobNode *proc_job = NULL;
+  FIRSTPASS_DATA fp_acc_data;
+  MV zero_mv = { 0, 0 };
+  MV best_ref_mv;
+  int mb_row;
+
+  end_of_frame = 0;
+  while (0 == end_of_frame) {
+    // Get the next job in the queue
+    proc_job =
+        (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id);
+    if (NULL == proc_job) {
+      // Query for the status of other tiles
+      end_of_frame = vp9_get_tiles_proc_status(
+          multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id,
+          tile_cols);
+    } else {
+      tile_col = proc_job->tile_col_id;
+      tile_row = proc_job->tile_row_id;
+
+      this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+      mb_row = proc_job->vert_unit_row_num;
+
+      best_ref_mv = zero_mv;
+      vp9_zero(fp_acc_data);
+      fp_acc_data.image_data_start_row = INVALID_ROW;
+      vp9_first_pass_encode_tile_mb_row(cpi, thread_data->td, &fp_acc_data,
+                                        this_tile, &best_ref_mv, mb_row);
+    }
+  }
+  return 0;
+}
+
+void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  TileDataEnc *first_tile_col;
+  int num_workers = VPXMAX(cpi->oxcf.max_threads, 1);
+  int i;
+
+  if (multi_thread_ctxt->allocated_tile_cols < tile_cols ||
+      multi_thread_ctxt->allocated_tile_rows < tile_rows ||
+      multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) {
+    vp9_row_mt_mem_dealloc(cpi);
+    vp9_init_tile_data(cpi);
+    vp9_row_mt_mem_alloc(cpi);
+  } else {
+    vp9_init_tile_data(cpi);
+  }
+
+  create_enc_workers(cpi, num_workers);
+
+  vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers);
+
+  vp9_prepare_job_queue(cpi, FIRST_PASS_JOB);
+
+  vp9_multi_thread_tile_init(cpi);
+
   for (i = 0; i < num_workers; i++) {
-    VPxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+    EncWorkerData *thread_data;
+    thread_data = &cpi->tile_thr_data[i];
 
-    // Accumulate counters.
-    if (i < cpi->num_workers - 1) {
-      vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0);
-      accumulate_rd_opt(&cpi->td, thread_data->td);
+    // Before encoding a frame, copy the thread data from cpi.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
     }
+  }
+
+  launch_enc_workers(cpi, (VPxWorkerHook)first_pass_worker_hook,
+                     multi_thread_ctxt, num_workers);
+
+  first_tile_col = &cpi->tile_data[0];
+  for (i = 1; i < tile_cols; i++) {
+    TileDataEnc *this_tile = &cpi->tile_data[i];
+    accumulate_fp_tile_stat(first_tile_col, this_tile);
   }
 }
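
The sync helpers above implement a wavefront: with sync_range == 1, vp9_row_mt_sync_read() blocks row r at column c until row r-1 has published progress past that column, and vp9_row_mt_sync_write() records the finishing column and signals the waiter, pushing cur_col past the end once the row completes. A minimal sketch of the per-row loop a worker runs under this protocol; process_mb() is a placeholder and the exact column offsets used by the first pass differ slightly (see vp9_first_pass_encode_tile_mb_row()):

    #include "vp9/encoder/vp9_ethread.h"

    static void encode_one_row(VP9RowMTSync *sync, int r, int cols) {
      int c;
      for (c = 0; c < cols; ++c) {
        vp9_row_mt_sync_read(sync, r, c);        /* wait on the row above */
        /* process_mb(r, c); */
        vp9_row_mt_sync_write(sync, r, c, cols); /* publish progress to the row below */
      }
    }
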
--- a/vp9/encoder/vp9_ethread.h
+++ b/vp9/encoder/vp9_ethread.h
@@ -15,6 +15,10 @@
 extern "C" {
 #endif
 
+#define MAX_NUM_TILE_COLS (1 << 6)
+#define MAX_NUM_TILE_ROWS 4
+#define MAX_NUM_THREADS 80
+
 struct VP9_COMP;
 struct ThreadData;
 
@@ -22,9 +26,40 @@
   struct VP9_COMP *cpi;
   struct ThreadData *td;
   int start;
+  int thread_id;
+  int tile_completion_status[MAX_NUM_TILE_COLS];
 } EncWorkerData;
 
+// Encoder row synchronization
+typedef struct VP9RowMTSyncData {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex_;
+  pthread_cond_t *cond_;
+#endif
+  // Allocate memory to store the sb/mb block index in each row.
+  int *cur_col;
+  int sync_range;
+  int rows;
+} VP9RowMTSync;
+
 void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
+
+void vp9_encode_fp_row_mt(struct VP9_COMP *cpi);
+
+void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c);
+void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c,
+                           const int cols);
+
+void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c);
+void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
+                                 const int cols);
+
+// Allocate memory for row based multi-threading synchronization.
+void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, struct VP9Common *cm,
+                               int rows);
+
+// Deallocate row based multi-threading synchronization related mutex and data.
+void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync);
 
 #ifdef __cplusplus
 }  // extern "C"
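
vp9_encoder.h above also adds the row_mt_sync_read_ptr / row_mt_sync_write_ptr function pointers so the macroblock loop stays free of threading branches: the _dummy variants declared here are no-ops for the single-threaded path. A plausible selection helper, assumed rather than shown in this excerpt:

    #include "vp9/encoder/vp9_encoder.h"

    /* Assumed helper: pick real or no-op sync routines based on cpi->new_mt. */
    static void select_row_mt_sync_fns(VP9_COMP *cpi) {
      if (cpi->new_mt) {
        cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read;
        cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write;
      } else {
        cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy;
        cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy;
      }
    }
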
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -31,6 +31,7 @@
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mcomp.h"
@@ -646,37 +647,150 @@
   return block_noise << 2;  // Scale << 2 to account for sampling.
 }
 
-#define INVALID_ROW -1
-void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
+#if ENABLE_MT_BIT_MATCH
+static void accumulate_floating_point_stats(VP9_COMP *cpi,
+                                            TileDataEnc *first_tile_col) {
+  VP9_COMMON *const cm = &cpi->common;
   int mb_row, mb_col;
-  MACROBLOCK *const x = &cpi->td.mb;
+  first_tile_col->fp_data.intra_factor = 0;
+  first_tile_col->fp_data.brightness_factor = 0;
+  first_tile_col->fp_data.neutral_count = 0;
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+      const int mb_index = mb_row * cm->mb_cols + mb_col;
+      first_tile_col->fp_data.intra_factor +=
+          cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor;
+      first_tile_col->fp_data.brightness_factor +=
+          cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor;
+      first_tile_col->fp_data.neutral_count +=
+          cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count;
+    }
+  }
+}
+#endif
+
+static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
+                                 FIRSTPASS_DATA *fp_acc_data) {
   VP9_COMMON *const cm = &cpi->common;
+  // The minimum error here insures some bit allocation to frames even
+  // in static regions. The allocation per MB declines for larger formats
+  // where the typical "real" energy per MB also falls.
+  // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+  // number of mbs is proportional to the image area.
+  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+                                                             : cpi->common.MBs;
+  const double min_err = 200 * sqrt(num_mbs);
+
+  // Clamp the image start to rows/2. This number of rows is discarded top
+  // and bottom as dead data so rows / 2 means the frame is blank.
+  if ((fp_acc_data->image_data_start_row > cm->mb_rows / 2) ||
+      (fp_acc_data->image_data_start_row == INVALID_ROW)) {
+    fp_acc_data->image_data_start_row = cm->mb_rows / 2;
+  }
+  // Exclude any image dead zone
+  if (fp_acc_data->image_data_start_row > 0) {
+    fp_acc_data->intra_skip_count =
+        VPXMAX(0, fp_acc_data->intra_skip_count -
+                      (fp_acc_data->image_data_start_row * cm->mb_cols * 2));
+  }
+
+  fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs;
+  fp_acc_data->brightness_factor =
+      fp_acc_data->brightness_factor / (double)num_mbs;
+  fps->weight = fp_acc_data->intra_factor * fp_acc_data->brightness_factor;
+
+  fps->frame = cm->current_video_frame;
+  fps->spatial_layer_id = cpi->svc.spatial_layer_id;
+  fps->coded_error = (double)(fp_acc_data->coded_error >> 8) + min_err;
+  fps->sr_coded_error = (double)(fp_acc_data->sr_coded_error >> 8) + min_err;
+  fps->intra_error = (double)(fp_acc_data->intra_error >> 8) + min_err;
+  fps->frame_noise_energy =
+      (double)(fp_acc_data->frame_noise_energy) / (double)num_mbs;
+  fps->count = 1.0;
+  fps->pcnt_inter = (double)(fp_acc_data->intercount) / num_mbs;
+  fps->pcnt_second_ref = (double)(fp_acc_data->second_ref_count) / num_mbs;
+  fps->pcnt_neutral = (double)(fp_acc_data->neutral_count) / num_mbs;
+  fps->intra_skip_pct = (double)(fp_acc_data->intra_skip_count) / num_mbs;
+  fps->intra_smooth_pct = (double)(fp_acc_data->intra_smooth_count) / num_mbs;
+  fps->inactive_zone_rows = (double)(fp_acc_data->image_data_start_row);
+  // Currently set to 0 as most issues relate to letter boxing.
+  fps->inactive_zone_cols = (double)0;
+
+  if (fp_acc_data->mvcount > 0) {
+    fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount;
+    fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount;
+    fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount;
+    fps->mvc_abs = (double)(fp_acc_data->sum_mvc_abs) / fp_acc_data->mvcount;
+    fps->MVrv = ((double)(fp_acc_data->sum_mvrs) -
+                 ((double)(fp_acc_data->sum_mvr) * (fp_acc_data->sum_mvr) /
+                  fp_acc_data->mvcount)) /
+                fp_acc_data->mvcount;
+    fps->MVcv = ((double)(fp_acc_data->sum_mvcs) -
+                 ((double)(fp_acc_data->sum_mvc) * (fp_acc_data->sum_mvc) /
+                  fp_acc_data->mvcount)) /
+                fp_acc_data->mvcount;
+    fps->mv_in_out_count =
+        (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2);
+    fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs;
+  } else {
+    fps->MVr = 0.0;
+    fps->mvr_abs = 0.0;
+    fps->MVc = 0.0;
+    fps->mvc_abs = 0.0;
+    fps->MVrv = 0.0;
+    fps->MVcv = 0.0;
+    fps->mv_in_out_count = 0.0;
+    fps->pcnt_motion = 0.0;
+  }
+}
+
+static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile,
+                                      FIRSTPASS_DATA *fp_acc_data) {
+  this_tile->fp_data.intra_factor += fp_acc_data->intra_factor;
+  this_tile->fp_data.brightness_factor += fp_acc_data->brightness_factor;
+  this_tile->fp_data.coded_error += fp_acc_data->coded_error;
+  this_tile->fp_data.sr_coded_error += fp_acc_data->sr_coded_error;
+  this_tile->fp_data.frame_noise_energy += fp_acc_data->frame_noise_energy;
+  this_tile->fp_data.intra_error += fp_acc_data->intra_error;
+  this_tile->fp_data.intercount += fp_acc_data->intercount;
+  this_tile->fp_data.second_ref_count += fp_acc_data->second_ref_count;
+  this_tile->fp_data.neutral_count += fp_acc_data->neutral_count;
+  this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count;
+  this_tile->fp_data.mvcount += fp_acc_data->mvcount;
+  this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr;
+  this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs;
+  this_tile->fp_data.sum_mvc += fp_acc_data->sum_mvc;
+  this_tile->fp_data.sum_mvc_abs += fp_acc_data->sum_mvc_abs;
+  this_tile->fp_data.sum_mvrs += fp_acc_data->sum_mvrs;
+  this_tile->fp_data.sum_mvcs += fp_acc_data->sum_mvcs;
+  this_tile->fp_data.sum_in_vectors += fp_acc_data->sum_in_vectors;
+  this_tile->fp_data.intra_smooth_count += fp_acc_data->intra_smooth_count;
+  this_tile->fp_data.image_data_start_row =
+      VPXMIN(this_tile->fp_data.image_data_start_row,
+             fp_acc_data->image_data_start_row) == INVALID_ROW
+          ? VPXMAX(this_tile->fp_data.image_data_start_row,
+                   fp_acc_data->image_data_start_row)
+          : VPXMIN(this_tile->fp_data.image_data_start_row,
+                   fp_acc_data->image_data_start_row);
+}
+
+void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
+                                       FIRSTPASS_DATA *fp_acc_data,
+                                       TileDataEnc *tile_data, MV *best_ref_mv,
+                                       int mb_row) {
+  int mb_col;
+  MACROBLOCK *const x = &td->mb;
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  TileInfo tile;
+  TileInfo tile = tile_data->tile_info;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
-  const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
-  int i;
+  const PICK_MODE_CONTEXT *ctx = &td->pc_root->none;
+  int i, c;
+  int num_mb_cols = get_num_cols(tile_data->tile_info, 1);
 
   int recon_yoffset, recon_uvoffset;
-  int64_t intra_error = 0;
-  int64_t coded_error = 0;
-  int64_t sr_coded_error = 0;
-  int64_t frame_noise_energy = 0;
-
-  int sum_mvr = 0, sum_mvc = 0;
-  int sum_mvr_abs = 0, sum_mvc_abs = 0;
-  int64_t sum_mvrs = 0, sum_mvcs = 0;
-  int mvcount = 0;
-  int intercount = 0;
-  int second_ref_count = 0;
   const int intrapenalty = INTRA_MODE_PENALTY;
-  double neutral_count;
-  int intra_skip_count = 0;
-  int intra_smooth_count = 0;
-  int image_data_start_row = INVALID_ROW;
-  int sum_in_vectors = 0;
-  TWO_PASS *twopass = &cpi->twopass;
   const MV zero_mv = { 0, 0 };
   int recon_y_stride, recon_uv_stride, uv_mb_height;
 
@@ -688,9 +802,6 @@
   LAYER_CONTEXT *const lc =
       is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id]
                            : NULL;
-  double intra_factor;
-  double brightness_factor;
-  BufferPool *const pool = cm->buffer_pool;
   MODE_INFO mi_above, mi_left;
 
   // First pass code requires valid last and new frame buffers.
@@ -697,41 +808,7 @@
   assert(new_yv12 != NULL);
   assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL));
 
-#if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs);
-  }
-#endif
-
-  vpx_clear_system_state();
-
-  intra_factor = 0.0;
-  brightness_factor = 0.0;
-  neutral_count = 0.0;
-
-  set_first_pass_params(cpi);
-  vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
-
   if (lc != NULL) {
-    twopass = &lc->twopass;
-
-    cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
-    cpi->ref_frame_flags = VP9_LAST_FLAG;
-
-    if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id <
-        REF_FRAMES) {
-      cpi->gld_fb_idx =
-          cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id;
-      cpi->ref_frame_flags |= VP9_GOLD_FLAG;
-      cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0);
-    } else {
-      cpi->refresh_golden_frame = 0;
-    }
-
-    if (lc->current_video_frame_in_layer == 0) cpi->ref_frame_flags = 0;
-
-    vp9_scale_references(cpi);
-
     // Use either last frame or alt frame for motion search.
     if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
       first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
@@ -747,29 +824,12 @@
     } else {
       gld_yv12 = NULL;
     }
-
-    set_ref_ptrs(cm, xd,
-                 (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE,
-                 (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
-
-    cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
-                                        &cpi->scaled_source, 0);
   }
 
-  vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+  xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) +
+           (tile.mi_col_start >> 1);
+  xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + (tile.mi_col_start >> 1);
 
-  vp9_setup_src_planes(x, cpi->Source, 0, 0);
-  vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
-
-  if (!frame_is_intra_only(cm)) {
-    vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
-  }
-
-  xd->mi = cm->mi_grid_visible;
-  xd->mi[0] = cm->mi;
-
-  vp9_frame_init_quantizer(cpi);
-
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][1];
     p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
@@ -776,276 +836,363 @@
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
     p[i].eobs = ctx->eobs_pbuf[i][1];
   }
-  x->skip_recode = 0;
 
-  vp9_init_mv_probs(cm);
-  vp9_initialize_rd_consts(cpi);
-
-  // Tiling is ignored in the first pass.
-  vp9_tile_init(&tile, cm, 0, 0);
-
   recon_y_stride = new_yv12->y_stride;
   recon_uv_stride = new_yv12->uv_stride;
   uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
 
-  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
-    MV best_ref_mv = { 0, 0 };
+  // Reset above block coeffs.
+  recon_yoffset =
+      (mb_row * recon_y_stride * 16) + (tile.mi_col_start >> 1) * 16;
+  recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height) +
+                   (tile.mi_col_start >> 1) * uv_mb_height;
 
-    // Reset above block coeffs.
-    recon_yoffset = (mb_row * recon_y_stride * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
+  // Set up limit values for motion vectors to prevent them extending
+  // outside the UMV borders.
+  x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
+  x->mv_limits.row_max =
+      ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
 
-    // Set up limit values for motion vectors to prevent them extending
-    // outside the UMV borders.
-    x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
-    x->mv_limits.row_max =
-        ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
+  for (mb_col = tile.mi_col_start >> 1, c = 0; mb_col < (tile.mi_col_end >> 1);
+       ++mb_col, c++) {
+    int this_error;
+    int this_intra_error;
+    const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+    const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+    double log_intra;
+    int level_sample;
+#if ENABLE_MT_BIT_MATCH
+    const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
 
-    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
-      int this_error;
-      int this_intra_error;
-      const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
-      const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
-      double log_intra;
-      int level_sample;
-
 #if CONFIG_FP_MB_STATS
-      const int mb_index = mb_row * cm->mb_cols + mb_col;
+    const int mb_index = mb_row * cm->mb_cols + mb_col;
 #endif
 
-      vpx_clear_system_state();
+    (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c - 1);
 
-      xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
-      xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
-      xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
-      xd->mi[0]->sb_type = bsize;
-      xd->mi[0]->ref_frame[0] = INTRA_FRAME;
-      set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize],
-                     mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
-                     cm->mi_rows, cm->mi_cols);
-      // Are edges available for intra prediction?
-      // Since the firstpass does not populate the mi_grid_visible,
-      // above_mi/left_mi must be overwritten with a nonzero value when edges
-      // are available.  Required by vp9_predict_intra_block().
-      xd->above_mi = (mb_row != 0) ? &mi_above : NULL;
-      xd->left_mi = (mb_col > tile.mi_col_start) ? &mi_left : NULL;
+    // Adjust to the next column of MBs.
+    x->plane[0].src.buf = cpi->Source->y_buffer +
+                          mb_row * 16 * x->plane[0].src.stride + mb_col * 16;
+    x->plane[1].src.buf = cpi->Source->u_buffer +
+                          mb_row * uv_mb_height * x->plane[1].src.stride +
+                          mb_col * uv_mb_height;
+    x->plane[2].src.buf = cpi->Source->v_buffer +
+                          mb_row * uv_mb_height * x->plane[1].src.stride +
+                          mb_col * uv_mb_height;
 
-      // Do intra 16x16 prediction.
-      x->skip_encode = 0;
-      xd->mi[0]->mode = DC_PRED;
-      xd->mi[0]->tx_size =
-          use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
+    vpx_clear_system_state();
 
-      // Set the 16x16 src_diff block to zero, which ensures correct this_error
-      // calculation for block sizes smaller than 16x16.
-      vp9_zero_array(x->plane[0].src_diff, 256);
-      vp9_encode_intra_block_plane(x, bsize, 0, 0);
-      this_error = vpx_get_mb_ss(x->plane[0].src_diff);
-      this_intra_error = this_error;
+    xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
+    xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
+    xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
+    xd->mi[0]->sb_type = bsize;
+    xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+    set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize],
+                   mb_col << 1, num_8x8_blocks_wide_lookup[bsize], cm->mi_rows,
+                   cm->mi_cols);
+    // Are edges available for intra prediction?
+    // Since the firstpass does not populate the mi_grid_visible,
+    // above_mi/left_mi must be overwritten with a nonzero value when edges
+    // are available.  Required by vp9_predict_intra_block().
+    xd->above_mi = (mb_row != 0) ? &mi_above : NULL;
+    xd->left_mi = ((mb_col << 1) > tile.mi_col_start) ? &mi_left : NULL;
 
-      // Keep a record of blocks that have very low intra error residual
-      // (i.e. are in effect completely flat and untextured in the intra
-      // domain). In natural videos this is uncommon, but it is much more
-      // common in animations, graphics and screen content, so may be used
-      // as a signal to detect these types of content.
-      if (this_error < get_ul_intra_threshold(cm)) {
-        ++intra_skip_count;
-      } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
-        image_data_start_row = mb_row;
-      }
+    // Do intra 16x16 prediction.
+    x->skip_encode = 0;
+    x->fp_src_pred = 0;
+    // Do intra prediction based on source pixels for tile boundaries
+    if ((mb_col == (tile.mi_col_start >> 1)) && mb_col != 0) {
+      xd->left_mi = &mi_left;
+      x->fp_src_pred = 1;
+    }
+    xd->mi[0]->mode = DC_PRED;
+    xd->mi[0]->tx_size =
+        use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
+    // Fix - zero the 16x16 block first. This ensures correct this_error for
+    // block sizes smaller than 16x16.
+    vp9_zero_array(x->plane[0].src_diff, 256);
+    vp9_encode_intra_block_plane(x, bsize, 0, 0);
+    this_error = vpx_get_mb_ss(x->plane[0].src_diff);
+    this_intra_error = this_error;
 
-      // Blocks that are mainly smooth in the intra domain.
-      // Some special accounting for CQ but also these are better for testing
-      // noise levels.
-      if (this_error < get_smooth_intra_threshold(cm)) {
-        ++intra_smooth_count;
-      }
+    // Keep a record of blocks that have very low intra error residual
+    // (i.e. are in effect completely flat and untextured in the intra
+    // domain). In natural videos this is uncommon, but it is much more
+    // common in animations, graphics and screen content, so may be used
+    // as a signal to detect these types of content.
+    if (this_error < get_ul_intra_threshold(cm)) {
+      ++(fp_acc_data->intra_skip_count);
+    } else if ((mb_col > 0) &&
+               (fp_acc_data->image_data_start_row == INVALID_ROW)) {
+      fp_acc_data->image_data_start_row = mb_row;
+    }
 
-      // Special case noise measurement for first frame.
-      if (cm->current_video_frame == 0) {
-        if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
-          frame_noise_energy += fp_estimate_block_noise(x, bsize);
-        } else {
-          frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
-        }
+    // Blocks that are mainly smooth in the intra domain.
+    // Some special accounting for CQ but also these are better for testing
+    // noise levels.
+    if (this_error < get_smooth_intra_threshold(cm)) {
+      ++(fp_acc_data->intra_smooth_count);
+    }
+
+    // Special case noise measurement for first frame.
+    if (cm->current_video_frame == 0) {
+      if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
+        fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+      } else {
+        fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
       }
+    }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (cm->use_highbitdepth) {
-        switch (cm->bit_depth) {
-          case VPX_BITS_8: break;
-          case VPX_BITS_10: this_error >>= 4; break;
-          case VPX_BITS_12: this_error >>= 8; break;
-          default:
-            assert(0 &&
-                   "cm->bit_depth should be VPX_BITS_8, "
-                   "VPX_BITS_10 or VPX_BITS_12");
-            return;
-        }
+    if (cm->use_highbitdepth) {
+      switch (cm->bit_depth) {
+        case VPX_BITS_8: break;
+        case VPX_BITS_10: this_error >>= 4; break;
+        case VPX_BITS_12: this_error >>= 8; break;
+        default:
+          assert(0 &&
+                 "cm->bit_depth should be VPX_BITS_8, "
+                 "VPX_BITS_10 or VPX_BITS_12");
+          return;
       }
+    }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-      vpx_clear_system_state();
-      log_intra = log(this_error + 1.0);
-      if (log_intra < 10.0)
-        intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
-      else
-        intra_factor += 1.0;
+    vpx_clear_system_state();
+    log_intra = log(this_error + 1.0);
+    if (log_intra < 10.0) {
+      fp_acc_data->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+#if ENABLE_MT_BIT_MATCH
+      cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor =
+          1.0 + ((10.0 - log_intra) * 0.05);
+#endif
+    } else {
+      fp_acc_data->intra_factor += 1.0;
+#if ENABLE_MT_BIT_MATCH
+      cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = 1.0;
+#endif
+    }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (cm->use_highbitdepth)
-        level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
-      else
-        level_sample = x->plane[0].src.buf[0];
-#else
+    if (cm->use_highbitdepth)
+      level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+    else
       level_sample = x->plane[0].src.buf[0];
+#else
+    level_sample = x->plane[0].src.buf[0];
 #endif
-      if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
-        brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
-      else
-        brightness_factor += 1.0;
+    if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) {
+      fp_acc_data->brightness_factor +=
+          1.0 + (0.01 * (DARK_THRESH - level_sample));
+#if ENABLE_MT_BIT_MATCH
+      cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor =
+          1.0 + (0.01 * (DARK_THRESH - level_sample));
+#endif
+    } else {
+      fp_acc_data->brightness_factor += 1.0;
+#if ENABLE_MT_BIT_MATCH
+      cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = 1.0;
+#endif
+    }
 
-      // Intrapenalty below deals with situations where the intra and inter
-      // error scores are very low (e.g. a plain black frame).
-      // We do not have special cases in first pass for 0,0 and nearest etc so
-      // all inter modes carry an overhead cost estimate for the mv.
-      // When the error score is very low this causes us to pick all or lots of
-      // INTRA modes and throw lots of key frames.
-      // This penalty adds a cost matching that of a 0,0 mv to the intra case.
-      this_error += intrapenalty;
+    // Intrapenalty below deals with situations where the intra and inter
+    // error scores are very low (e.g. a plain black frame).
+    // We do not have special cases in first pass for 0,0 and nearest etc so
+    // all inter modes carry an overhead cost estimate for the mv.
+    // When the error score is very low this causes us to pick all or lots of
+    // INTRA modes and throw lots of key frames.
+    // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+    this_error += intrapenalty;
 
-      // Accumulate the intra error.
-      intra_error += (int64_t)this_error;
+    // Accumulate the intra error.
+    fp_acc_data->intra_error += (int64_t)this_error;
 
 #if CONFIG_FP_MB_STATS
-      if (cpi->use_fp_mb_stats) {
-        // initialization
-        cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-      }
+    if (cpi->use_fp_mb_stats) {
+      // initialization
+      cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+    }
 #endif
 
-      // Set up limit values for motion vectors to prevent them extending
-      // outside the UMV borders.
-      x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
-      x->mv_limits.col_max =
-          ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
+    // Set up limit values for motion vectors to prevent them extending
+    // outside the UMV borders.
+    x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
+    x->mv_limits.col_max =
+        ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
 
-      // Other than for the first frame do a motion search.
-      if ((lc == NULL && cm->current_video_frame > 0) ||
-          (lc != NULL && lc->current_video_frame_in_layer > 0)) {
-        int tmp_err, motion_error, raw_motion_error;
-        // Assume 0,0 motion with no mv overhead.
-        MV mv = { 0, 0 }, tmp_mv = { 0, 0 };
-        struct buf_2d unscaled_last_source_buf_2d;
+    // Other than for the first frame do a motion search.
+    if ((lc == NULL && cm->current_video_frame > 0) ||
+        (lc != NULL && lc->current_video_frame_in_layer > 0)) {
+      int tmp_err, motion_error, raw_motion_error;
+      // Assume 0,0 motion with no mv overhead.
+      MV mv = { 0, 0 }, tmp_mv = { 0, 0 };
+      struct buf_2d unscaled_last_source_buf_2d;
 
-        xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+      xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
 #if CONFIG_VP9_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          motion_error = highbd_get_prediction_error(
-              bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
-        } else {
-          motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                              &xd->plane[0].pre[0]);
-        }
-#else
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        motion_error = highbd_get_prediction_error(
+            bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+      } else {
         motion_error =
             get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+      }
+#else
+      motion_error =
+          get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-        // Compute the motion error of the 0,0 motion using the last source
-        // frame as the reference. Skip the further motion search on
-        // reconstructed frame if this error is small.
-        unscaled_last_source_buf_2d.buf =
-            cpi->unscaled_last_source->y_buffer + recon_yoffset;
-        unscaled_last_source_buf_2d.stride =
-            cpi->unscaled_last_source->y_stride;
+      // Compute the motion error of the 0,0 motion using the last source
+      // frame as the reference. Skip the further motion search on
+      // reconstructed frame if this error is small.
+      unscaled_last_source_buf_2d.buf =
+          cpi->unscaled_last_source->y_buffer + recon_yoffset;
+      unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride;
 #if CONFIG_VP9_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          raw_motion_error = highbd_get_prediction_error(
-              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
-        } else {
-          raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                  &unscaled_last_source_buf_2d);
-        }
-#else
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        raw_motion_error = highbd_get_prediction_error(
+            bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+      } else {
         raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
                                                 &unscaled_last_source_buf_2d);
+      }
+#else
+      raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                              &unscaled_last_source_buf_2d);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-        // TODO(pengchong): Replace the hard-coded threshold
-        if (raw_motion_error > 25 || lc != NULL) {
-          // Test last reference frame using the previous best mv as the
-          // starting point (best reference) for the search.
-          first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
+      // TODO(pengchong): Replace the hard-coded threshold
+      if (raw_motion_error > 25 || lc != NULL) {
+        // Test last reference frame using the previous best mv as the
+        // starting point (best reference) for the search.
+        first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error);
 
-          // If the current best reference mv is not centered on 0,0 then do a
-          // 0,0 based search as well.
-          if (!is_zero_mv(&best_ref_mv)) {
-            tmp_err = INT_MAX;
-            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
+        // If the current best reference mv is not centered on 0,0 then do a
+        // 0,0 based search as well.
+        if (!is_zero_mv(best_ref_mv)) {
+          tmp_err = INT_MAX;
+          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
 
-            if (tmp_err < motion_error) {
-              motion_error = tmp_err;
-              mv = tmp_mv;
-            }
+          if (tmp_err < motion_error) {
+            motion_error = tmp_err;
+            mv = tmp_mv;
           }
+        }
 
-          // Search in an older reference frame.
-          if (((lc == NULL && cm->current_video_frame > 1) ||
-               (lc != NULL && lc->current_video_frame_in_layer > 1)) &&
-              gld_yv12 != NULL) {
-            // Assume 0,0 motion with no mv overhead.
-            int gf_motion_error;
+        // Search in an older reference frame.
+        if (((lc == NULL && cm->current_video_frame > 1) ||
+             (lc != NULL && lc->current_video_frame_in_layer > 1)) &&
+            gld_yv12 != NULL) {
+          // Assume 0,0 motion with no mv overhead.
+          int gf_motion_error;
 
-            xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+          xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
 #if CONFIG_VP9_HIGHBITDEPTH
-            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-              gf_motion_error = highbd_get_prediction_error(
-                  bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
-            } else {
-              gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                     &xd->plane[0].pre[0]);
-            }
-#else
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            gf_motion_error = highbd_get_prediction_error(
+                bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+          } else {
             gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
                                                    &xd->plane[0].pre[0]);
+          }
+#else
+          gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                                 &xd->plane[0].pre[0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
-                                     &gf_motion_error);
+          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error);
 
-            if (gf_motion_error < motion_error && gf_motion_error < this_error)
-              ++second_ref_count;
+          if (gf_motion_error < motion_error && gf_motion_error < this_error)
+            ++(fp_acc_data->second_ref_count);
 
-            // Reset to last frame as reference buffer.
-            xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
-            xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
-            xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+          // Reset to last frame as reference buffer.
+          xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+          xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+          xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
 
-            // In accumulating a score for the older reference frame take the
-            // best of the motion predicted score and the intra coded error
-            // (just as will be done for) accumulation of "coded_error" for
-            // the last frame.
-            if (gf_motion_error < this_error)
-              sr_coded_error += gf_motion_error;
-            else
-              sr_coded_error += this_error;
-          } else {
-            sr_coded_error += motion_error;
-          }
+          // In accumulating a score for the older reference frame take the
+          // best of the motion predicted score and the intra coded error
+          // (just as will be done for) accumulation of "coded_error" for
+          // the last frame.
+          if (gf_motion_error < this_error)
+            fp_acc_data->sr_coded_error += gf_motion_error;
+          else
+            fp_acc_data->sr_coded_error += this_error;
         } else {
-          sr_coded_error += motion_error;
+          fp_acc_data->sr_coded_error += motion_error;
         }
+      } else {
+        fp_acc_data->sr_coded_error += motion_error;
+      }
 
-        // Start by assuming that intra mode is best.
-        best_ref_mv.row = 0;
-        best_ref_mv.col = 0;
+      // Start by assuming that intra mode is best.
+      best_ref_mv->row = 0;
+      best_ref_mv->col = 0;
 
 #if CONFIG_FP_MB_STATS
+      if (cpi->use_fp_mb_stats) {
+        // intra prediction statistics
+        cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+        cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+        cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+        if (this_error > FPMB_ERROR_LARGE_TH) {
+          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
+        } else if (this_error < FPMB_ERROR_SMALL_TH) {
+          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
+        }
+      }
+#endif
+
+      if (motion_error <= this_error) {
+        vpx_clear_system_state();
+
+        // Keep a count of cases where the inter and intra were very close
+        // and very low. This helps with scene cut detection for example in
+        // cropped clips with black bars at the sides or top and bottom.
+        if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
+            (this_error < (2 * intrapenalty))) {
+          fp_acc_data->neutral_count += 1.0;
+#if ENABLE_MT_BIT_MATCH
+          cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = 1.0;
+#endif
+          // Also track cases where the intra is not much worse than the inter
+          // and use this in limiting the GF/arf group length.
+        } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+                   (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+          fp_acc_data->neutral_count +=
+              (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
+#if ENABLE_MT_BIT_MATCH
+          cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count =
+              (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
+#endif
+        }
+
+        mv.row *= 8;
+        mv.col *= 8;
+        this_error = motion_error;
+        xd->mi[0]->mode = NEWMV;
+        xd->mi[0]->mv[0].as_mv = mv;
+        xd->mi[0]->tx_size = TX_4X4;
+        xd->mi[0]->ref_frame[0] = LAST_FRAME;
+        xd->mi[0]->ref_frame[1] = NONE;
+        vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
+        vp9_encode_sby_pass1(x, bsize);
+        fp_acc_data->sum_mvr += mv.row;
+        fp_acc_data->sum_mvr_abs += abs(mv.row);
+        fp_acc_data->sum_mvc += mv.col;
+        fp_acc_data->sum_mvc_abs += abs(mv.col);
+        fp_acc_data->sum_mvrs += mv.row * mv.row;
+        fp_acc_data->sum_mvcs += mv.col * mv.col;
+        ++(fp_acc_data->intercount);
+
+        *best_ref_mv = mv;
+
+#if CONFIG_FP_MB_STATS
         if (cpi->use_fp_mb_stats) {
-          // intra prediction statistics
+          // inter prediction statistics
           cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+          cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
           cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
           if (this_error > FPMB_ERROR_LARGE_TH) {
             cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
@@ -1055,214 +1202,229 @@
         }
 #endif
 
-        if (motion_error <= this_error) {
-          vpx_clear_system_state();
+        if (!is_zero_mv(&mv)) {
+          ++(fp_acc_data->mvcount);
 
-          // Keep a count of cases where the inter and intra were very close
-          // and very low. This helps with scene cut detection for example in
-          // cropped clips with black bars at the sides or top and bottom.
-          if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
-              (this_error < (2 * intrapenalty))) {
-            neutral_count += 1.0;
-            // Also track cases where the intra is not much worse than the inter
-            // and use this in limiting the GF/arf group length.
-          } else if ((this_error > NCOUNT_INTRA_THRESH) &&
-                     (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
-            neutral_count +=
-                (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
-          }
-
-          mv.row *= 8;
-          mv.col *= 8;
-          this_error = motion_error;
-          xd->mi[0]->mode = NEWMV;
-          xd->mi[0]->mv[0].as_mv = mv;
-          xd->mi[0]->tx_size = TX_4X4;
-          xd->mi[0]->ref_frame[0] = LAST_FRAME;
-          xd->mi[0]->ref_frame[1] = NONE;
-          vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
-          vp9_encode_sby_pass1(x, bsize);
-          sum_mvr += mv.row;
-          sum_mvr_abs += abs(mv.row);
-          sum_mvc += mv.col;
-          sum_mvc_abs += abs(mv.col);
-          sum_mvrs += mv.row * mv.row;
-          sum_mvcs += mv.col * mv.col;
-          ++intercount;
-
-          best_ref_mv = mv;
-
 #if CONFIG_FP_MB_STATS
           if (cpi->use_fp_mb_stats) {
-            // inter prediction statistics
-            cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-            cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
-            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
-            if (this_error > FPMB_ERROR_LARGE_TH) {
+            cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_MOTION_ZERO_MASK;
+            // check estimated motion direction
+            if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) {
+              // right direction
               cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                  FPMB_ERROR_LARGE_MASK;
-            } else if (this_error < FPMB_ERROR_SMALL_TH) {
+                  FPMB_MOTION_RIGHT_MASK;
+            } else if (mv.as_mv.row < 0 &&
+                       abs(mv.as_mv.row) >= abs(mv.as_mv.col)) {
+              // up direction
+              cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_UP_MASK;
+            } else if (mv.as_mv.col < 0 &&
+                       abs(mv.as_mv.col) >= abs(mv.as_mv.row)) {
+              // left direction
               cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                  FPMB_ERROR_SMALL_MASK;
+                  FPMB_MOTION_LEFT_MASK;
+            } else {
+              // down direction
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                  FPMB_MOTION_DOWN_MASK;
             }
           }
 #endif
 
-          if (!is_zero_mv(&mv)) {
-            ++mvcount;
+          // Does the row vector point inwards or outwards?
+          if (mb_row < cm->mb_rows / 2) {
+            if (mv.row > 0)
+              --(fp_acc_data->sum_in_vectors);
+            else if (mv.row < 0)
+              ++(fp_acc_data->sum_in_vectors);
+          } else if (mb_row > cm->mb_rows / 2) {
+            if (mv.row > 0)
+              ++(fp_acc_data->sum_in_vectors);
+            else if (mv.row < 0)
+              --(fp_acc_data->sum_in_vectors);
+          }
 
+          // Does the col vector point inwards or outwards?
+          if (mb_col < cm->mb_cols / 2) {
+            if (mv.col > 0)
+              --(fp_acc_data->sum_in_vectors);
+            else if (mv.col < 0)
+              ++(fp_acc_data->sum_in_vectors);
+          } else if (mb_col > cm->mb_cols / 2) {
+            if (mv.col > 0)
+              ++(fp_acc_data->sum_in_vectors);
+            else if (mv.col < 0)
+              --(fp_acc_data->sum_in_vectors);
+          }
+          fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+        } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
+          fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+        } else {  // 0,0 mv but high error
+          fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+        }
+      } else {  // Intra < inter error
+        if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH))
+          fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+        else
+          fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+      }
+    } else {
+      fp_acc_data->sr_coded_error += (int64_t)this_error;
+    }
+    fp_acc_data->coded_error += (int64_t)this_error;
+
+    recon_yoffset += 16;
+    recon_uvoffset += uv_mb_height;
+
+    // Accumulate row level stats to the corresponding tile stats
+    if (cpi->new_mt && mb_col == (tile.mi_col_end >> 1) - 1)
+      accumulate_fp_mb_row_stat(tile_data, fp_acc_data);
+
+    (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, mb_row, c,
+                                    num_mb_cols);
+  }
+  vpx_clear_system_state();
+}
+
+static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mb_row;
+  TileDataEnc tile_data;
+  TileInfo *tile = &tile_data.tile_info;
+  MV zero_mv = { 0, 0 };
+  MV best_ref_mv;
+  // Tiling is ignored in the first pass.
+  vp9_tile_init(tile, cm, 0, 0);
+
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+    best_ref_mv = zero_mv;
+    vp9_first_pass_encode_tile_mb_row(cpi, &cpi->td, fp_acc_data, &tile_data,
+                                      &best_ref_mv, mb_row);
+  }
+}
+
+void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TWO_PASS *twopass = &cpi->twopass;
+
+  YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+  YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+  YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
+  const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+
+  LAYER_CONTEXT *const lc =
+      is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id]
+                           : NULL;
+  BufferPool *const pool = cm->buffer_pool;
+
+  FIRSTPASS_DATA fp_temp_data;
+  FIRSTPASS_DATA *fp_acc_data = &fp_temp_data;
+
+  vpx_clear_system_state();
+  vp9_zero(fp_temp_data);
+  fp_acc_data->image_data_start_row = INVALID_ROW;
+
+  // First pass code requires valid last and new frame buffers.
+  assert(new_yv12 != NULL);
+  assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL));
+
 #if CONFIG_FP_MB_STATS
-            if (cpi->use_fp_mb_stats) {
-              cpi->twopass.frame_mb_stats_buf[mb_index] &=
-                  ~FPMB_MOTION_ZERO_MASK;
-              // check estimated motion direction
-              if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) {
-                // right direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_RIGHT_MASK;
-              } else if (mv.as_mv.row < 0 &&
-                         abs(mv.as_mv.row) >= abs(mv.as_mv.col)) {
-                // up direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_UP_MASK;
-              } else if (mv.as_mv.col < 0 &&
-                         abs(mv.as_mv.col) >= abs(mv.as_mv.row)) {
-                // left direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_LEFT_MASK;
-              } else {
-                // down direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_DOWN_MASK;
-              }
-            }
+  if (cpi->use_fp_mb_stats) {
+    vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs);
+  }
 #endif
 
-            // Does the row vector point inwards or outwards?
-            if (mb_row < cm->mb_rows / 2) {
-              if (mv.row > 0)
-                --sum_in_vectors;
-              else if (mv.row < 0)
-                ++sum_in_vectors;
-            } else if (mb_row > cm->mb_rows / 2) {
-              if (mv.row > 0)
-                ++sum_in_vectors;
-              else if (mv.row < 0)
-                --sum_in_vectors;
-            }
+  set_first_pass_params(cpi);
+  vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
 
-            // Does the col vector point inwards or outwards?
-            if (mb_col < cm->mb_cols / 2) {
-              if (mv.col > 0)
-                --sum_in_vectors;
-              else if (mv.col < 0)
-                ++sum_in_vectors;
-            } else if (mb_col > cm->mb_cols / 2) {
-              if (mv.col > 0)
-                ++sum_in_vectors;
-              else if (mv.col < 0)
-                --sum_in_vectors;
-            }
-            frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
-          } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
-            frame_noise_energy += fp_estimate_block_noise(x, bsize);
-          } else {  // 0,0 mv but high error
-            frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
-          }
-        } else {  // Intra < inter error
-          if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH))
-            frame_noise_energy += fp_estimate_block_noise(x, bsize);
-          else
-            frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
-        }
-      } else {
-        sr_coded_error += (int64_t)this_error;
-      }
-      coded_error += (int64_t)this_error;
+  if (lc != NULL) {
+    twopass = &lc->twopass;
 
-      // Adjust to the next column of MBs.
-      x->plane[0].src.buf += 16;
-      x->plane[1].src.buf += uv_mb_height;
-      x->plane[2].src.buf += uv_mb_height;
+    cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
+    cpi->ref_frame_flags = VP9_LAST_FLAG;
 
-      recon_yoffset += 16;
-      recon_uvoffset += uv_mb_height;
+    if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id <
+        REF_FRAMES) {
+      cpi->gld_fb_idx =
+          cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id;
+      cpi->ref_frame_flags |= VP9_GOLD_FLAG;
+      cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0);
+    } else {
+      cpi->refresh_golden_frame = 0;
     }
 
-    // Adjust to the next row of MBs.
-    x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
-    x->plane[1].src.buf +=
-        uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
-    x->plane[2].src.buf +=
-        uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
+    if (lc->current_video_frame_in_layer == 0) cpi->ref_frame_flags = 0;
 
-    vpx_clear_system_state();
+    vp9_scale_references(cpi);
+
+    // Use either last frame or alt frame for motion search.
+    if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+      first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
+      if (first_ref_buf == NULL)
+        first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME);
+    }
+
+    if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+      gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+      if (gld_yv12 == NULL) {
+        gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+      }
+    } else {
+      gld_yv12 = NULL;
+    }
+
+    set_ref_ptrs(cm, xd,
+                 (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE,
+                 (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
+
+    cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+                                        &cpi->scaled_source, 0);
   }
 
-  // Clamp the image start to rows/2. This number of rows is discarded top
-  // and bottom as dead data so rows / 2 means the frame is blank.
-  if ((image_data_start_row > cm->mb_rows / 2) ||
-      (image_data_start_row == INVALID_ROW)) {
-    image_data_start_row = cm->mb_rows / 2;
+  vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+
+  vp9_setup_src_planes(x, cpi->Source, 0, 0);
+  vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
+
+  if (!frame_is_intra_only(cm)) {
+    vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
   }
-  // Exclude any image dead zone
-  if (image_data_start_row > 0) {
-    intra_skip_count =
-        VPXMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
-  }
 
-  {
-    FIRSTPASS_STATS fps;
-    // The minimum error here insures some bit allocation to frames even
-    // in static regions. The allocation per MB declines for larger formats
-    // where the typical "real" energy per MB also falls.
-    // Initial estimate here uses sqrt(mbs) to define the min_err, where the
-    // number of mbs is proportional to the image area.
-    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
-                            ? cpi->initial_mbs
-                            : cpi->common.MBs;
-    const double min_err = 200 * sqrt(num_mbs);
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
 
-    intra_factor = intra_factor / (double)num_mbs;
-    brightness_factor = brightness_factor / (double)num_mbs;
-    fps.weight = intra_factor * brightness_factor;
+  vp9_frame_init_quantizer(cpi);
 
-    fps.frame = cm->current_video_frame;
-    fps.spatial_layer_id = cpi->svc.spatial_layer_id;
-    fps.coded_error = (double)(coded_error >> 8) + min_err;
-    fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
-    fps.intra_error = (double)(intra_error >> 8) + min_err;
-    fps.frame_noise_energy = (double)frame_noise_energy / (double)num_mbs;
-    fps.count = 1.0;
-    fps.pcnt_inter = (double)intercount / num_mbs;
-    fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
-    fps.pcnt_neutral = (double)neutral_count / num_mbs;
-    fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
-    fps.intra_smooth_pct = (double)intra_smooth_count / num_mbs;
-    fps.inactive_zone_rows = (double)image_data_start_row;
-    // Currently set to 0 as most issues relate to letter boxing.
-    fps.inactive_zone_cols = (double)0;
+  x->skip_recode = 0;
 
-    if (mvcount > 0) {
-      fps.MVr = (double)sum_mvr / mvcount;
-      fps.mvr_abs = (double)sum_mvr_abs / mvcount;
-      fps.MVc = (double)sum_mvc / mvcount;
-      fps.mvc_abs = (double)sum_mvc_abs / mvcount;
-      fps.MVrv =
-          ((double)sum_mvrs - ((double)sum_mvr * sum_mvr / mvcount)) / mvcount;
-      fps.MVcv =
-          ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount;
-      fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
-      fps.pcnt_motion = (double)mvcount / num_mbs;
+  vp9_init_mv_probs(cm);
+  vp9_initialize_rd_consts(cpi);
+
+  cm->log2_tile_rows = 0;
+
+  {
+    FIRSTPASS_STATS fps;
+    TileDataEnc *first_tile_col;
+    if (!cpi->new_mt) {
+      cm->log2_tile_cols = 0;
+      cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy;
+      cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy;
+      first_pass_encode(cpi, fp_acc_data);
+      first_pass_stat_calc(cpi, &fps, fp_acc_data);
     } else {
-      fps.MVr = 0.0;
-      fps.mvr_abs = 0.0;
-      fps.MVc = 0.0;
-      fps.mvc_abs = 0.0;
-      fps.MVrv = 0.0;
-      fps.MVcv = 0.0;
-      fps.mv_in_out_count = 0.0;
-      fps.pcnt_motion = 0.0;
+      cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read;
+      cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write;
+#if ENABLE_MT_BIT_MATCH
+      cm->log2_tile_cols = 0;
+      vp9_zero_array(cpi->twopass.fp_mb_float_stats, cm->MBs);
+#endif
+      vp9_encode_fp_row_mt(cpi);
+      first_tile_col = &cpi->tile_data[0];
+#if ENABLE_MT_BIT_MATCH
+      accumulate_floating_point_stats(cpi, first_tile_col);
+#endif
+      first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data));
     }
 
     // Don't allow a value of 0 for duration.
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -39,7 +39,41 @@
 } FIRSTPASS_MB_STATS;
 #endif
 
+#define INVALID_ROW -1
+
+#define ENABLE_MT_BIT_MATCH 0
+#if ENABLE_MT_BIT_MATCH
 typedef struct {
+  double frame_mb_intra_factor;
+  double frame_mb_brightness_factor;
+  double frame_mb_neutral_count;
+} FP_MB_FLOAT_STATS;
+#endif
+
+typedef struct {
+  double intra_factor;
+  double brightness_factor;
+  int64_t coded_error;
+  int64_t sr_coded_error;
+  int64_t frame_noise_energy;
+  int64_t intra_error;
+  int intercount;
+  int second_ref_count;
+  double neutral_count;
+  int intra_skip_count;
+  int image_data_start_row;
+  int mvcount;
+  int sum_mvr;
+  int sum_mvr_abs;
+  int sum_mvc;
+  int sum_mvc_abs;
+  int64_t sum_mvrs;
+  int64_t sum_mvcs;
+  int sum_in_vectors;
+  int intra_smooth_count;
+} FIRSTPASS_DATA;
+
+typedef struct {
   double frame;
   double weight;
   double intra_error;
@@ -114,6 +148,11 @@
   uint8_t *this_frame_mb_stats;
   FIRSTPASS_MB_STATS firstpass_mb_stats;
 #endif
+
+#if ENABLE_MT_BIT_MATCH
+  FP_MB_FLOAT_STATS *fp_mb_float_stats;
+#endif
+
   // An indication of the content type of the current frame
   FRAME_CONTENT_TYPE fr_content_type;
 
@@ -141,11 +180,19 @@
 } TWO_PASS;
 
 struct VP9_COMP;
+struct ThreadData;
+struct TileDataEnc;
 
 void vp9_init_first_pass(struct VP9_COMP *cpi);
 void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi);
 void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source);
 void vp9_end_first_pass(struct VP9_COMP *cpi);
+
+void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi,
+                                       struct ThreadData *td,
+                                       FIRSTPASS_DATA *fp_acc_data,
+                                       struct TileDataEnc *tile_data,
+                                       MV *best_ref_mv, int mb_row);
 
 void vp9_init_second_pass(struct VP9_COMP *cpi);
 void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
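
The FIRSTPASS_DATA structure above carries the per-row accumulators that
previously lived as locals in vp9_first_pass(); each macroblock row sums into
its own copy, and the per-row totals are folded into the owning tile's fp_data
by accumulate_fp_mb_row_stat(), whose body is not part of this section. A
minimal sketch of that kind of reduction, with hypothetical field coverage and
a made-up function name:

    /* Hypothetical sketch only; the real accumulation is done by
     * accumulate_fp_mb_row_stat() in vp9_firstpass.c. */
    static void fold_row_into_tile(FIRSTPASS_DATA *tile_stats,
                                   const FIRSTPASS_DATA *row_stats) {
      tile_stats->coded_error += row_stats->coded_error;
      tile_stats->sr_coded_error += row_stats->sr_coded_error;
      tile_stats->intra_error += row_stats->intra_error;
      tile_stats->intercount += row_stats->intercount;
      tile_stats->mvcount += row_stats->mvcount;
      tile_stats->sum_mvr += row_stats->sum_mvr;
      tile_stats->sum_mvc += row_stats->sum_mvc;
      tile_stats->sum_in_vectors += row_stats->sum_in_vectors;
      /* Keep the first valid image-data row seen so far. */
      if (tile_stats->image_data_start_row == INVALID_ROW)
        tile_stats->image_data_start_row = row_stats->image_data_start_row;
    }
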
--- /dev/null
+++ b/vp9/encoder/vp9_job_queue.h
@@ -1,0 +1,46 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_JOB_QUEUE_H_
+#define VP9_ENCODER_VP9_JOB_QUEUE_H_
+
+typedef enum {
+  FIRST_PASS_JOB,
+  ENCODE_JOB,
+  ARNR_JOB,
+  NUM_JOB_TYPES,
+} JOB_TYPE;
+
+// Encode job parameters
+typedef struct {
+  int vert_unit_row_num;  // Index of the vertical unit row
+  int tile_col_id;        // Tile column id of the job
+  int tile_row_id;        // Tile row id of the job
+} JobNode;
+
+// Job queue element parameters
+typedef struct {
+  // Pointer to the next link in the job queue
+  void *next;
+
+  // Job information context of the module
+  JobNode job_info;
+} JobQueue;
+
+// Job queue handle
+typedef struct {
+  // Pointer to the next link in the job queue
+  void *next;
+
+  // Counter to store the number of jobs picked up for processing
+  int num_jobs_acquired;
+} JobQueueHandle;
+
+#endif  // VP9_ENCODER_VP9_JOB_QUEUE_H_
--- /dev/null
+++ b/vp9/encoder/vp9_multi_thread.c
@@ -1,0 +1,282 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_multi_thread.h"
+
+void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt,
+                               int tile_id) {
+  RowMTInfo *row_mt_info;
+  JobQueueHandle *job_queue_hdl = NULL;
+  void *next = NULL;
+  JobNode *job_info = NULL;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex_handle = NULL;
+#endif
+
+  row_mt_info = (RowMTInfo *)(&multi_thread_ctxt->row_mt_info[tile_id]);
+  job_queue_hdl = (JobQueueHandle *)&row_mt_info->job_queue_hdl;
+#if CONFIG_MULTITHREAD
+  mutex_handle = &row_mt_info->job_mutex;
+#endif
+
+// lock the mutex for queue access
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(mutex_handle);
+#endif
+  next = job_queue_hdl->next;
+  if (NULL != next) {
+    JobQueue *job_queue = (JobQueue *)next;
+    job_info = &job_queue->job_info;
+    // Update the next job in the queue
+    job_queue_hdl->next = job_queue->next;
+    job_queue_hdl->num_jobs_acquired++;
+  }
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(mutex_handle);
+#endif
+
+  return job_info;
+}
+
+void vp9_row_mt_mem_alloc(VP9_COMP *cpi) {
+  struct VP9Common *cm = &cpi->common;
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  int tile_row, tile_col;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  int jobs_per_tile_col, total_jobs;
+
+  jobs_per_tile_col = VPXMAX(cm->mb_rows, sb_rows);
+  // Calculate the total number of jobs
+  total_jobs = jobs_per_tile_col * tile_cols;
+
+  multi_thread_ctxt->allocated_tile_cols = tile_cols;
+  multi_thread_ctxt->allocated_tile_rows = tile_rows;
+  multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col;
+
+  multi_thread_ctxt->job_queue =
+      (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue));
+
+#if CONFIG_MULTITHREAD
+  // Create mutex for each tile
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col];
+    pthread_mutex_init(&row_mt_info->job_mutex, NULL);
+  }
+#endif
+
+  // Allocate memory for row based multi-threading
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    TileDataEnc *this_tile = &cpi->tile_data[tile_col];
+    vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col);
+  }
+
+  // Assign the sync pointer of tile row zero for every tile row > 0
+  for (tile_row = 1; tile_row < tile_rows; tile_row++) {
+    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+      TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+      TileDataEnc *this_col_tile = &cpi->tile_data[tile_col];
+      this_tile->row_mt_sync = this_col_tile->row_mt_sync;
+    }
+  }
+
+  // Calculate the number of vertical units in the given tile row
+  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+    TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols];
+    TileInfo *tile_info = &this_tile->tile_info;
+    multi_thread_ctxt->num_tile_vert_sbs[tile_row] =
+        get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2);
+  }
+}
+
+void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) {
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  int tile_col;
+
+  // Deallocate memory for job queue
+  if (multi_thread_ctxt->job_queue) vpx_free(multi_thread_ctxt->job_queue);
+
+#if CONFIG_MULTITHREAD
+  // Destroy mutex for each tile
+  for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
+       tile_col++) {
+    RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col];
+    if (row_mt_info) pthread_mutex_destroy(&row_mt_info->job_mutex);
+  }
+#endif
+
+  // Free row based multi-threading sync memory
+  for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
+       tile_col++) {
+    TileDataEnc *this_tile = &cpi->tile_data[tile_col];
+    vp9_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
+  }
+}
+
+void vp9_multi_thread_tile_init(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  int i;
+
+  for (i = 0; i < tile_cols; i++) {
+    TileDataEnc *this_tile = &cpi->tile_data[i];
+    int jobs_per_tile_col = cpi->oxcf.pass == 1 ? cm->mb_rows : sb_rows;
+
+    // Initialize cur_col to -1 for all rows.
+    memset(this_tile->row_mt_sync.cur_col, -1,
+           sizeof(*this_tile->row_mt_sync.cur_col) * jobs_per_tile_col);
+    vp9_zero(this_tile->fp_data);
+    this_tile->fp_data.image_data_start_row = INVALID_ROW;
+  }
+}
+
+void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt,
+                               int tile_cols, int num_workers) {
+  int tile_id = 0;
+  int i;
+
+  // Assign tiles to the threads in a round-robin fashion
+  for (i = 0; i < num_workers; i++) {
+    multi_thread_ctxt->thread_id_to_tile_id[i] = tile_id++;
+    if (tile_id == tile_cols) tile_id = 0;
+  }
+}
+
+int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt,
+                             int cur_tile_id) {
+  RowMTInfo *row_mt_info;
+  JobQueueHandle *job_queue_hndl;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex;
+#endif
+  int num_jobs_remaining;
+
+  row_mt_info = &multi_thread_ctxt->row_mt_info[cur_tile_id];
+  job_queue_hndl = &row_mt_info->job_queue_hdl;
+#if CONFIG_MULTITHREAD
+  mutex = &row_mt_info->job_mutex;
+#endif
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(mutex);
+#endif
+  num_jobs_remaining =
+      multi_thread_ctxt->jobs_per_tile_col - job_queue_hndl->num_jobs_acquired;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(mutex);
+#endif
+
+  return (num_jobs_remaining);
+}
+
+void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type) {
+  VP9_COMMON *const cm = &cpi->common;
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  JobQueue *job_queue = multi_thread_ctxt->job_queue;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  int job_row_num, jobs_per_tile, jobs_per_tile_col, total_jobs;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  int tile_col, i;
+
+  jobs_per_tile_col = (job_type != ENCODE_JOB) ? cm->mb_rows : sb_rows;
+  total_jobs = jobs_per_tile_col * tile_cols;
+
+  multi_thread_ctxt->jobs_per_tile_col = jobs_per_tile_col;
+  // memset the entire job queue buffer to zero
+  memset(job_queue, 0, total_jobs * sizeof(JobQueue));
+
+  // Job queue preparation
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    RowMTInfo *tile_ctxt = &multi_thread_ctxt->row_mt_info[tile_col];
+    JobQueue *job_queue_curr, *job_queue_temp;
+    int tile_row = 0;
+
+    tile_ctxt->job_queue_hdl.next = (void *)job_queue;
+    tile_ctxt->job_queue_hdl.num_jobs_acquired = 0;
+
+    job_queue_curr = job_queue;
+    job_queue_temp = job_queue;
+
+    // loop over all the vertical rows
+    for (job_row_num = 0, jobs_per_tile = 0; job_row_num < jobs_per_tile_col;
+         job_row_num++, jobs_per_tile++) {
+      job_queue_curr->job_info.vert_unit_row_num = job_row_num;
+      job_queue_curr->job_info.tile_col_id = tile_col;
+      job_queue_curr->job_info.tile_row_id = tile_row;
+      job_queue_curr->next = (void *)(job_queue_temp + 1);
+      job_queue_curr = ++job_queue_temp;
+
+      if (ENCODE_JOB == job_type) {
+        if (jobs_per_tile >=
+            multi_thread_ctxt->num_tile_vert_sbs[tile_row] - 1) {
+          tile_row++;
+          jobs_per_tile = -1;
+        }
+      }
+    }
+
+    // Set the last pointer to NULL
+    job_queue_curr += -1;
+    job_queue_curr->next = (void *)NULL;
+
+    // Move to the next tile
+    job_queue += jobs_per_tile_col;
+  }
+
+  for (i = 0; i < cpi->num_workers; i++) {
+    EncWorkerData *thread_data;
+    thread_data = &cpi->tile_thr_data[i];
+    thread_data->thread_id = i;
+
+    for (tile_col = 0; tile_col < tile_cols; tile_col++)
+      thread_data->tile_completion_status[tile_col] = 0;
+  }
+}
+
+int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt,
+                              int *tile_completion_status, int *cur_tile_id,
+                              int tile_cols) {
+  int tile_col;
+  int tile_id = -1;  // Stores the ID of the least processed tile
+  int max_num_jobs_remaining = 0;
+  int num_jobs_remaining;
+
+  // Mark the current tile as complete so the loop below skips it
+  tile_completion_status[*cur_tile_id] = 1;
+  // Check for the status of all the tiles
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    if (tile_completion_status[tile_col] == 0) {
+      num_jobs_remaining =
+          vp9_get_job_queue_status(multi_thread_ctxt, tile_col);
+      // Mark the completion to avoid checks during future switches across tiles
+      if (num_jobs_remaining == 0) tile_completion_status[tile_col] = 1;
+      if (num_jobs_remaining > max_num_jobs_remaining) {
+        max_num_jobs_remaining = num_jobs_remaining;
+        tile_id = tile_col;
+      }
+    }
+  }
+
+  if (-1 == tile_id) {
+    return 1;
+  } else {
+    // Update the cur ID to the next tile ID that will be processed,
+    // which will be the least processed tile
+    *cur_tile_id = tile_id;
+    return 0;
+  }
+}
--- /dev/null
+++ b/vp9/encoder/vp9_multi_thread.h
@@ -1,0 +1,38 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_MULTI_THREAD_H_
+#define VP9_ENCODER_VP9_MULTI_THREAD_H_
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_job_queue.h"
+
+void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt,
+                               int tile_id);
+
+void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type);
+
+int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt,
+                             int cur_tile_id);
+
+void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt,
+                               int tile_cols, int num_workers);
+
+void vp9_multi_thread_tile_init(VP9_COMP *cpi);
+
+void vp9_row_mt_mem_alloc(VP9_COMP *cpi);
+
+void vp9_row_mt_mem_dealloc(VP9_COMP *cpi);
+
+int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt,
+                              int *tile_completion_status, int *cur_tile_id,
+                              int tile_cols);
+
+#endif  // VP9_ENCODER_VP9_MULTI_THREAD_H_
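
Taken together, these helpers implement a simple per-tile work queue for the
first pass: vp9_prepare_job_queue() links one job per macroblock row into each
tile's queue, workers pop rows with vp9_enc_grp_get_next_job(), and once a
tile drains they hop to the least processed tile via
vp9_get_tiles_proc_status(). The actual worker loop lives in the row-mt code
and is not shown in this patch, so the following is only a sketch of how the
API is typically driven (the function name and the row-encode step are
hypothetical):

    /* Hypothetical worker loop illustrating the queue API. */
    static void fp_worker_loop_sketch(VP9_COMP *cpi,
                                      EncWorkerData *thread_data) {
      MultiThreadHandle *mt = &cpi->multi_thread_ctxt;
      const int tile_cols = 1 << cpi->common.log2_tile_cols;
      int cur_tile_id = mt->thread_id_to_tile_id[thread_data->thread_id];

      while (1) {
        JobNode *job = (JobNode *)vp9_enc_grp_get_next_job(mt, cur_tile_id);
        if (job != NULL) {
          /* Encode the macroblock row named by the job, e.g. via
           * vp9_first_pass_encode_tile_mb_row(..., job->vert_unit_row_num). */
          continue;
        }
        /* This tile is drained; switch to the least processed tile,
         * or stop once every tile reports zero remaining jobs. */
        if (vp9_get_tiles_proc_status(mt, thread_data->tile_completion_status,
                                      &cur_tile_id, tile_cols))
          break;
      }
    }
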
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -51,6 +51,7 @@
   vpx_color_range_t color_range;
   int render_width;
   int render_height;
+  unsigned int new_mt;
 };
 
 static struct vp9_extracfg default_extra_cfg = {
@@ -82,6 +83,7 @@
   0,                     // color range
   0,                     // render width
   0,                     // render height
+  1,                     // new_mt
 };
 
 struct vpx_codec_alg_priv {
@@ -245,6 +247,7 @@
         "kf_min_dist not supported in auto mode, use 0 "
         "or kf_max_dist instead.");
 
+  RANGE_CHECK(extra_cfg, new_mt, 0, 1);
   RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
   RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
@@ -554,6 +557,8 @@
 
   oxcf->target_level = extra_cfg->target_level;
 
+  oxcf->new_mt = extra_cfg->new_mt;
+
   for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
 #if CONFIG_SPATIAL_SVC
     oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl];
@@ -842,6 +847,13 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_new_mt(vpx_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.new_mt = CAST(VP9E_SET_NEW_MT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_err_t ctrl_get_level(vpx_codec_alg_priv_t *ctx, va_list args) {
   int *const arg = va_arg(args, int *);
   if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
@@ -1594,6 +1606,7 @@
   { VP9E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config },
   { VP9E_SET_RENDER_SIZE, ctrl_set_render_size },
   { VP9E_SET_TARGET_LEVEL, ctrl_set_target_level },
+  { VP9E_SET_NEW_MT, ctrl_set_new_mt },
 
   // Getters
   { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer },
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -39,9 +39,12 @@
 VP9_CX_SRCS-yes += encoder/vp9_encodemv.h
 VP9_CX_SRCS-yes += encoder/vp9_extend.h
 VP9_CX_SRCS-yes += encoder/vp9_firstpass.h
+VP9_CX_SRCS-yes += encoder/vp9_job_queue.h
 VP9_CX_SRCS-yes += encoder/vp9_lookahead.c
 VP9_CX_SRCS-yes += encoder/vp9_lookahead.h
 VP9_CX_SRCS-yes += encoder/vp9_mcomp.h
+VP9_CX_SRCS-yes += encoder/vp9_multi_thread.c
+VP9_CX_SRCS-yes += encoder/vp9_multi_thread.h
 VP9_CX_SRCS-yes += encoder/vp9_encoder.h
 VP9_CX_SRCS-yes += encoder/vp9_quantize.h
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -547,6 +547,14 @@
    */
   VP9E_SET_TARGET_LEVEL,
 
+  /*!\brief Codec control function to set row-level multi-threading.
+   *
+   * 0 : off, 1 : on
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_NEW_MT,
+
   /*!\brief Codec control function to get bitstream level.
    *
    * Supported in codecs: VP9
@@ -837,6 +845,9 @@
 
 VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int)
 #define VPX_CTRL_VP9E_SET_TARGET_LEVEL
+
+VPX_CTRL_USE_TYPE(VP9E_SET_NEW_MT, unsigned int)
+#define VPX_CTRL_VP9E_SET_NEW_MT
 
 VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *)
 #define VPX_CTRL_VP9E_GET_LEVEL
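
On the library side, the new control is applied like any other VP9 encoder
control; whether it has any effect depends on the encoder configuration (pass,
mode, and speed) as gated inside the encoder. A minimal sketch, assuming an
already initialized encoder context named codec:

    /* Request row-based first-pass multi-threading (0: off, 1: on). */
    if (vpx_codec_control(&codec, VP9E_SET_NEW_MT, 1) != VPX_CODEC_OK)
      fprintf(stderr, "Failed to set VP9E_SET_NEW_MT\n");
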
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -470,6 +470,9 @@
     NULL, "target-level", 1,
     "Target level (255: off (default); 0: only keep level stats; 10: level 1.0;"
     " 11: level 1.1; ... 62: level 6.2)");
+
+static const arg_def_t new_mt =
+    ARG_DEF(NULL, "new-mt", 1, "Enable row based multi-threading in VP9");
 #endif
 
 #if CONFIG_VP9_ENCODER
@@ -498,6 +501,7 @@
                                        &min_gf_interval,
                                        &max_gf_interval,
                                        &target_level,
+                                       &new_mt,
 #if CONFIG_VP9_HIGHBITDEPTH
                                        &bitdeptharg,
                                        &inbitdeptharg,
@@ -528,6 +532,7 @@
                                         VP9E_SET_MIN_GF_INTERVAL,
                                         VP9E_SET_MAX_GF_INTERVAL,
                                         VP9E_SET_TARGET_LEVEL,
+                                        VP9E_SET_NEW_MT,
                                         0 };
 #endif
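
At the command line, the new vpxenc option pairs with the usual two-pass and
threading flags. A sketch of an invocation that exercises the first-pass
row-mt path (file names are placeholders):

    vpxenc --codec=vp9 --passes=2 --good --cpu-used=2 --threads=4 \
        --tile-columns=2 --new-mt=1 -o out.webm input.y4m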