shithub: libvpx

Download patch

ref: 9ce3a7d76c5ef702337b96b9aa2c944da1b31869
parent: 4d0d78424b6aff08a8c11046a9001184425d8485
author: Hangyu Kuang <hkuang@google.com>
date: Wed Jul 30 16:43:40 EDT 2014

Implement frame parallel decode for VP9.

Using 4 threads, frame parallel decode is ~3x faster than single thread
decode and around 30% faster than tile parallel decode for frame parallel
encoded video on both Android and desktop with 4 threads. Decode speed is
scalable to threads too which means decode could be even faster with more threads.

Change-Id: Ia0a549aaa3e83b5a17b31d8299aa496ea4f21e3e

--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -12,11 +12,37 @@
 #include "vpx_mem/vpx_mem.h"
 
 #include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_systemdependent.h"
 
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+void lock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&pool->pool_mutex);
+#else
+  (void)pool;
+#endif
+}
+
+void unlock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&pool->pool_mutex);
+#else
+  (void)pool;
+#endif
+}
+
+static INLINE void alloc_mi_array(VP9_COMMON *cm, int mi_size, int idx) {
+  CHECK_MEM_ERROR(cm, cm->mip_array[idx],
+                  vpx_calloc(mi_size, sizeof(*cm->mip_array[0])));
+  CHECK_MEM_ERROR(cm, cm->mi_grid_base_array[idx],
+                  vpx_calloc(mi_size, sizeof(*cm->mi_grid_base_array[0])));
+}
+
 static void clear_mi_border(const VP9_COMMON *cm, MODE_INFO *mi) {
   int i;
 
@@ -49,7 +75,12 @@
   vpx_memset(cm->mi_grid_base, 0, cm->mi_stride * (cm->mi_rows + 1) *
                                       sizeof(*cm->mi_grid_base));
 
-  clear_mi_border(cm, cm->prev_mip);
+  // Only clear mi border in non frame-parallel decode. In frame-parallel
+  // decode, prev_mip is managed by previous decoding thread. While in
+  // non frame-parallel decode, prev_mip and mip are both managed by
+  // current decoding thread.
+  if (!cm->frame_parallel_decode)
+    clear_mi_border(cm, cm->prev_mip);
 }
 
 static int alloc_mi(VP9_COMMON *cm, int mi_size) {
@@ -56,33 +87,35 @@
   int i;
 
   for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
-    cm->mip_array[i] =
-        (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip));
-    if (cm->mip_array[i] == NULL)
-      return 1;
-
-    cm->mi_grid_base_array[i] =
-        (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
-    if (cm->mi_grid_base_array[i] == NULL)
-      return 1;
+    // Delay reallocation as another thread is accessing prev_mi.
+    if (cm->frame_parallel_decode && i == cm->prev_mi_idx) {
+      cm->update_prev_mi = 1;
+      continue;
+    }
+    alloc_mi_array(cm, mi_size, i);
   }
 
-  // Init the index.
-  cm->mi_idx = 0;
-  cm->prev_mi_idx = 1;
-
   cm->mip = cm->mip_array[cm->mi_idx];
-  cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
   cm->mi_grid_base = cm->mi_grid_base_array[cm->mi_idx];
-  cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx];
 
+  if (!cm->frame_parallel_decode) {
+    cm->mi_idx = 0;
+    cm->prev_mi_idx = 1;
+    // In frame-parallel decode, prev_mip comes from another thread,
+    // so current decoding thread should not touch it.
+    cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
+    cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx];
+  }
+
   return 0;
 }
 
-static void free_mi(VP9_COMMON *cm) {
+static void free_mi(VP9_COMMON *cm, int decode_done) {
   int i;
 
   for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+    if (cm->frame_parallel_decode && i == cm->prev_mi_idx && !decode_done)
+      continue;
     vpx_free(cm->mip_array[i]);
     cm->mip_array[i] = NULL;
     vpx_free(cm->mi_grid_base_array[i]);
@@ -90,9 +123,12 @@
   }
 
   cm->mip = NULL;
-  cm->prev_mip = NULL;
   cm->mi_grid_base = NULL;
-  cm->prev_mi_grid_base = NULL;
+
+  if (!cm->frame_parallel_decode) {
+    cm->prev_mip = NULL;
+    cm->prev_mi_grid_base = NULL;
+  }
 }
 
 static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
@@ -109,8 +145,11 @@
   cm->prev_seg_map_idx = 1;
 
   cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
-  cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
 
+  if (!cm->frame_parallel_decode) {
+    cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
+  }
+
   return 0;
 }
 
@@ -123,7 +162,10 @@
   }
 
   cm->current_frame_seg_map = NULL;
-  cm->last_frame_seg_map = NULL;
+
+  if (!cm->frame_parallel_decode) {
+    cm->last_frame_seg_map = NULL;
+  }
 }
 
 void vp9_free_frame_buffers(VP9_COMMON *cm) {
@@ -144,8 +186,7 @@
 }
 
 void vp9_free_context_buffers(VP9_COMMON *cm) {
-  free_mi(cm);
-
+  free_mi(cm, 1);
   free_seg_map(cm);
 
   vpx_free(cm->above_context);
@@ -170,7 +211,7 @@
 
   set_mb_mi(cm, aligned_width, aligned_height);
 
-  free_mi(cm);
+  free_mi(cm, 0);
   if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE)))
     goto fail;
 
@@ -288,7 +329,6 @@
 void vp9_remove_common(VP9_COMMON *cm) {
   vp9_free_frame_buffers(cm);
   vp9_free_context_buffers(cm);
-  vp9_free_internal_frame_buffers(&cm->buffer_pool->int_frame_buffers);
 }
 
 void vp9_update_frame_size(VP9_COMMON *cm) {
@@ -306,6 +346,20 @@
 void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
   // Swap indices.
   const int tmp = cm->mi_idx;
+
+  // Only used in frame parallel decode: Update the prev_mi buffer if
+  // needed. The worker that was accessing it must already finish decoding.
+  // So it can be resized safely now.
+  if (cm->update_prev_mi) {
+    const int mi_size = cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE);
+    vpx_free(cm->mip_array[cm->prev_mi_idx]);
+    vpx_free(cm->mi_grid_base_array[cm->prev_mi_idx]);
+    cm->mip_array[cm->prev_mi_idx] = NULL;
+    cm->mi_grid_base_array[cm->prev_mi_idx] = NULL;
+    alloc_mi_array(cm, mi_size, cm->prev_mi_idx);
+    cm->update_prev_mi = 0;
+  }
+
   cm->mi_idx = cm->prev_mi_idx;
   cm->prev_mi_idx = tmp;
 
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -439,7 +439,8 @@
   int i;
   vp9_clearall_segfeatures(&cm->seg);
   cm->seg.abs_delta = SEGMENT_DELTADATA;
-  if (cm->last_frame_seg_map)
+
+  if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
     vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
 
   if (cm->current_frame_seg_map)
@@ -467,7 +468,7 @@
     cm->frame_contexts[cm->frame_context_idx] = cm->fc;
   }
 
-  if (frame_is_intra_only(cm))
+  if (frame_is_intra_only(cm) && !cm->frame_parallel_decode)
     vpx_memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) *
                                     sizeof(*cm->prev_mip));
 
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -17,15 +17,13 @@
                              const TileInfo *const tile,
                              MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                              int_mv *mv_ref_list,
-                             int block, int mi_row, int mi_col) {
+                             int block, int mi_row, int mi_col,
+                             find_mv_refs_sync sync, void *const data) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
-  const MODE_INFO *prev_mi = cm->coding_use_prev_mi && cm->prev_mi
-        ? cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]
-        : NULL;
-  const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL;
+  MODE_INFO *prev_mi = NULL;
+  MB_MODE_INFO *prev_mbmi = NULL;
 
-
   const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
 
   int different_ref_found = 0;
@@ -71,6 +69,14 @@
     }
   }
 
+  // Synchronize here for frame parallel decode if sync function is provided.
+  if (sync != NULL) {
+    sync(data, mi_row);
+  }
+  prev_mi = cm->coding_use_prev_mi && cm->prev_mi ?
+            cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col] : NULL;
+  prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL;
+
   // Check the last frame's mode and mv info.
   if (prev_mbmi) {
     if (prev_mbmi->ref_frame[0] == ref_frame)
@@ -109,12 +115,13 @@
 }
 
 void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                                    const TileInfo *const tile,
-                                    MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                                    int_mv *mv_ref_list,
-                                    int mi_row, int mi_col) {
+                      const TileInfo *const tile,
+                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      int_mv *mv_ref_list,
+                      int mi_row, int mi_col,
+                      find_mv_refs_sync sync, void *const data) {
   find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1,
-                   mi_row, mi_col);
+                   mi_row, mi_col, sync, data);
 }
 
 static void lower_mv_precision(MV *mv, int allow_hp) {
@@ -152,7 +159,7 @@
   assert(MAX_MV_REF_CANDIDATES == 2);
 
   find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block,
-                   mi_row, mi_col);
+                   mi_row, mi_col, NULL, NULL);
 
   near->as_int = 0;
   switch (block) {
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -204,10 +204,12 @@
                xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 }
 
+typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
 void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                       const TileInfo *const tile,
                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                      int_mv *mv_ref_list, int mi_row, int mi_col);
+                      int_mv *mv_ref_list, int mi_row, int mi_col,
+                      find_mv_refs_sync sync, void *const data);
 
 // check a list of motion vectors by sad score using a number rows of pixels
 // above and a number cols of pixels in the left to select the one with best
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -36,10 +36,13 @@
 #define REF_FRAMES_LOG2 3
 #define REF_FRAMES (1 << REF_FRAMES_LOG2)
 
-// 1 scratch frame for the new frame, 3 for scaled references on the encoder
+// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
+// in parallel, 3 for scaled references on the encoder.
+// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
+// of framebuffers.
 // TODO(jkoleszar): These 3 extra references could probably come from the
 // normal reference pool.
-#define FRAME_BUFFERS (REF_FRAMES + 4)
+#define FRAME_BUFFERS (REF_FRAMES + 7)
 
 #define FRAME_CONTEXTS_LOG2 2
 #define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)
@@ -64,6 +67,18 @@
   int ref_count;
   vpx_codec_frame_buffer_t raw_frame_buffer;
   YV12_BUFFER_CONFIG buf;
+
+  // The Following variables will only be used in frame parallel decode.
+
+  // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
+  // that no FrameWorker owns, or is decoding, this buffer.
+  VP9Worker *frame_worker_owner;
+
+  // row and col indicate which position frame has been decoded to in real
+  // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX
+  // when the frame is fully decoded.
+  int row;
+  int col;
 } RefCntBuffer;
 
 typedef struct {
@@ -114,6 +129,10 @@
 
   int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
 
+  // Prepare ref_frame_map for the next frame.
+  // Only used in frame parallel decode.
+  int next_ref_frame_map[REF_FRAMES];
+
   // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
   // roll new_fb_idx into it.
 
@@ -178,6 +197,9 @@
   MODE_INFO **prev_mi_grid_base;
   MODE_INFO **prev_mi_grid_visible;
 
+  // Used in frame parallel decode for delay resizing prev_mi.
+  int update_prev_mi;
+
   // Persistent mb segment id map used in prediction.
   int seg_map_idx;
   int prev_seg_map_idx;
@@ -197,6 +219,10 @@
   struct loopfilter lf;
   struct segmentation seg;
 
+  // TODO(hkuang): Remove this as it is the same as frame_parallel_decode
+  // in pbi.
+  int frame_parallel_decode;  // frame-based threading.
+
   // Context probabilities for reference frame prediction
   int allow_comp_inter_inter;
   MV_REFERENCE_FRAME comp_fixed_ref;
@@ -235,6 +261,11 @@
   ENTROPY_CONTEXT *above_context;
 } VP9_COMMON;
 
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+void lock_buffer_pool(BufferPool *const pool);
+void unlock_buffer_pool(BufferPool *const pool);
+
 static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
   return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
 }
@@ -242,6 +273,8 @@
 static INLINE int get_free_fb(VP9_COMMON *cm) {
   RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
   int i;
+
+  lock_buffer_pool(cm->buffer_pool);
   for (i = 0; i < FRAME_BUFFERS; ++i)
     if (frame_bufs[i].ref_count == 0)
       break;
@@ -248,6 +281,7 @@
 
   assert(i < FRAME_BUFFERS);
   frame_bufs[i].ref_count = 1;
+  unlock_buffer_pool(cm->buffer_pool);
   return i;
 }
 
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -327,21 +327,24 @@
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   RefBuffer *ref_buffer = &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME];
   xd->block_refs[idx] = ref_buffer;
+
   if (!vp9_is_valid_scale(&ref_buffer->sf))
     vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                        "Invalid scale factors");
   vp9_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
                        &ref_buffer->sf);
-  xd->corrupted |= ref_buffer->buf->corrupted;
+  if (!cm->frame_parallel_decode)
+    xd->corrupted |= ref_buffer->buf->corrupted;
 }
 
-static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
                          const TileInfo *const tile,
                          int mi_row, int mi_col,
                          vp9_reader *r, BLOCK_SIZE bsize) {
+  VP9_COMMON *const cm = &pbi->common;
   const int less8x8 = bsize < BLOCK_8X8;
   MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
-  vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
+  vp9_read_mode_info(pbi, xd, tile, mi_row, mi_col, r);
 
   if (less8x8)
     bsize = BLOCK_8X8;
@@ -365,7 +368,7 @@
       set_ref(cm, xd, 1, mi_row, mi_col);
 
     // Prediction
-    vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+    vp9_dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize);
 
     // Reconstruction
     if (!mbmi->skip) {
@@ -404,10 +407,11 @@
   return p;
 }
 
-static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
                              const TileInfo *const tile,
                              int mi_row, int mi_col,
                              vp9_reader* r, BLOCK_SIZE bsize) {
+  VP9_COMMON *const cm = &pbi->common;
   const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize, uv_subsize;
@@ -422,27 +426,27 @@
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Invalid block size.");
   if (subsize < BLOCK_8X8) {
-    decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+    decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
         break;
       case PARTITION_HORZ:
-        decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
         if (mi_row + hbs < cm->mi_rows)
-          decode_block(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+          decode_block(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize);
         break;
       case PARTITION_VERT:
-        decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
         if (mi_col + hbs < cm->mi_cols)
-          decode_block(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+          decode_block(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize);
         break;
       case PARTITION_SPLIT:
-        decode_partition(cm, xd, tile, mi_row,       mi_col,       r, subsize);
-        decode_partition(cm, xd, tile, mi_row,       mi_col + hbs, r, subsize);
-        decode_partition(cm, xd, tile, mi_row + hbs, mi_col,       r, subsize);
-        decode_partition(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
+        decode_partition(pbi, xd, tile, mi_row,       mi_col,       r, subsize);
+        decode_partition(pbi, xd, tile, mi_row,       mi_col + hbs, r, subsize);
+        decode_partition(pbi, xd, tile, mi_row + hbs, mi_col,       r, subsize);
+        decode_partition(pbi, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
         break;
       default:
         assert(0 && "Invalid partition type");
@@ -638,6 +642,7 @@
     vp9_update_frame_size(cm);
   }
 
+  lock_buffer_pool(pool);
   if (vp9_realloc_frame_buffer(
           get_frame_new_buffer(cm), cm->width, cm->height,
           cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS,
@@ -646,6 +651,7 @@
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
   }
+  unlock_buffer_pool(pool);
 }
 
 static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@@ -778,7 +784,7 @@
   const int tile_rows = 1 << cm->log2_tile_rows;
   TileBuffer tile_buffers[4][1 << 6];
   int tile_row, tile_col;
-  int mi_row, mi_col;
+  int mi_row = 0, mi_col = 0;
   TileData *tile_data = NULL;
 
   if (cm->lf.filter_level && pbi->lf_worker.data1 == NULL) {
@@ -798,7 +804,6 @@
     vp9_copy(lf_data->planes, pbi->mb.plane);
     lf_data->stop = 0;
     lf_data->y_only = 0;
-    vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
   }
 
   assert(tile_rows <= 4);
@@ -856,7 +861,7 @@
         vp9_zero(tile_data->xd.left_seg_context);
         for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
              mi_col += MI_BLOCK_SIZE) {
-          decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col,
+          decode_partition(pbi, &tile_data->xd, &tile, mi_row, mi_col,
                            &tile_data->bit_reader, BLOCK_64X64);
         }
       }
@@ -880,6 +885,12 @@
           winterface->execute(&pbi->lf_worker);
         }
       }
+      // After loopfiltering, the last 7 row pixels in each superblock row may
+      // still be changed by the longest loopfilter of the next superblock
+      // row.
+      if (pbi->frame_parallel_decode)
+        vp9_frameworker_broadcast(pbi->cur_buf,
+                                  mi_row << MI_BLOCK_SIZE_LOG2);
     }
   }
 
@@ -895,6 +906,8 @@
   // Get last tile data.
   tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
 
+  if (pbi->frame_parallel_decode)
+    vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX);
   return vp9_reader_find_end(&tile_data->bit_reader);
 }
 
@@ -909,7 +922,7 @@
     vp9_zero(tile_data->xd.left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE) {
-      decode_partition(tile_data->cm, &tile_data->xd, tile,
+      decode_partition(tile_data->pbi, &tile_data->xd, tile,
                        mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
     }
   }
@@ -1015,10 +1028,10 @@
       TileInfo *const tile = (TileInfo*)worker->data2;
       TileBuffer *const buf = &tile_buffers[0][n];
 
-      tile_data->cm = cm;
+      tile_data->pbi = pbi;
       tile_data->xd = pbi->mb;
       tile_data->xd.corrupted = 0;
-      vp9_tile_init(tile, tile_data->cm, 0, buf->col);
+      vp9_tile_init(tile, &pbi->common, 0, buf->col);
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
@@ -1078,8 +1091,9 @@
                                        struct vp9_read_bit_buffer *rb) {
   VP9_COMMON *const cm = &pbi->common;
   RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+  BufferPool *const pool = pbi->common.buffer_pool;
+  int i, mask, ref_index = 0;
   size_t sz;
-  int i;
 
   cm->last_frame_type = cm->frame_type;
 
@@ -1096,7 +1110,7 @@
   if (cm->show_existing_frame) {
     // Show an existing frame directly.
     const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
-
+    lock_buffer_pool(pool);
     if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1)
       vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                          "Buffer %d does not contain a decoded frame",
@@ -1103,9 +1117,15 @@
                          frame_to_show);
 
     ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+    unlock_buffer_pool(pool);
     pbi->refresh_frame_flags = 0;
     cm->lf.filter_level = 0;
     cm->show_frame = 1;
+
+    if (pbi->frame_parallel_decode) {
+      for (i = 0; i < REF_FRAMES; ++i)
+        cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
+    }
     return 0;
   }
 
@@ -1166,7 +1186,6 @@
         ref_frame->buf = &frame_bufs[idx].buf;
         cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
       }
-
       setup_frame_size_with_refs(cm, rb);
 
       cm->allow_high_precision_mv = vp9_rb_read_bit(rb);
@@ -1198,6 +1217,29 @@
   // below, forcing the use of context 0 for those frame types.
   cm->frame_context_idx = vp9_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
 
+  // Generate next_ref_frame_map.
+  lock_buffer_pool(pool);
+  for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+    if (mask & 1) {
+      cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+      ++frame_bufs[cm->new_fb_idx].ref_count;
+    } else {
+      cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+    }
+    // Current thread holds the reference frame.
+    if (cm->ref_frame_map[ref_index] >= 0)
+      ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+    ++ref_index;
+  }
+
+  for (; ref_index < REF_FRAMES; ++ref_index) {
+    cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+    // Current thread holds the reference frame.
+    if (cm->ref_frame_map[ref_index] >= 0)
+      ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+  }
+  unlock_buffer_pool(pool);
+
   if (frame_is_intra_only(cm) || cm->error_resilient_mode)
     vp9_setup_past_independence(cm);
 
@@ -1343,6 +1385,7 @@
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
   struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0};
+  int context_updated = 0;
 
   uint8_t clear_data[MAX_VP9_HEADER_SIZE];
   const size_t first_partition_size = read_uncompressed_header(pbi,
@@ -1380,6 +1423,28 @@
   xd->corrupted = 0;
   new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
 
+  if (cm->lf.filter_level) {
+    vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
+  }
+
+  // If encoded in frame parallel mode, frame context is ready after decoding
+  // the frame header.
+  if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) {
+    VP9Worker *const worker = pbi->frame_worker_owner;
+    FrameWorkerData *const frame_worker_data = worker->data1;
+    if (cm->refresh_frame_context) {
+      context_updated = 1;
+      cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+    }
+    vp9_frameworker_lock_stats(worker);
+    pbi->cur_buf->row = -1;
+    pbi->cur_buf->col = -1;
+    frame_worker_data->frame_context_ready = 1;
+    // Signal the main thread that context is ready.
+    vp9_frameworker_signal_stats(worker);
+    vp9_frameworker_unlock_stats(worker);
+  }
+
   // TODO(jzern): remove frame_parallel_decoding_mode restriction for
   // single-frame tile decoding.
   if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
@@ -1407,7 +1472,8 @@
     }
   }
 
-  if (cm->refresh_frame_context)
+  // Non frame parallel update frame context here.
+  if (cm->refresh_frame_context && !context_updated)
     cm->frame_contexts[cm->frame_context_idx] = cm->fc;
 }
 
@@ -1454,10 +1520,9 @@
   } while (--b_h);
 }
 
-void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
-                                       int bw, int bh,
-                                       int x, int y, int w, int h,
-                                       int mi_x, int mi_y) {
+void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                                int plane, int block, int bw, int bh, int x,
+                                int y, int w, int h, int mi_x, int mi_y) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const MODE_INFO *mi = xd->mi[0];
   const int is_compound = has_second_ref(&mi->mbmi);
@@ -1484,20 +1549,23 @@
                                                pd->subsampling_y);
 
     MV32 scaled_mv;
-    int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride,
-        subpel_x, subpel_y;
+    int xs, ys, x0, y0, x0_16, y0_16, y1, frame_width, frame_height,
+        buf_stride, subpel_x, subpel_y;
     uint8_t *ref_frame, *buf_ptr;
-    const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
+    const int idx = xd->block_refs[ref]->idx;
+    BufferPool *const pool = pbi->common.buffer_pool;
+    RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
 
     // Get reference frame pointer, width and height.
     if (plane == 0) {
-      frame_width = ref_buf->y_crop_width;
-      frame_height = ref_buf->y_crop_height;
-      ref_frame = ref_buf->y_buffer;
+      frame_width = ref_frame_buf->buf.y_crop_width;
+      frame_height = ref_frame_buf->buf.y_crop_height;
+      ref_frame = ref_frame_buf->buf.y_buffer;
     } else {
-      frame_width = ref_buf->uv_crop_width;
-      frame_height = ref_buf->uv_crop_height;
-      ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer;
+      frame_width = ref_frame_buf->buf.uv_crop_width;
+      frame_height = ref_frame_buf->buf.uv_crop_height;
+      ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
+                           : ref_frame_buf->buf.v_buffer;
     }
 
     if (vp9_is_scaled(sf)) {
@@ -1550,15 +1618,18 @@
     buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
     buf_stride = pre_buf->stride;
 
+    // Get reference block bottom right vertical coordinate.
+    y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
+
     // Do border extension if there is motion or the
     // width/height is not a multiple of 8 pixels.
     if (scaled_mv.col || scaled_mv.row ||
         (frame_width & 0x7) || (frame_height & 0x7)) {
-      // Get reference block bottom right coordinate.
-      int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
-      int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
       int x_pad = 0, y_pad = 0;
 
+      // Get reference block bottom right horizontal coordinate.
+      int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
+
       if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) {
         x0 -= VP9_INTERP_EXTEND - 1;
         x1 += VP9_INTERP_EXTEND;
@@ -1571,6 +1642,12 @@
         y_pad = 1;
       }
 
+      // Wait until reference block is ready. Pad 7 more pixels as last 7
+      // pixels of each superblock row can be changed by next superblock row.
+       if (pbi->frame_parallel_decode)
+         vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+                              (y1 + 7) << (plane == 0 ? 0 : 1));
+
       // Skip border extension if block is inside the frame.
       if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width ||
           y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
@@ -1582,6 +1659,12 @@
         buf_stride = x1 - x0 + 1;
         buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
       }
+    } else {
+      // Wait until reference block is ready. Pad 7 more pixels as last 7
+      // pixels of each superblock row can be changed by next superblock row.
+       if (pbi->frame_parallel_decode)
+         vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+                              (y1 + 7) << (plane == 0 ? 0 : 1));
     }
 
     inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
@@ -1589,7 +1672,8 @@
   }
 }
 
-void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+void vp9_dec_build_inter_predictors_sb(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                                       int mi_row, int mi_col,
                                        BLOCK_SIZE bsize) {
   int plane;
   const int mi_x = mi_col * MI_SIZE;
@@ -1607,10 +1691,10 @@
       assert(bsize == BLOCK_8X8);
       for (y = 0; y < num_4x4_h; ++y)
         for (x = 0; x < num_4x4_w; ++x)
-          dec_build_inter_predictors(xd, plane, i++, bw, bh,
+          dec_build_inter_predictors(pbi, xd, plane, i++, bw, bh,
                                      4 * x, 4 * y, 4, 4, mi_x, mi_y);
     } else {
-      dec_build_inter_predictors(xd, plane, 0, bw, bh,
+      dec_build_inter_predictors(pbi, xd, plane, 0, bw, bh,
                                  0, 0, bw, bh, mi_x, mi_y);
     }
   }
--- a/vp9/decoder/vp9_decodeframe.h
+++ b/vp9/decoder/vp9_decodeframe.h
@@ -25,7 +25,8 @@
                       const uint8_t *data, const uint8_t *data_end,
                       const uint8_t **p_data_end);
 
-void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+void vp9_dec_build_inter_predictors_sb(struct VP9Decoder *const pbi,
+                                       MACROBLOCKD *xd, int mi_row, int mi_col,
                                        BLOCK_SIZE bsize);
 #ifdef __cplusplus
 }  // extern "C"
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -420,11 +420,18 @@
   }
 }
 
-static void read_inter_block_mode_info(VP9_COMMON *const cm,
+static void fpm_sync(void *const data, int mi_row) {
+  VP9Decoder *const pbi = (VP9Decoder *)data;
+  vp9_frameworker_wait(pbi->frame_worker_owner, pbi->prev_buf,
+                       mi_row << MI_BLOCK_SIZE_LOG2);
+}
+
+static void read_inter_block_mode_info(VP9Decoder *const pbi,
                                        MACROBLOCKD *const xd,
                                        const TileInfo *const tile,
                                        MODE_INFO *const mi,
                                        int mi_row, int mi_col, vp9_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
@@ -438,7 +445,7 @@
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
     vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame],
-                     mi_row, mi_col);
+                     mi_row, mi_col, fpm_sync, (void *)pbi);
   }
 
   inter_mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
@@ -512,10 +519,13 @@
   }
 }
 
-static void read_inter_frame_mode_info(VP9_COMMON *const cm,
+// TODO(hkuang): Pass cm instead of pbi. This requires change in
+// vp9_frameworker_wait.
+static void read_inter_frame_mode_info(VP9Decoder *const pbi,
                                        MACROBLOCKD *const xd,
                                        const TileInfo *const tile,
                                        int mi_row, int mi_col, vp9_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   int inter_block;
@@ -529,16 +539,17 @@
                                !mbmi->skip || !inter_block, r);
 
   if (inter_block)
-    read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
+    read_inter_block_mode_info(pbi, xd, tile, mi, mi_row, mi_col, r);
   else
     read_intra_block_mode_info(cm, mi, r);
 }
 
-void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
                         const TileInfo *const tile,
                         int mi_row, int mi_col, vp9_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
   if (frame_is_intra_only(cm))
     read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
   else
-    read_inter_frame_mode_info(cm, xd, tile, mi_row, mi_col, r);
+    read_inter_frame_mode_info(pbi, xd, tile, mi_row, mi_col, r);
 }
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -11,6 +11,7 @@
 #ifndef VP9_DECODER_VP9_DECODEMV_H_
 #define VP9_DECODER_VP9_DECODEMV_H_
 
+#include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_reader.h"
 
 #ifdef __cplusplus
@@ -19,7 +20,7 @@
 
 struct TileInfo;
 
-void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
                         const struct TileInfo *const tile,
                         int mi_row, int mi_col, vp9_reader *r);
 
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -26,6 +26,7 @@
 #endif
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_thread.h"
 
 #include "vp9/decoder/vp9_decodeframe.h"
 #include "vp9/decoder/vp9_decoder.h"
@@ -63,6 +64,7 @@
 
   // Initialize the references to not point to any frame buffers.
   vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+  vpx_memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
 
   cm->current_video_frame = 0;
   pbi->ready_for_new_data = 1;
@@ -195,29 +197,51 @@
   return 0;
 }
 
+static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
+                                      BufferPool *const pool) {
+  if (idx >= 0) {
+    --frame_bufs[idx].ref_count;
+    if (frame_bufs[idx].ref_count == 0) {
+      pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer);
+    }
+  }
+}
+
 /* If any buffer updating is signaled it should be done here. */
 static void swap_frame_buffers(VP9Decoder *pbi) {
   int ref_index = 0, mask;
-  VP9_COMMON * const cm = &pbi->common;
-  BufferPool * const pool = cm->buffer_pool;
+  VP9_COMMON *const cm = &pbi->common;
+  BufferPool *const pool = cm->buffer_pool;
   RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 
+  lock_buffer_pool(pool);
   for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
-    if (mask & 1) {
-      const int old_idx = cm->ref_frame_map[ref_index];
-      ref_cnt_fb(frame_bufs, &cm->ref_frame_map[ref_index],
-                 cm->new_fb_idx);
-      if (old_idx >= 0 && frame_bufs[old_idx].ref_count == 0)
-        pool->release_fb_cb(pool->cb_priv,
-                            &frame_bufs[old_idx].raw_frame_buffer);
+    const int old_idx = cm->ref_frame_map[ref_index];
+    // Current thread releases the holding of reference frame.
+    decrease_ref_count(old_idx, frame_bufs, pool);
+
+    // Release the reference frame in reference map.
+    if ((mask & 1) && old_idx >= 0) {
+      decrease_ref_count(old_idx, frame_bufs, pool);
     }
+    cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
     ++ref_index;
   }
 
+  // Current thread releases the holding of reference frame.
+  for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+    const int old_idx = cm->ref_frame_map[ref_index];
+    decrease_ref_count(old_idx, frame_bufs, pool);
+    cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+  }
+  unlock_buffer_pool(pool);
+
   cm->frame_to_show = get_frame_new_buffer(cm);
 
   if (!pbi->frame_parallel_decode || !cm->show_frame) {
+    lock_buffer_pool(pool);
     --frame_bufs[cm->new_fb_idx].ref_count;
+    unlock_buffer_pool(pool);
   }
 
   // Invalidate these references until the next frame starts.
@@ -256,6 +280,20 @@
                         &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
   cm->new_fb_idx = get_free_fb(cm);
 
+
+  if (pbi->frame_parallel_decode) {
+    VP9Worker *const worker = pbi->frame_worker_owner;
+    vp9_frameworker_lock_stats(worker);
+    frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
+    // Reset decoding progress.
+    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+    pbi->cur_buf->row = -1;
+    pbi->cur_buf->col = -1;
+    vp9_frameworker_unlock_stats(worker);
+  } else {
+    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+  }
+
   if (setjmp(cm->error.jmp)) {
     cm->error.setjmp = 0;
 
@@ -283,19 +321,38 @@
 
   vp9_clear_system_state();
 
-  cm->last_width = cm->width;
-  cm->last_height = cm->height;
-
   if (!cm->show_existing_frame)
     cm->last_show_frame = cm->show_frame;
-  if (cm->show_frame) {
-    if (!cm->show_existing_frame)
-      vp9_swap_mi_and_prev_mi(cm);
 
-    cm->current_video_frame++;
-  }
+  // Update progress in frame parallel decode.
+  if (pbi->frame_parallel_decode) {
+    // Need to lock the mutex here as another thread may
+    // be accessing this buffer.
+    VP9Worker *const worker = pbi->frame_worker_owner;
+    FrameWorkerData *const frame_worker_data = worker->data1;
+    vp9_frameworker_lock_stats(worker);
 
-  vp9_swap_current_and_last_seg_map(cm);
+    if (cm->show_frame) {
+      if (!cm->show_existing_frame)
+        vp9_swap_mi_and_prev_mi(cm);
+      cm->current_video_frame++;
+    }
+    vp9_swap_current_and_last_seg_map(cm);
+    frame_worker_data->frame_decoded = 1;
+    frame_worker_data->frame_context_ready = 1;
+    vp9_frameworker_signal_stats(worker);
+    vp9_frameworker_unlock_stats(worker);
+  } else {
+    cm->last_width = cm->width;
+    cm->last_height = cm->height;
+    if (cm->show_frame) {
+      if (!cm->show_existing_frame)
+        vp9_swap_mi_and_prev_mi(cm);
+      cm->current_video_frame++;
+    }
+
+    vp9_swap_current_and_last_seg_map(cm);
+  }
 
   pbi->ready_for_new_data = 0;
 
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -45,6 +45,12 @@
 
   int frame_parallel_decode;  // frame-based threading.
 
+  // TODO(hkuang): Combine this with cur_buf in macroblockd as they are
+  // the same.
+  RefCntBuffer *cur_buf;   //  Current decoding frame buffer.
+  RefCntBuffer *prev_buf;  //  Previous decoding frame buffer.
+
+  VP9Worker *frame_worker_owner;   // frame_worker that owns this pbi.
   VP9Worker lf_worker;
   VP9Worker *tile_workers;
   int num_tile_workers;
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -17,6 +17,8 @@
 #include "vp9/decoder/vp9_dthread.h"
 #include "vp9/decoder/vp9_decoder.h"
 
+// #define DEBUG_THREAD
+
 #if CONFIG_MULTITHREAD
 static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
   const int kMaxTryLocks = 4000;
@@ -278,4 +280,167 @@
     // case this call will be followed by an _alloc() which may fail.
     vp9_zero(*lf_sync);
   }
+}
+
+// TODO(hkuang): Clean up all the #ifdef in this file.
+void vp9_frameworker_lock_stats(VP9Worker *const worker) {
+#if CONFIG_MULTITHREAD
+  FrameWorkerData *const worker_data = worker->data1;
+  pthread_mutex_lock(&worker_data->stats_mutex);
+#else
+  (void)worker;
+#endif
+}
+
+void vp9_frameworker_unlock_stats(VP9Worker *const worker) {
+#if CONFIG_MULTITHREAD
+  FrameWorkerData *const worker_data = worker->data1;
+  pthread_mutex_unlock(&worker_data->stats_mutex);
+#else
+  (void)worker;
+#endif
+}
+
+void vp9_frameworker_signal_stats(VP9Worker *const worker) {
+#if CONFIG_MULTITHREAD
+  FrameWorkerData *const worker_data = worker->data1;
+  // TODO(hkuang): Investigate using broadcast or signal.
+  pthread_cond_signal(&worker_data->stats_cond);
+#else
+  (void)worker;
+#endif
+}
+
+// TODO(hkuang): Remove worker parameter as it is only used in debug code.
+void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
+                          int row) {
+#if CONFIG_MULTITHREAD
+  if (!ref_buf)
+    return;
+
+  // Enabling the following line of code will get harmless tsan error but
+  // will get best performance.
+  // if (ref_buf->row >= row) return;
+
+  {
+    // Find the worker thread that owns the reference frame. If the reference
+    // frame has been fully decoded, it may not have owner.
+    VP9Worker *const ref_worker = ref_buf->frame_worker_owner;
+    FrameWorkerData *const ref_worker_data =
+        (FrameWorkerData *)ref_worker->data1;
+    const VP9Decoder *const pbi = ref_worker_data->pbi;
+
+#ifdef DEBUG_THREAD
+    {
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      printf("%d %p worker is waiting for %d %p worker (%d)  ref %d \r\n",
+             worker_data->worker_id, worker, ref_worker_data->worker_id,
+             ref_buf->frame_worker_owner, row, ref_buf->row);
+    }
+#endif
+
+    vp9_frameworker_lock_stats(ref_worker);
+    while (ref_buf->row < row && pbi->cur_buf == ref_buf) {
+      pthread_cond_wait(&ref_worker_data->stats_cond,
+                        &ref_worker_data->stats_mutex);
+    }
+    vp9_frameworker_unlock_stats(ref_worker);
+  }
+#else
+  (void)ref_buf;
+  (void)row;
+  (void)ref_buf;
+#endif  // CONFIG_MULTITHREAD
+}
+
+void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) {
+#if CONFIG_MULTITHREAD
+  VP9Worker *worker = buf->frame_worker_owner;
+
+#ifdef DEBUG_THREAD
+  printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id,
+         buf->frame_worker_owner, row);
+#endif
+
+  vp9_frameworker_lock_stats(worker);
+  buf->row = row;
+  vp9_frameworker_signal_stats(worker);
+  vp9_frameworker_unlock_stats(worker);
+#else
+  (void)buf;
+  (void)row;
+#endif  // CONFIG_MULTITHREAD
+}
+
+void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
+                                  VP9Worker *const src_worker) {
+#if CONFIG_MULTITHREAD
+  FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1;
+  FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1;
+  VP9_COMMON *const src_cm = &src_worker_data->pbi->common;
+  VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common;
+  int i;
+
+  // Wait until source frame's context is ready.
+  vp9_frameworker_lock_stats(src_worker);
+  while (!src_worker_data->frame_context_ready) {
+    pthread_cond_wait(&src_worker_data->stats_cond,
+        &src_worker_data->stats_mutex);
+  }
+
+  // src worker may have already finished decoding a frame and swapped the mi.
+  // TODO(hkuang): Remove following code after implenment no ModeInfo decoding.
+  if (src_worker_data->frame_decoded) {
+    dst_cm->prev_mip = src_cm->prev_mip;
+    dst_cm->prev_mi = src_cm->prev_mi;
+    dst_cm->prev_mi_grid_base = src_cm->prev_mi_grid_base;
+    dst_cm->prev_mi_grid_visible = src_cm->prev_mi_grid_visible;
+    dst_cm->last_frame_seg_map = src_cm->last_frame_seg_map;
+  } else {
+    dst_cm->prev_mip = src_cm->mip;
+    dst_cm->prev_mi = src_cm->mi;
+    dst_cm->prev_mi_grid_base = src_cm->mi_grid_base;
+    dst_cm->prev_mi_grid_visible = src_cm->mi_grid_visible;
+    dst_cm->last_frame_seg_map = src_cm->current_frame_seg_map;
+  }
+
+  vp9_frameworker_unlock_stats(src_worker);
+
+  dst_worker_data->pbi->prev_buf =
+      src_worker_data->pbi->common.show_existing_frame ?
+          NULL : src_worker_data->pbi->cur_buf;
+
+  dst_cm->last_width = !src_cm->show_existing_frame ?
+                       src_cm->width : src_cm->last_width;
+  dst_cm->last_height = !src_cm->show_existing_frame ?
+                        src_cm->height : src_cm->last_height;
+  dst_cm->display_width = src_cm->display_width;
+  dst_cm->display_height = src_cm->display_height;
+  dst_cm->subsampling_x = src_cm->subsampling_x;
+  dst_cm->subsampling_y = src_cm->subsampling_y;
+  dst_cm->last_show_frame = !src_cm->show_existing_frame ?
+                            src_cm->show_frame : src_cm->last_show_frame;
+  dst_cm->last_frame_type = src_cm->last_frame_type;
+  dst_cm->frame_type = src_cm->frame_type;
+  dst_cm->y_dc_delta_q = src_cm->y_dc_delta_q;
+  dst_cm->uv_dc_delta_q = src_cm->uv_dc_delta_q;
+  dst_cm->uv_ac_delta_q = src_cm->uv_ac_delta_q;
+  dst_cm->base_qindex = src_cm->base_qindex;
+
+  for (i = 0; i < REF_FRAMES; ++i)
+    dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i];
+
+  memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr,
+         (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
+  dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
+  dst_cm->lf.filter_level = src_cm->lf.filter_level;
+  memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS);
+  memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+  dst_cm->seg = src_cm->seg;
+  memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
+         FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0]));
+#else
+  (void) dst_worker;
+  (void) src_worker;
+#endif  // CONFIG_MULTITHREAD
 }
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h
@@ -19,7 +19,7 @@
 struct VP9Decoder;
 
 typedef struct TileWorkerData {
-  struct VP9Common *cm;
+  struct VP9Decoder *pbi;
   vp9_reader bit_reader;
   DECLARE_ALIGNED(16, struct macroblockd, xd);
 
@@ -55,6 +55,14 @@
   // It is used to make a copy of the compressed data.
   uint8_t *scratch_buffer;
   size_t scratch_buffer_size;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t stats_mutex;
+  pthread_cond_t stats_cond;
+#endif
+
+  int frame_context_ready;  // Current frame's context is ready to read.
+  int frame_decoded;        // Finished decoding current frame.
 } FrameWorkerData;
 
 // Allocate memory for loopfilter row synchronization.
@@ -70,5 +78,24 @@
                               struct VP9Common *cm,
                               int frame_filter_level,
                               int y_only);
+
+void vp9_frameworker_lock_stats(VP9Worker *const worker);
+void vp9_frameworker_unlock_stats(VP9Worker *const worker);
+void vp9_frameworker_signal_stats(VP9Worker *const worker);
+
+// Wait until ref_buf has been decoded to row in real pixel unit.
+// Note: worker may already finish decoding ref_buf and release it in order to
+// start decoding next frame. So need to check whether worker is still decoding
+// ref_buf.
+void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
+                          int row);
+
+// FrameWorker broadcasts its decoding progress so other workers that are
+// waiting on it can resume decoding.
+void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row);
+
+// Copy necessary decoding context from src worker to dst worker.
+void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
+                                  VP9Worker *const src_worker);
 
 #endif  // VP9_DECODER_VP9_DTHREAD_H_
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -484,7 +484,7 @@
 
       if (cm->coding_use_prev_mi)
         vp9_find_mv_refs(cm, xd, tile, xd->mi[0], ref_frame,
-                         candidates, mi_row, mi_col);
+                         candidates, mi_row, mi_col, NULL, NULL);
       else
         const_motion[ref_frame] = mv_refs_rt(cm, xd, tile, xd->mi[0],
                                              ref_frame, candidates,
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2253,7 +2253,8 @@
   vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
 
   // Gets an initial list of candidate vectors from neighbours and orders them
-  vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col);
+  vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col,
+                   NULL, NULL);
 
   // Candidate refinement carried out at encoder and decoder
   vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -18,6 +18,7 @@
 #include "vpx/vpx_decoder.h"
 
 #include "vp9/common/vp9_frame_buffers.h"
+#include "vp9/common/vp9_thread.h"
 
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
@@ -28,6 +29,15 @@
 
 typedef vpx_codec_stream_info_t vp9_stream_info_t;
 
+// This limit is due to framebuffer numbers.
+// TODO(hkuang): Remove this limit after implementing ondemand framebuffers.
+#define FRAME_CACHE_SIZE 6   // Cache maximum 6 decoded frames.
+
+typedef struct cache_frame {
+  int fb_idx;
+  vpx_image_t img;
+} cache_frame;
+
 struct vpx_codec_alg_priv {
   vpx_codec_priv_t        base;
   vpx_codec_dec_cfg_t     cfg;
@@ -35,17 +45,24 @@
   int                     postproc_cfg_set;
   vp8_postproc_cfg_t      postproc_cfg;
   vpx_decrypt_cb          decrypt_cb;
-  void                   *decrypt_state;
+  void                    *decrypt_state;
   vpx_image_t             img;
   int                     flushed;
   int                     invert_tile_order;
-  int                     frame_parallel_decode;  // frame-based threading.
   int                     last_show_frame;  // Index of last output frame.
 
+  // Frame parallel related.
+  int                     frame_parallel_decode;  // frame-based threading.
   VP9Worker               *frame_workers;
   int                     num_frame_workers;
-  int                     next_submit_thread_id;
-  int                     next_output_thread_id;
+  int                     next_submit_worker_id;
+  int                     last_submit_worker_id;
+  int                     next_output_worker_id;
+  int                     available_threads;
+  cache_frame             frame_cache[FRAME_CACHE_SIZE];
+  int                     frame_cache_write;
+  int                     frame_cache_read;
+  int                     num_cache_frames;
 
   // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
   BufferPool              *buffer_pool;
@@ -77,12 +94,11 @@
     ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
     ctx->priv->init_flags = ctx->init_flags;
     ctx->priv->alg_priv->flushed = 0;
+    // Only do frame parallel decode when threads > 1.
     ctx->priv->alg_priv->frame_parallel_decode =
-        (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
+        ((ctx->config.dec->threads > 1) &&
+         (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING)) ? 1 : 0;
 
-    // Disable frame parallel decoding for now.
-    ctx->priv->alg_priv->frame_parallel_decode = 0;
-
     if (ctx->config.dec) {
       // Update the reference to the config structure to an internal copy.
       ctx->priv->alg_priv->cfg = *ctx->config.dec;
@@ -98,10 +114,21 @@
     int i;
     for (i = 0; i < ctx->num_frame_workers; ++i) {
       VP9Worker *const worker = &ctx->frame_workers[i];
-      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
-      vp9_decoder_remove(worker_data->pbi);
-      vpx_free(worker_data);
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      vp9_get_worker_interface()->end(worker);
+      vp9_decoder_remove(frame_worker_data->pbi);
+      vpx_free(frame_worker_data->scratch_buffer);
+#if CONFIG_MULTITHREAD
+      pthread_mutex_destroy(&frame_worker_data->stats_mutex);
+      pthread_cond_destroy(&frame_worker_data->stats_cond);
+#endif
+      vpx_free(frame_worker_data);
     }
+#if CONFIG_MULTITHREAD
+    pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+    vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
   }
 
   vpx_free(ctx->frame_workers);
@@ -222,8 +249,8 @@
 
   for (i = 0; i < ctx->num_frame_workers; ++i) {
     VP9Worker *const worker = &ctx->frame_workers[i];
-    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
-    VP9_COMMON *const cm = &worker_data->pbi->common;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    VP9_COMMON *const cm = &frame_worker_data->pbi->common;
     BufferPool *const pool = cm->buffer_pool;
 
     cm->new_fb_idx = -1;
@@ -260,14 +287,15 @@
 }
 
 static int frame_worker_hook(void *arg1, void *arg2) {
-  FrameWorkerData *const worker_data = (FrameWorkerData *)arg1;
-  const uint8_t *data = worker_data->data;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
+  const uint8_t *data = frame_worker_data->data;
   (void)arg2;
-  worker_data->result = vp9_receive_compressed_data(worker_data->pbi,
-                                                    worker_data->data_size,
-                                                    &data);
-  worker_data->data_end = data;
-  return !worker_data->result;
+  frame_worker_data->result =
+      vp9_receive_compressed_data(frame_worker_data->pbi,
+                                  frame_worker_data->data_size,
+                                  &data);
+  frame_worker_data->data_end = data;
+  return !frame_worker_data->result;
 }
 
 static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
@@ -275,14 +303,28 @@
   const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
 
   ctx->last_show_frame = -1;
-  ctx->next_submit_thread_id = 0;
-  ctx->next_output_thread_id = 0;
+  ctx->next_submit_worker_id = 0;
+  ctx->last_submit_worker_id = 0;
+  ctx->next_output_worker_id = 0;
+  ctx->frame_cache_read = 0;
+  ctx->frame_cache_write = 0;
+  ctx->num_cache_frames = 0;
   ctx->num_frame_workers =
       (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads: 1;
+  ctx->available_threads = ctx->num_frame_workers;
+  ctx->flushed = 0;
+
   ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
   if (ctx->buffer_pool == NULL)
     return VPX_CODEC_MEM_ERROR;
 
+#if CONFIG_MULTITHREAD
+    if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
+      set_error_detail(ctx, "Failed to allocate buffer pool mutex");
+      return VPX_CODEC_MEM_ERROR;
+    }
+#endif
+
   ctx->frame_workers = (VP9Worker *)
       vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers));
   if (ctx->frame_workers == NULL) {
@@ -292,28 +334,51 @@
 
   for (i = 0; i < ctx->num_frame_workers; ++i) {
     VP9Worker *const worker = &ctx->frame_workers[i];
-    FrameWorkerData *worker_data = NULL;
+    FrameWorkerData *frame_worker_data = NULL;
     winterface->init(worker);
     worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
     if (worker->data1 == NULL) {
-      set_error_detail(ctx, "Failed to allocate worker_data");
+      set_error_detail(ctx, "Failed to allocate frame_worker_data");
       return VPX_CODEC_MEM_ERROR;
     }
-    worker_data = (FrameWorkerData *)worker->data1;
-    worker_data->pbi = vp9_decoder_create(ctx->buffer_pool);
-    if (worker_data->pbi == NULL) {
-      set_error_detail(ctx, "Failed to allocate worker_data");
+    frame_worker_data = (FrameWorkerData *)worker->data1;
+    frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool);
+    if (frame_worker_data->pbi == NULL) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data");
       return VPX_CODEC_MEM_ERROR;
     }
+    frame_worker_data->pbi->frame_worker_owner = worker;
+    frame_worker_data->pbi->common.mi_idx = 0;
+    frame_worker_data->pbi->common.prev_mi_idx = 1;
+    frame_worker_data->worker_id = i;
+    frame_worker_data->scratch_buffer = NULL;
+    frame_worker_data->scratch_buffer_size = 0;
+    frame_worker_data->frame_context_ready = 0;
+#if CONFIG_MULTITHREAD
+    if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data mutex");
+      return VPX_CODEC_MEM_ERROR;
+    }
 
+    if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data cond");
+      return VPX_CODEC_MEM_ERROR;
+    }
+#endif
     // If decoding in serial mode, FrameWorker thread could create tile worker
     // thread or loopfilter thread.
-    worker_data->pbi->max_threads =
+    frame_worker_data->pbi->max_threads =
         (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
 
-    worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
-    worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
+    frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
+    frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
+    frame_worker_data->pbi->common.frame_parallel_decode =
+        ctx->frame_parallel_decode;
     worker->hook = (VP9WorkerHook)frame_worker_hook;
+    if (!winterface->reset(worker)) {
+      set_error_detail(ctx, "Frame Worker thread creation failed");
+      return VPX_CODEC_MEM_ERROR;
+    }
   }
 
   // If postprocessing was enabled by the application and a
@@ -348,36 +413,66 @@
       return VPX_CODEC_ERROR;
   }
 
-  // Initialize the decoder workers on the first frame
-  if (ctx->frame_workers == NULL) {
-    const vpx_codec_err_t res = init_decoder(ctx);
-    if (res != VPX_CODEC_OK)
-      return res;
-  }
-
   if (!ctx->frame_parallel_decode) {
     VP9Worker *const worker = ctx->frame_workers;
-    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
-    worker_data->data = *data;
-    worker_data->data_size = data_sz;
-    worker_data->user_priv = user_priv;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    frame_worker_data->data = *data;
+    frame_worker_data->data_size = data_sz;
+    frame_worker_data->user_priv = user_priv;
 
     // Set these even if already initialized.  The caller may have changed the
     // decrypt config between frames.
-    worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
-    worker_data->pbi->decrypt_state = ctx->decrypt_state;
+    frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
+    frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
 
     worker->had_error = 0;
     winterface->execute(worker);
 
     // Update data pointer after decode.
-    *data = worker_data->data_end;
+    *data = frame_worker_data->data_end;
 
     if (worker->had_error)
-      return update_error_state(ctx, &worker_data->pbi->common.error);
+      return update_error_state(ctx, &frame_worker_data->pbi->common.error);
   } else {
-    // TODO(hkuang): Implement frame parallel decode.
-    return VPX_CODEC_INCAPABLE;
+    const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+    VP9Worker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    // Copy context from last worker thread to next worker thread.
+    if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
+      vp9_frameworker_copy_context(
+          &ctx->frame_workers[ctx->next_submit_worker_id],
+          &ctx->frame_workers[ctx->last_submit_worker_id]);
+
+    // Copy the compressed data into worker's internal buffer.
+    // TODO(hkuang): Will all the workers allocate the same size
+    // as the size of the first intra frame be better? This will
+    // avoid too many deallocate and allocate.
+    if (frame_worker_data->scratch_buffer_size < data_sz) {
+      frame_worker_data->scratch_buffer =
+          (uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz);
+      if (frame_worker_data->scratch_buffer == NULL) {
+        set_error_detail(ctx, "Failed to reallocate scratch buffer");
+        return VPX_CODEC_MEM_ERROR;
+      }
+      frame_worker_data->scratch_buffer_size = data_sz;
+    }
+    frame_worker_data->data_size = data_sz;
+    vpx_memcpy(frame_worker_data->scratch_buffer, *data, data_sz);
+
+    frame_worker_data->frame_decoded = 0;
+    frame_worker_data->frame_context_ready = 0;
+    frame_worker_data->data = frame_worker_data->scratch_buffer;
+    frame_worker_data->user_priv = user_priv;
+
+    if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
+      ctx->last_submit_worker_id =
+          (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers;
+
+    ctx->next_submit_worker_id =
+        (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers;
+
+    --ctx->available_threads;
+    winterface->launch(worker);
   }
 
   if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
@@ -461,6 +556,30 @@
   return VPX_CODEC_OK;
 }
 
+static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) {
+  YV12_BUFFER_CONFIG sd;
+  vp9_ppflags_t flags = {0, 0, 0};
+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+  VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  ctx->next_output_worker_id =
+      (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+  winterface->sync(worker);
+  ++ctx->available_threads;
+  if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
+    VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+    ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx;
+    yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd,
+                    frame_worker_data->user_priv);
+    ctx->frame_cache[ctx->frame_cache_write].img.fb_priv =
+        frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+    ctx->frame_cache_write =
+        (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE;
+    ++ctx->num_cache_frames;
+  }
+}
+
 static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
                                       const uint8_t *data, unsigned int data_sz,
                                       void *user_priv, long deadline) {
@@ -478,6 +597,13 @@
   // Reset flushed when receiving a valid frame.
   ctx->flushed = 0;
 
+  // Initialize the decoder workers on the first frame.
+  if (ctx->frame_workers == NULL) {
+    const vpx_codec_err_t res = init_decoder(ctx);
+    if (res != VPX_CODEC_OK)
+      return res;
+  }
+
   res = parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
                                ctx->decrypt_cb, ctx->decrypt_state);
   if (res != VPX_CODEC_OK)
@@ -494,7 +620,6 @@
       for (i = 0; i < frame_count; ++i) {
         const uint8_t *data_start_copy = data_start;
         const uint32_t frame_size = frame_sizes[i];
-        vpx_codec_err_t res;
         if (data_start < data
             || frame_size > (uint32_t) (data_end - data_start)) {
           set_error_detail(ctx, "Invalid frame size in index");
@@ -501,23 +626,40 @@
           return VPX_CODEC_CORRUPT_FRAME;
         }
 
+        if (ctx->available_threads == 0) {
+          // No more threads for decoding. Wait until the next output worker
+          // finishes decoding. Then copy the decoded frame into cache.
+          if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+            wait_worker_and_cache_frame(ctx);
+          } else {
+            // TODO(hkuang): Add unit test to test this path.
+            set_error_detail(ctx, "Frame output cache is full.");
+            return VPX_CODEC_ERROR;
+          }
+        }
+
         res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
                          deadline);
         if (res != VPX_CODEC_OK)
           return res;
-
         data_start += frame_size;
       }
     } else {
-      res = decode_one(ctx, &data_start, data_sz, user_priv, deadline);
+      if (ctx->available_threads == 0) {
+        // No more threads for decoding. Wait until the next output worker
+        // finishes decoding. Then copy the decoded frame into cache.
+        if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+          wait_worker_and_cache_frame(ctx);
+        } else {
+          // TODO(hkuang): Add unit test to test this path.
+          set_error_detail(ctx, "Frame output cache is full.");
+          return VPX_CODEC_ERROR;
+        }
+      }
+
+      res = decode_one(ctx, &data, data_sz, user_priv, deadline);
       if (res != VPX_CODEC_OK)
         return res;
-
-      // Extra data detected after the frame.
-      if (data_start < data_end - 1) {
-        set_error_detail(ctx, "Fail to decode frame in parallel mode");
-        return VPX_CODEC_INCAPABLE;
-      }
     }
   } else {
     // Decode in serial mode.
@@ -561,41 +703,73 @@
     }
   }
 
-  return VPX_CODEC_OK;
+  return res;
 }
 
+static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) {
+  RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs;
+  // Decrease reference count of last output frame in frame parallel mode.
+  if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
+    BufferPool *const pool = ctx->buffer_pool;
+    lock_buffer_pool(pool);
+    --frame_bufs[ctx->last_show_frame].ref_count;
+    if (frame_bufs[ctx->last_show_frame].ref_count == 0) {
+      pool->release_fb_cb(pool->cb_priv,
+                          &frame_bufs[ctx->last_show_frame].raw_frame_buffer);
+    }
+    unlock_buffer_pool(pool);
+  }
+}
+
 static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
                                       vpx_codec_iter_t *iter) {
   vpx_image_t *img = NULL;
 
+  // Only return frame when all the cpu are busy or
+  // application fluhsed the decoder in frame parallel decode.
+  if (ctx->frame_parallel_decode && ctx->available_threads > 0 &&
+      !ctx->flushed) {
+    return img;
+  }
+
+  // Output the frames in the cache first.
+  if (ctx->num_cache_frames > 0) {
+    release_last_output_frame(ctx);
+    ctx->last_show_frame  = ctx->frame_cache[ctx->frame_cache_read].fb_idx;
+    img = &ctx->frame_cache[ctx->frame_cache_read].img;
+    ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE;
+    --ctx->num_cache_frames;
+    return img;
+  }
+
   // iter acts as a flip flop, so an image is only returned on the first
   // call to get_frame.
   if (*iter == NULL && ctx->frame_workers != NULL) {
-    YV12_BUFFER_CONFIG sd;
-    vp9_ppflags_t flags = {0, 0, 0};
-
-    VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_thread_id];
-    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
-    if (vp9_get_raw_frame(worker_data->pbi, &sd, &flags) == 0) {
-      VP9_COMMON *const cm = &worker_data->pbi->common;
-      BufferPool *const pool = cm->buffer_pool;
-      RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
-      yuvconfig2image(&ctx->img, &sd, worker_data->user_priv);
-      ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
-      img = &ctx->img;
-      *iter = img;
-      // Decrease reference count of last output frame in frame parallel mode.
-      if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
-        --frame_bufs[ctx->last_show_frame].ref_count;
-        if (frame_bufs[ctx->last_show_frame].ref_count == 0) {
-          pool->release_fb_cb(pool->cb_priv,
-              &frame_bufs[ctx->last_show_frame].raw_frame_buffer);
-        }
+    do {
+      YV12_BUFFER_CONFIG sd;
+      vp9_ppflags_t flags = {0, 0, 0};
+      const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+      VP9Worker *const worker =
+          &ctx->frame_workers[ctx->next_output_worker_id];
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      ctx->next_output_worker_id =
+          (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+      // Wait for the frame from worker thread.
+      winterface->sync(worker);
+      ++ctx->available_threads;
+      if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
+        VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+        RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+        release_last_output_frame(ctx);
+        ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx;
+        yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
+        ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+        img = &ctx->img;
+        return img;
       }
-      ctx->last_show_frame = worker_data->pbi->common.new_fb_idx;
-    }
+    } while (ctx->next_output_worker_id != ctx->next_submit_worker_id);
   }
-
   return img;
 }
 
@@ -631,9 +805,9 @@
     vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data;
     YV12_BUFFER_CONFIG sd;
     VP9Worker *const worker = ctx->frame_workers;
-    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
-    return vp9_set_reference_dec(&worker_data->pbi->common,
+    return vp9_set_reference_dec(&frame_worker_data->pbi->common,
                                  (VP9_REFFRAME)frame->frame_type, &sd);
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -654,9 +828,9 @@
     vpx_ref_frame_t *frame = (vpx_ref_frame_t *) data;
     YV12_BUFFER_CONFIG sd;
     VP9Worker *const worker = ctx->frame_workers;
-    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
-    return vp9_copy_reference_dec(worker_data->pbi,
+    return vp9_copy_reference_dec(frame_worker_data->pbi,
                                   (VP9_REFFRAME)frame->frame_type, &sd);
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -676,8 +850,8 @@
   if (data) {
     YV12_BUFFER_CONFIG* fb;
     VP9Worker *const worker = ctx->frame_workers;
-    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
-    vp9_get_reference_dec(worker_data->pbi, data->idx, &fb);
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    vp9_get_reference_dec(frame_worker_data->pbi, data->idx, &fb);
     yuvconfig2image(&data->img, fb, NULL);
     return VPX_CODEC_OK;
   } else {
@@ -724,8 +898,9 @@
   if (update_info) {
     if (ctx->frame_workers) {
       VP9Worker *const worker = ctx->frame_workers;
-      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
-      *update_info = worker_data->pbi->refresh_frame_flags;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      *update_info = frame_worker_data->pbi->refresh_frame_flags;
     } else {
       return VPX_CODEC_ERROR;
     }
@@ -735,22 +910,18 @@
   }
 }
 
-
 static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
                                                 va_list args) {
   int *corrupted = va_arg(args, int *);
 
-  // Only support this function in serial decode.
-  if (ctx->frame_parallel_decode) {
-    set_error_detail(ctx, "Not supported in frame parallel decode");
-    return VPX_CODEC_INCAPABLE;
-  }
-
   if (corrupted) {
     if (ctx->frame_workers) {
       VP9Worker *const worker = ctx->frame_workers;
-      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
-      *corrupted = worker_data->pbi->common.frame_to_show->corrupted;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      RefCntBuffer *const frame_bufs =
+          frame_worker_data->pbi->common.buffer_pool->frame_bufs;
+      *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
     } else {
       return VPX_CODEC_ERROR;
     }
@@ -773,8 +944,9 @@
   if (display_size) {
     if (ctx->frame_workers) {
       VP9Worker *const worker = ctx->frame_workers;
-      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
-      const VP9_COMMON *const cm = &worker_data->pbi->common;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
       display_size[0] = cm->display_width;
       display_size[1] = cm->display_height;
     } else {
--- a/vpx/vpx_frame_buffer.h
+++ b/vpx/vpx_frame_buffer.h
@@ -22,8 +22,11 @@
 #include "./vpx_integer.h"
 
 /*!\brief The maximum number of work buffers used by libvpx.
+ *  Support maximum 4 threads to decode video in parallel.
+ *  Each thread will use one work buffer.
+ * TODO(hkuang): Add support to set number of worker threads dynamically.
  */
-#define VPX_MAXIMUM_WORK_BUFFERS 1
+#define VPX_MAXIMUM_WORK_BUFFERS 4
 
 /*!\brief The maximum number of reference buffers that a VP9 encoder may use.
  */