shithub: openh264

Download patch

ref: b0f95c47c9df38613cedd5ea00a117fb8af20339
parent: 8b881445bd17d7ced5c072601f0cd8a185bd3368
author: xiaotiansf <xiaotianshimail@gmail.com>
date: Mon Oct 28 08:35:15 EDT 2019

Mainly update DecodeCurrentAccessUnit and its associated functions to support threaded parallel frame decoding.

Add the function PrefetchLastPicForThread and additional variables for better control of threaded decoding in the case of multi-slice pictures in which each slice group contains one slice.

--- a/codec/decoder/core/inc/decoder_context.h
+++ b/codec/decoder/core/inc/decoder_context.h
@@ -277,6 +277,7 @@
   PPicture          pPreviousDecodedPictureInDpb; //pointer to previously decoded picture in DPB for error concealment
   int32_t           iPrevFrameNum;// frame number of previous frame well decoded for non-truncated mode yet
   bool              bLastHasMmco5;
+  uint32_t          uiDecodingTimeStamp; //represent relative decoding time stamps
 } SWelsLastDecPicInfo, *PWelsLastDecPicInfo;
 
 typedef struct tagPictInfo {
@@ -538,6 +539,8 @@
   PPicture pDec;
   SWelsDecEvent sImageReady;
   SWelsDecEvent sSliceDecodeStart;
+  SWelsDecEvent sSliceDecodeFinsh;
+  int32_t       iPicBuffIdx; //picBuff Index
 } SWelsDecoderThreadCTX, *PWelsDecoderThreadCTX;
 
 static inline void ResetActiveSPSForEachLayer (PWelsDecoderContext pCtx) {
--- a/codec/decoder/core/inc/pic_queue.h
+++ b/codec/decoder/core/inc/pic_queue.h
@@ -54,6 +54,8 @@
 
 PPicture PrefetchPic (PPicBuff pPicBuff);  // To get current node applicable
 PPicture PrefetchPicForThread (PPicBuff pPicBuff); // To get current node applicable in the case of threaded mode
+PPicture PrefetchLastPicForThread (PPicBuff pPicBuff,
+                                   const int32_t& iLast); // To get last node applicable in the case of threaded mode
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/picture.h
+++ b/codec/decoder/core/inc/picture.h
@@ -89,6 +89,7 @@
   uint32_t    uiDecodingTimeStamp; //represent relative decoding time stamps
   int32_t     iPicBuffIdx;
   EWelsSliceType  eSliceType;
+  bool        bIsUngroupedMultiSlice; //multi-slice picture with each slice group containing one slice.
   bool bNewSeqBegin;
   int32_t iMbEcedNum;
   int32_t iMbEcedPropNum;
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -1736,9 +1736,8 @@
     if (!pCurDqLayer->pMbCorrectlyDecodedFlag[iNextMbXyIndex]) { //already con-ed, overwrite
       pCurDqLayer->pMbCorrectlyDecodedFlag[iNextMbXyIndex] = true;
       pCtx->pDec->iMbEcedPropNum += (pCurDqLayer->pMbRefConcealedFlag[iNextMbXyIndex] ? 1 : 0);
-      ++pCtx->iTotalNumMbRec;
     }
-
+    ++pCtx->iTotalNumMbRec;
     if (pCtx->iTotalNumMbRec > iTotalMbTargetLayer) {
       WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING,
                "WelsTargetSliceConstruction():::pCtx->iTotalNumMbRec:%d, iTotalMbTargetLayer:%d",
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -393,6 +393,7 @@
   sLastDecPicInfo.pPreviousDecodedPictureInDpb = NULL;
   sLastDecPicInfo.iPrevFrameNum = -1;
   sLastDecPicInfo.bLastHasMmco5 = false;
+  sLastDecPicInfo.uiDecodingTimeStamp = 0;
 }
 
 /*!
@@ -437,6 +438,9 @@
     iNumRefFrames = MAX_REF_PIC_COUNT + 2;
   } else {
     iNumRefFrames = pCtx->pSps->iNumRefFrames + 2;
+    if (pCtx->pThreadCtx != NULL) {
+      iNumRefFrames = MAX_REF_PIC_COUNT + 1;
+    }
   }
 
 #ifdef LONG_TERM_REF
@@ -478,7 +482,9 @@
                          && kiPicHeight == pCtx->iImgHeightInPixel) && (!bNeedChangePicQueue)) // have same scaled buffer
 
   // sync update pRefList
-  WelsResetRefPic (pCtx); // added to sync update ref list due to pictures are free
+  if (pCtx->pThreadCtx == NULL) {
+    WelsResetRefPic (pCtx); // added to sync update ref list due to pictures are free
+  }
 
   if (pCtx->bHaveGotMemory && (kiPicWidth == pCtx->iImgWidthInPixel && kiPicHeight == pCtx->iImgHeightInPixel)
       && pCtx->pPicBuff != NULL && pCtx->pPicBuff->iCapacity != iPicQueueSize) {
@@ -554,6 +560,17 @@
   if (NULL != pPicBuff && NULL != *pPicBuff) {
     DestroyPicBuff (pCtx, pPicBuff, pMa);
   }
+  if (pCtx->pThreadCtx != NULL) {
+    //prevent from double destruction of PPicBuff
+    PWelsDecoderThreadCTX pThreadCtx = (PWelsDecoderThreadCTX) (pCtx->pThreadCtx);
+    int32_t threadCount = pThreadCtx->sThreadInfo.uiThrMaxNum;
+    int32_t  id = pThreadCtx->sThreadInfo.uiThrNum;
+    for (int32_t i = 0; i < threadCount; ++i) {
+      if (pThreadCtx[i - id].pCtx != NULL) {
+        pThreadCtx[i - id].pCtx->pPicBuff = NULL;
+      }
+    }
+  }
 
   if (pCtx->pTempDec) {
     FreePicture (pCtx->pTempDec, pCtx->pMemAlign);
@@ -796,7 +813,11 @@
             }
             CheckAndFinishLastPic (pCtx, ppDst, pDstBufInfo);
             if (pCtx->bAuReadyFlag && pCtx->pAccessUnitList->uiAvailUnitsNum != 0) {
-              ConstructAccessUnit (pCtx, ppDst, pDstBufInfo);
+              if (pCtx->pThreadCtx == NULL) {
+                ConstructAccessUnit (pCtx, ppDst, pDstBufInfo);
+              } else {
+                pCtx->pAccessUnitList->uiAvailUnitsNum = 1;
+              }
             }
           }
           DecodeFinishUpdate (pCtx);
@@ -852,9 +873,15 @@
       if (IS_PARAM_SETS_NALS (pCtx->sCurNalHead.eNalUnitType)) {
         iRet = ParseNonVclNal (pCtx, pNalPayload, iDstIdx - iConsumedBytes, pSrcNal - 3, iSrcIdx + 3);
       }
-      CheckAndFinishLastPic (pCtx, ppDst, pDstBufInfo);
+      if (pCtx->pThreadCtx == NULL) {
+        CheckAndFinishLastPic (pCtx, ppDst, pDstBufInfo);
+      }
       if (pCtx->bAuReadyFlag && pCtx->pAccessUnitList->uiAvailUnitsNum != 0) {
-        ConstructAccessUnit (pCtx, ppDst, pDstBufInfo);
+        if (pCtx->pThreadCtx == NULL) {
+          ConstructAccessUnit (pCtx, ppDst, pDstBufInfo);
+        } else {
+          pCtx->pAccessUnitList->uiAvailUnitsNum = 1;
+        }
       }
     }
     DecodeFinishUpdate (pCtx);
--- a/codec/decoder/core/src/decoder_core.cpp
+++ b/codec/decoder/core/src/decoder_core.cpp
@@ -194,8 +194,9 @@
              "DecodeFrameConstruction(): iTotalNumMbRec:%d, total_num_mb_sps:%d, cur_layer_mb_width:%d, cur_layer_mb_height:%d ",
              pCtx->iTotalNumMbRec, kiTotalNumMbInCurLayer, pCurDq->iMbWidth, pCurDq->iMbHeight);
     bFrameCompleteFlag = false; //return later after output buffer is done
-    if (pCtx->bInstantDecFlag) //no-delay decoding, wait for new slice
+    if (pCtx->bInstantDecFlag) { //no-delay decoding, wait for new slice
       return ERR_INFO_MB_NUM_INADEQUATE;
+    }
   } else if (pCurDq->sLayerInfo.sNalHeaderExt.bIdrFlag
              && (pCtx->iErrorCode == dsErrorFree)) { //complete non-ECed IDR frame done
     pCtx->pDec->bIsComplete = true;
@@ -220,9 +221,26 @@
   ppDst[1] = ppDst[1] + pCtx->sFrameCrop.iTopOffset  * pPic->iLinesize[1] + pCtx->sFrameCrop.iLeftOffset;
   ppDst[2] = ppDst[2] + pCtx->sFrameCrop.iTopOffset  * pPic->iLinesize[1] + pCtx->sFrameCrop.iLeftOffset;
   pDstInfo->iBufferStatus = 1;
-
-  bool bOutResChange = (pCtx->iLastImgWidthInPixel != pDstInfo->UsrData.sSystemBuffer.iWidth)
-                       || (pCtx->iLastImgHeightInPixel != pDstInfo->UsrData.sSystemBuffer.iHeight);
+  if (pCtx->pThreadCtx != NULL && pPic->bIsComplete == false) {
+    pPic->bIsComplete = true;
+  }
+  if (pCtx->pThreadCtx != NULL) {
+    uint32_t uiMbHeight = (pCtx->pDec->iHeightInPixel + 15) >> 4;
+    for (uint32_t i = 0; i < uiMbHeight; ++i) {
+      SET_EVENT (&pCtx->pDec->pReadyEvent[i]);
+    }
+  }
+  bool bOutResChange = false;
+  if (pCtx->pThreadCtx == NULL || pCtx->pLastThreadCtx == NULL) {
+    bOutResChange = (pCtx->iLastImgWidthInPixel != pDstInfo->UsrData.sSystemBuffer.iWidth)
+                    || (pCtx->iLastImgHeightInPixel != pDstInfo->UsrData.sSystemBuffer.iHeight);
+  } else {
+    if (pCtx->pLastThreadCtx != NULL) {
+      PWelsDecoderThreadCTX pLastThreadCtx = (PWelsDecoderThreadCTX) (pCtx->pLastThreadCtx);
+      bOutResChange = (pLastThreadCtx->pCtx->iLastImgWidthInPixel != pDstInfo->UsrData.sSystemBuffer.iWidth)
+                      || (pLastThreadCtx->pCtx->iLastImgHeightInPixel != pDstInfo->UsrData.sSystemBuffer.iHeight);
+    }
+  }
   pCtx->iLastImgWidthInPixel = pDstInfo->UsrData.sSystemBuffer.iWidth;
   pCtx->iLastImgHeightInPixel = pDstInfo->UsrData.sSystemBuffer.iHeight;
   if (pCtx->pParam->eEcActiveIdc == ERROR_CON_DISABLE) //no buffer output if EC is disabled and frame incomplete
@@ -846,8 +864,9 @@
  *  Parse slice header of bitstream in avc for storing data structure
  */
 int32_t ParseSliceHeaderSyntaxs (PWelsDecoderContext pCtx, PBitStringAux pBs, const bool kbExtensionFlag) {
-  PNalUnit const kpCurNal               = pCtx->pAccessUnitList->pNalUnitsList[pCtx->pAccessUnitList->uiAvailUnitsNum -
-                                                                                 1];
+  PNalUnit const kpCurNal               =
+    pCtx->pAccessUnitList->pNalUnitsList[pCtx->pAccessUnitList->uiAvailUnitsNum -
+                                                                                1];
 
   PNalUnitHeaderExt pNalHeaderExt       = NULL;
   PSliceHeader pSliceHead               = NULL;
@@ -1462,7 +1481,6 @@
 
 int32_t InitialDqLayersContext (PWelsDecoderContext pCtx, const int32_t kiMaxWidth, const int32_t kiMaxHeight) {
   int32_t i = 0;
-
   WELS_VERIFY_RETURN_IF (ERR_INFO_INVALID_PARAM, (NULL == pCtx || kiMaxWidth <= 0 || kiMaxHeight <= 0))
   pCtx->sMb.iMbWidth  = (kiMaxWidth + 15) >> 4;
   pCtx->sMb.iMbHeight = (kiMaxHeight + 15) >> 4;
@@ -1508,7 +1526,8 @@
         sizeof (
           bool),
         "pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[]");
-    pCtx->sMb.pTransformSize8x8Flag[i] = (bool*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (bool),
+    pCtx->sMb.pTransformSize8x8Flag[i] = (bool*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                                           bool),
                                          "pCtx->sMb.pTransformSize8x8Flag[]");
     pCtx->sMb.pChromaQp[i] = (int8_t (*)[2])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
                                int8_t) * 2,
@@ -1519,9 +1538,11 @@
                                   int16_t) * MV_A * MB_BLOCK4x4_NUM, "pCtx->sMb.pMvd[][]");
     pCtx->sMb.pCbfDc[i] = (uint16_t*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (uint16_t),
                           "pCtx->sMb.pCbfDc[]");
-    pCtx->sMb.pNzc[i] = (int8_t (*)[24])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * 24,
+    pCtx->sMb.pNzc[i] = (int8_t (*)[24])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                          int8_t) * 24,
                         "pCtx->sMb.pNzc[]");
-    pCtx->sMb.pNzcRs[i] = (int8_t (*)[24])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * 24,
+    pCtx->sMb.pNzcRs[i] = (int8_t (*)[24])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                            int8_t) * 24,
                           "pCtx->sMb.pNzcRs[]");
     pCtx->sMb.pScaledTCoeff[i] = (int16_t (*)[MB_COEFF_LIST_SIZE])pMa->WelsMallocz (pCtx->sMb.iMbWidth *
                                  pCtx->sMb.iMbHeight *
@@ -1539,20 +1560,24 @@
                                    "pCtx->sMb.pChromaPredMode[]");
     pCtx->sMb.pCbp[i] = (int8_t*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
                         "pCtx->sMb.pCbp[]");
-    pCtx->sMb.pSubMbType[i] = (uint32_t (*)[MB_PARTITION_SIZE])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *
+    pCtx->sMb.pSubMbType[i] = (uint32_t (*)[MB_PARTITION_SIZE])pMa->WelsMallocz (pCtx->sMb.iMbWidth *
+                              pCtx->sMb.iMbHeight *
                               sizeof (
                                 uint32_t) * MB_PARTITION_SIZE, "pCtx->sMb.pSubMbType[]");
     pCtx->sMb.pSliceIdc[i] = (int32_t*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int32_t),
                              "pCtx->sMb.pSliceIdc[]"); // using int32_t for slice_idc, 4/21/2010
-    pCtx->sMb.pResidualPredFlag[i] = (int8_t*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
+    pCtx->sMb.pResidualPredFlag[i] = (int8_t*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                                       int8_t),
                                      "pCtx->sMb.pResidualPredFlag[]");
-    pCtx->sMb.pInterPredictionDoneFlag[i] = (int8_t*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
-        int8_t), "pCtx->sMb.pInterPredictionDoneFlag[]");
+    pCtx->sMb.pInterPredictionDoneFlag[i] = (int8_t*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *
+                                            sizeof (
+                                                int8_t), "pCtx->sMb.pInterPredictionDoneFlag[]");
 
     pCtx->sMb.pMbCorrectlyDecodedFlag[i] = (bool*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
         bool),
                                            "pCtx->sMb.pMbCorrectlyDecodedFlag[]");
-    pCtx->sMb.pMbRefConcealedFlag[i] = (bool*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (bool),
+    pCtx->sMb.pMbRefConcealedFlag[i] = (bool*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                                         bool),
                                        "pCtx->pMbRefConcealedFlag[]");
 
     // check memory block valid due above allocated..
@@ -1599,6 +1624,8 @@
   return ERR_NONE;
 }
 
+
+
 void UninitialDqLayersContext (PWelsDecoderContext pCtx) {
   int32_t i = 0;
   CMemoryAlign* pMa = pCtx->pMemAlign;
@@ -2307,39 +2334,18 @@
  *  0 - success; otherwise returned error_no defined in error_no.h
  */
 int32_t ConstructAccessUnit (PWelsDecoderContext pCtx, uint8_t** ppDst, SBufferInfo* pDstInfo) {
-  int32_t iErr;
-  PAccessUnit pCurAu = pCtx->pAccessUnitList;
-  pCtx->bAuReadyFlag = false;
-  pCtx->pLastDecPicInfo->bLastHasMmco5 = false;
-  bool bTmpNewSeqBegin = CheckNewSeqBeginAndUpdateActiveLayerSps (pCtx);
-  pCtx->bNewSeqBegin = pCtx->bNewSeqBegin || bTmpNewSeqBegin;
-  iErr = WelsDecodeAccessUnitStart (pCtx);
-  GetVclNalTemporalId (pCtx);
-
-  if (ERR_NONE != iErr) {
-    ForceResetCurrentAccessUnit (pCtx->pAccessUnitList);
-    if (!pCtx->pParam->bParseOnly)
-      pDstInfo->iBufferStatus = 0;
-    pCtx->bNewSeqBegin = pCtx->bNewSeqBegin || pCtx->bNextNewSeqBegin;
-    pCtx->bNextNewSeqBegin = false; // reset it
-    if (pCtx->bNewSeqBegin)
-      ResetActiveSPSForEachLayer (pCtx);
-    return iErr;
-  }
-
-  pCtx->pSps = pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pSps;
-  pCtx->pPps = pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pPps;
-
-  //try to allocate or relocate DPB memory only when new sequence is coming.
-  if (pCtx->bNewSeqBegin) {
-    WelsResetRefPic (pCtx); //clear ref pPic when IDR NAL
-    iErr = SyncPictureResolutionExt (pCtx, pCtx->pSps->iMbWidth, pCtx->pSps->iMbHeight);
-
+  int32_t iErr = ERR_NONE;
+  if (pCtx->pThreadCtx == NULL) {
+    iErr = InitConstructAccessUnit (pCtx, pDstInfo);
     if (ERR_NONE != iErr) {
-      WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING, "sync picture resolution ext failed,  the error is %d", iErr);
       return iErr;
     }
   }
+  if (pCtx->pCabacDecEngine == NULL) {
+    pCtx->pCabacDecEngine = (SWelsCabacDecEngine*)pCtx->pMemAlign->WelsMallocz (sizeof (SWelsCabacDecEngine),
+                            "pCtx->pCabacDecEngine");
+    WELS_VERIFY_RETURN_IF (ERR_INFO_OUT_OF_MEMORY, (NULL == pCtx->pCabacDecEngine))
+  }
 
   iErr = DecodeCurrentAccessUnit (pCtx, ppDst, pDstInfo);
 
@@ -2412,6 +2418,9 @@
 
 int32_t InitRefPicList (PWelsDecoderContext pCtx, const uint8_t kuiNRi, int32_t iPoc) {
   int32_t iRet = ERR_NONE;
+  if (pCtx->pThreadCtx != NULL && pCtx->bNewSeqBegin) {
+    WelsResetRefPic (pCtx);
+  }
   if (pCtx->eSliceType == B_SLICE) {
     iRet = WelsInitBSliceRefList (pCtx, iPoc);
     CreateImplicitWeightTable (pCtx);
@@ -2466,13 +2475,26 @@
  * Decode current access unit when current AU is completed.
  */
 int32_t DecodeCurrentAccessUnit (PWelsDecoderContext pCtx, uint8_t** ppDst, SBufferInfo* pDstInfo) {
-  int32_t iRefCount[LIST_A];
-  PNalUnit pNalCur = NULL;
+  PNalUnit pNalCur = pCtx->pNalCur = NULL;
   PAccessUnit pCurAu = pCtx->pAccessUnitList;
 
   int32_t iIdx = pCurAu->uiStartPos;
   int32_t iEndIdx = pCurAu->uiEndPos;
 
+  //get current thread ctx
+  PWelsDecoderThreadCTX pThreadCtx = NULL;
+  if (pCtx->pThreadCtx != NULL) {
+    pThreadCtx = (PWelsDecoderThreadCTX)pCtx->pThreadCtx;
+  }
+  //get last thread ctx
+  PWelsDecoderThreadCTX pLastThreadCtx = NULL;
+  if (pCtx->pLastThreadCtx != NULL) {
+    pLastThreadCtx = (PWelsDecoderThreadCTX) (pCtx->pLastThreadCtx);
+    if (pLastThreadCtx->pDec == NULL) {
+      pLastThreadCtx->pDec = PrefetchLastPicForThread (pCtx->pPicBuff,
+                             pLastThreadCtx->iPicBuffIdx);
+    }
+  }
   int32_t iPpsId = 0;
   int32_t iRet = ERR_NONE;
 
@@ -2487,7 +2509,7 @@
     true; // Another fresh slice comingup for given dq layer, for multiple slices in case of header parts of slices sometimes loss over error-prone channels, 8/14/2008
 
   //update pCurDqLayer at the starting of AU decoding
-  if (pCtx->bInitialDqLayersMem) {
+  if (pCtx->bInitialDqLayersMem || pCtx->pCurDqLayer == NULL) {
     pCtx->pCurDqLayer = pCtx->pDqLayersList[0];
   }
 
@@ -2500,8 +2522,71 @@
     PSliceHeaderExt pShExt = NULL;
     PSliceHeader pSh = NULL;
 
+    if (pLastThreadCtx != NULL) {
+      pSh = &pNalCur->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
+      if (pSh->iFirstMbInSlice == 0) {
+        if (pLastThreadCtx->pCtx->pDec != NULL && pLastThreadCtx->pCtx->pDec->bIsUngroupedMultiSlice) {
+          WAIT_EVENT (&pLastThreadCtx->sSliceDecodeFinsh, WELS_DEC_THREAD_WAIT_INFINITE);
+        }
+        pCtx->pDec = NULL;
+        pCtx->iTotalNumMbRec = 0;
+      } else if (pLastThreadCtx->pCtx->pDec != NULL) {
+        if (pSh->iFrameNum == pLastThreadCtx->pCtx->pDec->iFrameNum
+            && pSh->iPicOrderCntLsb == pLastThreadCtx->pCtx->pDec->iFramePoc) {
+          WAIT_EVENT (&pLastThreadCtx->sSliceDecodeFinsh, WELS_DEC_THREAD_WAIT_INFINITE);
+          pCtx->pDec = pLastThreadCtx->pCtx->pDec;
+          pCtx->pDec->bIsUngroupedMultiSlice = true;
+          pCtx->sRefPic = pLastThreadCtx->pCtx->sRefPic;
+          pCtx->iTotalNumMbRec = pLastThreadCtx->pCtx->iTotalNumMbRec;
+        }
+      }
+    }
+    bool isNewFrame = true;
+    if (pThreadCtx != NULL) {
+      isNewFrame = pCtx->pDec == NULL;
+    }
     if (pCtx->pDec == NULL) {
+      if (pLastThreadCtx != NULL) {
+        pLastThreadCtx->pDec->bUsedAsRef = pLastThreadCtx->pCtx->uiNalRefIdc > 0;
+        if (pLastThreadCtx->pDec->bUsedAsRef) {
+          for (int32_t listIdx = LIST_0; listIdx < LIST_A; ++listIdx) {
+            uint32_t i = 0;
+            while (i < MAX_DPB_COUNT && pLastThreadCtx->pCtx->sRefPic.pRefList[listIdx][i]) {
+              pLastThreadCtx->pDec->pRefPic[listIdx][i] = pLastThreadCtx->pCtx->sRefPic.pRefList[listIdx][i];
+              pLastThreadCtx->pDec->pRefPic[listIdx][i]->bAvailableFlag = false;
+              ++i;
+            }
+          }
+          pLastThreadCtx->pCtx->sTmpRefPic = pLastThreadCtx->pCtx->sRefPic;
+          WelsMarkAsRef (pLastThreadCtx->pCtx, pLastThreadCtx->pDec);
+          pCtx->sRefPic = pLastThreadCtx->pCtx->sTmpRefPic;
+        } else {
+          pCtx->sRefPic = pLastThreadCtx->pCtx->sRefPic;
+        }
+        //printf ("last uiDecodingTimeStamp = %d\n", pLastThreadCtx->pCtx->uiDecodingTimeStamp);
+        for (int32_t i = 0; i < pCtx->sRefPic.uiRefCount[LIST_0]; ++i) {
+          if (pCtx->sRefPic.pRefList[LIST_0][i] != NULL) {
+            pCtx->sRefPic.pRefList[LIST_0][i]->bAvailableFlag = false;
+          }
+        }
+        for (int32_t i = 0; i < pCtx->sRefPic.uiRefCount[LIST_1]; ++i) {
+          if (pCtx->sRefPic.pRefList[LIST_1][i] != NULL) {
+            pCtx->sRefPic.pRefList[LIST_1][i]->bAvailableFlag = false;
+          }
+        }
+      }
       pCtx->pDec = PrefetchPic (pCtx->pPicBuff);
+      if (pThreadCtx != NULL) {
+        if (pCtx->pDec != NULL) {
+          pCtx->pDec->bAvailableFlag = false;
+          pCtx->pDec->bIsUngroupedMultiSlice = false;
+          pThreadCtx->pDec = pCtx->pDec;
+          uint32_t uiMbHeight = (pCtx->pDec->iHeightInPixel + 15) >> 4;
+          for (uint32_t i = 0; i < uiMbHeight; ++i) {
+            RESET_EVENT (&pCtx->pDec->pReadyEvent[i]);
+          }
+        }
+      }
       if (pCtx->iTotalNumMbRec != 0)
         pCtx->iTotalNumMbRec = 0;
 
@@ -2519,6 +2604,9 @@
     }
     pCtx->pDec->uiTimeStamp = pNalCur->uiTimeStamp;
     pCtx->pDec->uiDecodingTimeStamp = pCtx->uiDecodingTimeStamp;
+    if (pThreadCtx != NULL) {
+      pThreadCtx->iPicBuffIdx = pCtx->pDec->iPicBuffIdx;
+    }
 
     if (pCtx->iTotalNumMbRec == 0) { //Picture start to decode
       for (int32_t i = 0; i < LAYER_NUM_EXCHANGEABLE; ++ i)
@@ -2556,6 +2644,7 @@
       pCtx->pDec->iFramePoc = pSh->iPicOrderCntLsb; // still can not obtain correct, because current do not support POCtype 2
       pCtx->pDec->bIdrFlag = pNalCur->sNalHeaderExt.bIdrFlag;
       pCtx->pDec->eSliceType = pSh->eSliceType;
+
       memcpy (&pLayerInfo.sSliceInLayer.sSliceHeaderExt, pShExt, sizeof (SSliceHeaderExt)); //confirmed_safe_unsafe_usage
       pLayerInfo.sSliceInLayer.bSliceHeaderExtFlag      = pNalCur->sNalData.sVclNal.bSliceHeaderExtFlag;
       pLayerInfo.sSliceInLayer.eSliceType               = pSh->eSliceType;
@@ -2587,11 +2676,9 @@
       bFreshSliceAvailable = (iCurrIdD != iLastIdD
                               || iCurrIdQ != iLastIdQ);        // do not need condition of (first_mb == 0) due multiple slices might be disorder
 
+
       WelsDqLayerDecodeStart (pCtx, pNalCur, pLayerInfo.pSps, pLayerInfo.pPps);
 
-      if (iCurrIdQ == BASE_QUALITY_ID) {
-        ST64 (iRefCount, LD64 (pLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.uiRefCount));
-      }
 
       if ((iLastIdD < 0) ||  //case 1: first layer
           (iLastIdD == iCurrIdD)) { //case 2: same uiDId
@@ -2601,13 +2688,23 @@
           const bool kbIdrFlag = dq_cur->sLayerInfo.sNalHeaderExt.bIdrFlag
                                  || (dq_cur->sLayerInfo.sNalHeaderExt.sNalUnitHeader.eNalUnitType == NAL_UNIT_CODED_SLICE_IDR);
           // Subclause 8.2.5.2 Decoding process for gaps in frame_num
+          int32_t iPrevFrameNum = pCtx->pLastDecPicInfo->iPrevFrameNum;
+          if (pLastThreadCtx != NULL) {
+            if (pCtx->bNewSeqBegin) {
+              iPrevFrameNum = 0;
+            } else if (pLastThreadCtx->pDec != NULL) {
+              iPrevFrameNum = pLastThreadCtx->pDec->iFrameNum;
+            } else {
+              iPrevFrameNum = pCtx->bNewSeqBegin ? 0 : pLastThreadCtx->pCtx->iFrameNum;
+            }
+          }
           if (!kbIdrFlag  &&
-              pSh->iFrameNum != pCtx->pLastDecPicInfo->iPrevFrameNum &&
-              pSh->iFrameNum != ((pCtx->pLastDecPicInfo->iPrevFrameNum + 1) & ((1 << dq_cur->sLayerInfo.pSps->uiLog2MaxFrameNum) -
+              pSh->iFrameNum != iPrevFrameNum &&
+              pSh->iFrameNum != ((iPrevFrameNum + 1) & ((1 << dq_cur->sLayerInfo.pSps->uiLog2MaxFrameNum) -
                                  1))) {
             WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING,
                      "referencing pictures lost due frame gaps exist, prev_frame_num: %d, curr_frame_num: %d",
-                     pCtx->pLastDecPicInfo->iPrevFrameNum,
+                     iPrevFrameNum,
                      pSh->iFrameNum);
 
             bAllRefComplete = false;
@@ -2623,7 +2720,7 @@
           }
         }
 
-        if (iCurrIdD == kuiDependencyIdMax && iCurrIdQ == BASE_QUALITY_ID) {
+        if (iCurrIdD == kuiDependencyIdMax && iCurrIdQ == BASE_QUALITY_ID && isNewFrame) {
           iRet = InitRefPicList (pCtx, pCtx->uiNalRefIdc, pSh->iPicOrderCntLsb);
           if (iRet) {
             pCtx->bRPLRError = true;
@@ -2643,7 +2740,13 @@
         if (pSh->eSliceType == B_SLICE && !pSh->iDirectSpatialMvPredFlag)
           ComputeColocatedTemporalScaling (pCtx);
 
-        iRet = WelsDecodeSlice (pCtx, bFreshSliceAvailable, pNalCur);
+        if (pThreadCtx != NULL) {
+          memset (&pCtx->lastReadyHeightOffset[0][0], -1, LIST_A * MAX_REF_PIC_COUNT * sizeof (int16_t));
+          SET_EVENT (&pThreadCtx->sSliceDecodeStart);
+          iRet = WelsDecodeAndConstructSlice (pCtx);
+        } else {
+          iRet = WelsDecodeSlice (pCtx, bFreshSliceAvailable, pNalCur);
+        }
 
         //Output good store_base reconstruction when enhancement quality layer occurred error for MGS key picture case
         if (iRet != ERR_NONE) {
@@ -2659,7 +2762,7 @@
           }
         }
 
-        if (bReconstructSlice) {
+        if (pThreadCtx == NULL && bReconstructSlice) {
           if ((iRet = WelsDecodeConstructSlice (pCtx, pNalCur)) != ERR_NONE) {
             pCtx->pDec->bIsComplete = false; // reconstruction error, directly set the flag false
             return iRet;
@@ -2666,10 +2769,12 @@
           }
         }
         if (bAllRefComplete && pCtx->eSliceType != I_SLICE) {
-          if (pCtx->sRefPic.uiRefCount[LIST_0] > 0) {
-            bAllRefComplete &= CheckRefPicturesComplete (pCtx);
-          } else {
-            bAllRefComplete = false;
+          if (pCtx->pThreadCtx == NULL) {
+            if (pCtx->sRefPic.uiRefCount[LIST_0] > 0) {
+              bAllRefComplete &= CheckRefPicturesComplete (pCtx);
+            } else {
+              bAllRefComplete = false;
+            }
           }
         }
       }
@@ -2721,34 +2826,49 @@
         }
       }
 
+      if (pThreadCtx != NULL && pCtx->uiDecodingTimeStamp > 1 && pCtx->pLastDecPicInfo->uiDecodingTimeStamp > 0) {
+        while (pCtx->uiDecodingTimeStamp > pCtx->pLastDecPicInfo->uiDecodingTimeStamp + 1) {
+          WelsSleep (1);
+        }
+      }
+      if (pThreadCtx != NULL) {
+        pCtx->pLastDecPicInfo->uiDecodingTimeStamp = pCtx->uiDecodingTimeStamp;
+      }
       iRet = DecodeFrameConstruction (pCtx, ppDst, pDstInfo);
-      if (iRet)
+      if (iRet) {
+        if (pThreadCtx != NULL) {
+          SET_EVENT (&pThreadCtx->sSliceDecodeFinsh);
+        }
         return iRet;
+      }
 
       pCtx->pLastDecPicInfo->pPreviousDecodedPictureInDpb = pCtx->pDec; //store latest decoded picture for EC
-      pCtx->bUsedAsRef = false;
-      if (pCtx->uiNalRefIdc > 0) {
-        pCtx->bUsedAsRef = true;
-        for (int32_t listIdx = LIST_0; listIdx < LIST_A; ++listIdx) {
-          uint32_t i = 0;
-          while (i < MAX_DPB_COUNT && pCtx->sRefPic.pRefList[listIdx][i]) {
-            pCtx->pDec->pRefPic[listIdx][i] = pCtx->sRefPic.pRefList[listIdx][i];
-            ++i;
+      pCtx->bUsedAsRef = pCtx->uiNalRefIdc > 0;
+      if (pCtx->pThreadCtx == NULL) {
+        if (pCtx->bUsedAsRef) {
+          for (int32_t listIdx = LIST_0; listIdx < LIST_A; ++listIdx) {
+            uint32_t i = 0;
+            while (i < MAX_DPB_COUNT && pCtx->sRefPic.pRefList[listIdx][i]) {
+              pCtx->pDec->pRefPic[listIdx][i] = pCtx->sRefPic.pRefList[listIdx][i];
+              ++i;
+            }
           }
-        }
-        iRet = WelsMarkAsRef (pCtx);
-        if (iRet != ERR_NONE) {
-          if (iRet == ERR_INFO_DUPLICATE_FRAME_NUM)
-            pCtx->iErrorCode |= dsBitstreamError;
-          if (pCtx->pParam->eEcActiveIdc == ERROR_CON_DISABLE) {
-            pCtx->pDec = NULL;
-            return iRet;
+          iRet = WelsMarkAsRef (pCtx);
+          if (iRet != ERR_NONE) {
+            if (iRet == ERR_INFO_DUPLICATE_FRAME_NUM)
+              pCtx->iErrorCode |= dsBitstreamError;
+            if (pCtx->pParam->eEcActiveIdc == ERROR_CON_DISABLE) {
+              pCtx->pDec = NULL;
+              return iRet;
+            }
           }
+          if (!pCtx->pParam->bParseOnly)
+            ExpandReferencingPicture (pCtx->pDec->pData, pCtx->pDec->iWidthInPixel, pCtx->pDec->iHeightInPixel,
+                                      pCtx->pDec->iLinesize,
+                                      pCtx->sExpandPicFunc.pfExpandLumaPicture, pCtx->sExpandPicFunc.pfExpandChromaPicture);
         }
-        if (!pCtx->pParam->bParseOnly)
-          ExpandReferencingPicture (pCtx->pDec->pData, pCtx->pDec->iWidthInPixel, pCtx->pDec->iHeightInPixel,
-                                    pCtx->pDec->iLinesize,
-                                    pCtx->sExpandPicFunc.pfExpandLumaPicture, pCtx->sExpandPicFunc.pfExpandChromaPicture);
+      } else {
+        SET_EVENT (&pThreadCtx->sImageReady);
       }
       pCtx->pDec = NULL; //after frame decoding, always set to NULL
     }
@@ -2758,8 +2878,27 @@
       pCtx->pLastDecPicInfo->iPrevFrameNum = pSh->iFrameNum;
     if (pCtx->pLastDecPicInfo->bLastHasMmco5)
       pCtx->pLastDecPicInfo->iPrevFrameNum = 0;
+    if (pThreadCtx != NULL) {
+      int32_t threadCount = pThreadCtx->sThreadInfo.uiThrMaxNum;
+      int32_t  id = pThreadCtx->sThreadInfo.uiThrNum;
+      for (int32_t i = 0; i < threadCount; ++i) {
+        if (pThreadCtx[i - id].pCtx != NULL) {
+          unsigned long long uiTimeStamp = pThreadCtx[i - id].pCtx->uiTimeStamp;
+          if (uiTimeStamp > 0 && pThreadCtx[i - id].pCtx->sSpsPpsCtx.iSeqId > pCtx->sSpsPpsCtx.iSeqId) {
+            CopySpsPps (pThreadCtx[i - id].pCtx, pCtx);
+            if (pCtx->pPicBuff != pThreadCtx[i - id].pCtx->pPicBuff) {
+              pCtx->pPicBuff = pThreadCtx[i - id].pCtx->pPicBuff;
+            }
+            InitialDqLayersContext (pCtx, pCtx->pSps->iMbWidth << 4, pCtx->pSps->iMbHeight << 4);
+            break;
+          }
+        }
+      }
+    }
   }
-
+  if (pThreadCtx != NULL) {
+    SET_EVENT (&pThreadCtx->sSliceDecodeFinsh);
+  }
   return ERR_NONE;
 }
 
@@ -2875,6 +3014,7 @@
     if (iRealMbIdx == -1) //caused by abnormal return of FmoNextMb()
       return false;
   }
+
   return bAllRefComplete;
 }
 } // namespace WelsDec
--- a/codec/decoder/core/src/manage_dec_ref.cpp
+++ b/codec/decoder/core/src/manage_dec_ref.cpp
@@ -150,7 +150,7 @@
           && pCtx->eSliceType != SI_SLICE)) {
     if (pCtx->pParam->eEcActiveIdc !=
         ERROR_CON_DISABLE) { //IDR lost!, recover it for future decoding with data all set to 0
-      PPicture pRef = pCtx->pThreadCtx != NULL ? PrefetchPicForThread (pCtx->pPicBuff) : PrefetchPic (pCtx->pPicBuff);
+      PPicture pRef = PrefetchPic (pCtx->pPicBuff);
       if (pRef != NULL) {
         // IDR lost, set new
         pRef->bIsComplete = false; // Set complete flag to false for lost IDR ref picture
--- a/codec/decoder/core/src/pic_queue.cpp
+++ b/codec/decoder/core/src/pic_queue.cpp
@@ -217,4 +217,16 @@
   return pPic;
 }
 
+PPicture PrefetchLastPicForThread (PPicBuff pPicBuf, const int32_t& iLastPicBuffIdx) { // fetch the picture stored at a given buffer index
+  PPicture pPic = NULL; // stays NULL unless the index check below passes
+  // NOTE: pPicBuf is dereferenced below without a NULL check
+  if (pPicBuf->iCapacity == 0) { // zero-capacity buffer: nothing to return
+    return NULL;
+  }
+  if (iLastPicBuffIdx >= 0 && iLastPicBuffIdx < pPicBuf->iCapacity) { // accept only indices in [0, iCapacity)
+    pPic = pPicBuf->ppPic[iLastPicBuffIdx];
+  }
+  return pPic; // NULL when the index was out of range
+}
+
 } // namespace WelsDec
--- a/codec/decoder/core/src/wels_decoder_thread.cpp
+++ b/codec/decoder/core/src/wels_decoder_thread.cpp
@@ -57,6 +57,12 @@
 #define HW_NCPU_NAME "hw.ncpu"
 #endif
 #endif
+#ifdef ANDROID_NDK
+#include <cpu-features.h>
+#endif
+#ifdef __ANDROID__
+#include <android/api-level.h>
+#endif
 
 #include "wels_decoder_thread.h"
 #include <stdio.h>