shithub: openh264

ref: 234f70988fff844bf64e1242482cf9a7431b8060
dir: /codec/common/src/deblocking_common.cpp/

View raw version
#include "deblocking_common.h"
#include "macros.h"

//  C code only
void DeblockLumaLt4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta,
                       int8_t* pTc) {
  for (int32_t i = 0; i < 16; i++) {
    int32_t iTc0 = pTc[i >> 2];
    if (iTc0 >= 0) {
      int32_t p0 = pPix[-iStrideX];
      int32_t p1 = pPix[-2 * iStrideX];
      int32_t p2 = pPix[-3 * iStrideX];
      int32_t q0 = pPix[0];
      int32_t q1 = pPix[iStrideX];
      int32_t q2 = pPix[2 * iStrideX];
      bool bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
      bool bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
      bool bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
      int32_t iTc = iTc0;
      if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
        bool bDetaP2P0 =  WELS_ABS (p2 - p0) < iBeta;
        bool bDetaQ2Q0 =  WELS_ABS (q2 - q0) < iBeta;
        if (bDetaP2P0) {
          pPix[-2 * iStrideX] = p1 + WELS_CLIP3 ((p2 + ((p0 + q0 + 1) >> 1) - (p1 * (1 << 1))) >> 1, -iTc0, iTc0);
          iTc++;
        }
        if (bDetaQ2Q0) {
          pPix[iStrideX] = q1 + WELS_CLIP3 ((q2 + ((p0 + q0 + 1) >> 1) - (q1 * (1 << 1))) >> 1, -iTc0, iTc0);
          iTc++;
        }
        int32_t iDeta = WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc, iTc);
        pPix[-iStrideX] = WelsClip1 (p0 + iDeta);     /* p0' */
        pPix[0]  = WelsClip1 (q0 - iDeta);     /* q0' */
      }
    }
    pPix += iStrideY;
  }
}
void DeblockLumaEq4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta) {
  int32_t p0, p1, p2, q0, q1, q2;
  int32_t iDetaP0Q0;
  bool bDetaP1P0, bDetaQ1Q0;
  for (int32_t i = 0; i < 16; i++) {
    p0 = pPix[-iStrideX];
    p1 = pPix[-2 * iStrideX];
    p2 = pPix[-3 * iStrideX];
    q0 = pPix[0];
    q1 = pPix[iStrideX];
    q2 = pPix[2 * iStrideX];
    iDetaP0Q0 = WELS_ABS (p0 - q0);
    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
    if ((iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0) {
      if (iDetaP0Q0 < ((iAlpha >> 2) + 2)) {
        bool bDetaP2P0 = WELS_ABS (p2 - p0) < iBeta;
        bool bDetaQ2Q0 =  WELS_ABS (q2 - q0) < iBeta;
        if (bDetaP2P0) {
          const int32_t p3 = pPix[-4 * iStrideX];
          pPix[-iStrideX] = (p2 + (p1 * (1 << 1)) + (p0 * (1 << 1)) + (q0 * (1 << 1)) + q1 + 4) >> 3;   //p0
          pPix[-2 * iStrideX] = (p2 + p1 + p0 + q0 + 2) >> 2;                         //p1
          pPix[-3 * iStrideX] = ((p3 * (1 << 1)) + p2 + (p2 * (1 << 1)) + p1 + p0 + q0 + 4) >> 3; //p2
        } else {
          pPix[-1 * iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2;                       //p0
        }
        if (bDetaQ2Q0) {
          const int32_t q3 = pPix[3 * iStrideX];
          pPix[0] = (p1 + (p0 * (1 << 1)) + (q0 * (1 << 1)) + (q1 * (1 << 1)) + q2 + 4) >> 3;           //q0
          pPix[iStrideX] = (p0 + q0 + q1 + q2 + 2) >> 2;                              //q1
          pPix[2 * iStrideX] = ((q3 * (1 << 1)) + q2 + (q2 * (1 << 1)) + q1 + q0 + p0 + 4) >> 3;  //q2
        } else {
          pPix[0] = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2;                                   //q0
        }
      } else {
        pPix[-iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2;   //p0
        pPix[ 0] = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2;          //q0
      }
    }
    pPix += iStrideY;
  }
}
void DeblockLumaLt4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
  DeblockLumaLt4_c (pPix, iStride, 1, iAlpha, iBeta, tc);
}
void DeblockLumaLt4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
  DeblockLumaLt4_c (pPix, 1, iStride, iAlpha, iBeta, tc);
}
void DeblockLumaEq4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
  DeblockLumaEq4_c (pPix, iStride, 1, iAlpha, iBeta);
}
void DeblockLumaEq4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
  DeblockLumaEq4_c (pPix, 1, iStride, iAlpha, iBeta);
}
void DeblockChromaLt4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
                         int32_t iBeta, int8_t* pTc) {
  int32_t p0, p1, q0, q1, iDeta;
  bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;

  for (int32_t i = 0; i < 8; i++) {
    int32_t iTc0 = pTc[i >> 1];
    if (iTc0 > 0) {
      p0 = pPixCb[-iStrideX];
      p1 = pPixCb[-2 * iStrideX];
      q0 = pPixCb[0];
      q1 = pPixCb[iStrideX];

      bDetaP0Q0 =  WELS_ABS (p0 - q0) < iAlpha;
      bDetaP1P0 =  WELS_ABS (p1 - p0) < iBeta;
      bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
      if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
        iDeta = WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
        pPixCb[-iStrideX] = WelsClip1 (p0 + iDeta);     /* p0' */
        pPixCb[0]  = WelsClip1 (q0 - iDeta);     /* q0' */
      }


      p0 = pPixCr[-iStrideX];
      p1 = pPixCr[-2 * iStrideX];
      q0 = pPixCr[0];
      q1 = pPixCr[iStrideX];

      bDetaP0Q0 =  WELS_ABS (p0 - q0) < iAlpha;
      bDetaP1P0 =  WELS_ABS (p1 - p0) < iBeta;
      bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;

      if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
        iDeta = WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
        pPixCr[-iStrideX] = WelsClip1 (p0 + iDeta);     /* p0' */
        pPixCr[0]  = WelsClip1 (q0 - iDeta);     /* q0' */
      }
    }
    pPixCb += iStrideY;
    pPixCr += iStrideY;
  }
}
void DeblockChromaEq4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
                         int32_t iBeta) {
  int32_t p0, p1, q0, q1;
  bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
  for (int32_t i = 0; i < 8; i++) {
    //cb
    p0 = pPixCb[-iStrideX];
    p1 = pPixCb[-2 * iStrideX];
    q0 = pPixCb[0];
    q1 = pPixCb[iStrideX];
    bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
    if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
      pPixCb[-iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2;     /* p0' */
      pPixCb[0]  = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2;     /* q0' */
    }

    //cr
    p0 = pPixCr[-iStrideX];
    p1 = pPixCr[-2 * iStrideX];
    q0 = pPixCr[0];
    q1 = pPixCr[iStrideX];
    bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
    if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
      pPixCr[-iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2;     /* p0' */
      pPixCr[0]  = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2;     /* q0' */
    }
    pPixCr += iStrideY;
    pPixCb += iStrideY;
  }
}
void DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
                          int8_t* tc) {
  DeblockChromaLt4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta, tc);
}
void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
                          int8_t* tc) {
  DeblockChromaLt4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta, tc);
}
void DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
  DeblockChromaEq4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta);
}
void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
  DeblockChromaEq4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta);
}

void DeblockChromaLt42_c (uint8_t* pPixCbCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
                          int32_t iBeta, int8_t* pTc) {
  int32_t p0, p1, q0, q1, iDeta;
  bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;

  for (int32_t i = 0; i < 8; i++) {
    int32_t iTc0 = pTc[i >> 1];
    if (iTc0 > 0) {
      p0 = pPixCbCr[-iStrideX];
      p1 = pPixCbCr[-2 * iStrideX];
      q0 = pPixCbCr[0];
      q1 = pPixCbCr[iStrideX];

      bDetaP0Q0 =  WELS_ABS (p0 - q0) < iAlpha;
      bDetaP1P0 =  WELS_ABS (p1 - p0) < iBeta;
      bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
      if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
        iDeta = WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
        pPixCbCr[-iStrideX] = WelsClip1 (p0 + iDeta);     /* p0' */
        pPixCbCr[0]  = WelsClip1 (q0 - iDeta);     /* q0' */
      }


    }
    pPixCbCr += iStrideY;
  }
}
void DeblockChromaEq42_c (uint8_t* pPixCbCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
                          int32_t iBeta) {
  int32_t p0, p1, q0, q1;
  bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
  for (int32_t i = 0; i < 8; i++) {
    p0 = pPixCbCr[-iStrideX];
    p1 = pPixCbCr[-2 * iStrideX];
    q0 = pPixCbCr[0];
    q1 = pPixCbCr[iStrideX];
    bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
    if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
      pPixCbCr[-iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2;     /* p0' */
      pPixCbCr[0]  = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2;     /* q0' */
    }

    pPixCbCr += iStrideY;
  }
}

void DeblockChromaLt4V2_c (uint8_t* pPixCbCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
                           int8_t* tc) {
  DeblockChromaLt42_c (pPixCbCr, iStride, 1, iAlpha, iBeta, tc);
}
void DeblockChromaLt4H2_c (uint8_t* pPixCbCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
                           int8_t* tc) {

  DeblockChromaLt42_c (pPixCbCr, 1, iStride, iAlpha, iBeta, tc);
}
void DeblockChromaEq4V2_c (uint8_t* pPixCbCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
  DeblockChromaEq42_c (pPixCbCr, iStride, 1, iAlpha, iBeta);
}
void DeblockChromaEq4H2_c (uint8_t* pPixCbCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
  DeblockChromaEq42_c (pPixCbCr, 1, iStride, iAlpha, iBeta);
}

void WelsNonZeroCount_c (int8_t* pNonZeroCount) {
  int32_t i;
  for (i = 0; i < 24; i++) {
    pNonZeroCount[i] = !!pNonZeroCount[i];
  }
}

#ifdef X86_ASM
extern "C" {
  void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
    ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);

    DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
    DeblockLumaLt4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
    DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
  }

  void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
    ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);

    DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
    DeblockLumaEq4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta);
    DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
  }

}

#endif

#ifdef HAVE_MMI
extern "C" {
  void DeblockLumaLt4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
    ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);

    DeblockLumaTransposeH2V_mmi (pPixY - 4, iStride, &uiBuf[0]);
    DeblockLumaLt4V_mmi (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
    DeblockLumaTransposeV2H_mmi (pPixY - 4, iStride, &uiBuf[0]);
  }

  void DeblockLumaEq4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
    ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);

    DeblockLumaTransposeH2V_mmi (pPixY - 4, iStride, &uiBuf[0]);
    DeblockLumaEq4V_mmi (&uiBuf[4 * 16], 16, iAlpha, iBeta);
    DeblockLumaTransposeV2H_mmi (pPixY - 4, iStride, &uiBuf[0]);
  }
}
#endif//HAVE_MMI