shithub: openh264

Download patch

ref: 1c532d6f748e6e22fe7860d19ce6fcd73693285c
parent: cd9982ec4472664290d67a15fbc9cf53a97e1007
parent: c06f9ec41e26c9e86314c3207d0dcbd3d5594ceb
author: HaiboZhu <haibozhu@cisco.com>
date: Mon Aug 8 13:07:00 EDT 2016

Merge pull request #2538 from saamas/common-x86-mc-optimizations

[Common/x86] Motion compensation optimizations

--- a/codec/common/inc/mc.h
+++ b/codec/common/inc/mc.h
@@ -252,8 +252,6 @@
                              int32_t iHeight);
 void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                            const uint8_t* kpABCD, int32_t iHeight);
-void McCopyWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                         int32_t iHeight);
 void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                          int32_t iHeight);
 void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
@@ -307,11 +305,54 @@
         int32_t iWidth, int32_t iHeight);
 
 //***************************************************************************//
-//                       SSSE3 definition                                    //
+//                       SSE3 definition                                     //
 //***************************************************************************//
+void McCopyWidthEq16_sse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                           int32_t iHeight);
 
+//***************************************************************************//
+//                       SSSE3 definition                                    //
+//***************************************************************************//
 void McChromaWidthEq8_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                              const uint8_t* kpABCD, int32_t iHeight);
+void McHorVer02_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight);
+void McHorVer02Width4S16ToU8_ssse3 (const int16_t* pSrc, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02Width5S16ToU8_ssse3 (const int16_t* pSrc, int32_t iSrcStride,
+                                    uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02WidthGe8S16ToU8_ssse3 (const int16_t* pSrc, int32_t iSrcStride,
+                                      uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
+void McHorVer20_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight);
+void McHorVer20Width4U8ToS16_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, int16_t* pDst, int32_t iHeight);
+void McHorVer20Width5Or9Or17_ssse3 (const uint8_t* pSrc, int32_t iSrcStride,
+                                    uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
+void McHorVer20Width8U8ToS16_ssse3 (const uint8_t* pSrc, int32_t iSrcStride,
+                                    int16_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer20Width9Or17U8ToS16_ssse3 (const uint8_t* pSrc, int32_t iSrcStride,
+                                        int16_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
+
+//***************************************************************************//
+//                       AVX2 definition                                     //
+//***************************************************************************//
+#ifdef HAVE_AVX2
+void McHorVer02_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight);
+void McHorVer02Width4S16ToU8_avx2 (const int16_t* pSrc, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02Width5S16ToU8_avx2 (const int16_t* pSrc, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02Width8S16ToU8_avx2 (const int16_t* pSrc, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02Width9S16ToU8_avx2 (const int16_t* pSrc, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02Width16Or17S16ToU8_avx2 (const int16_t* pSrc, int32_t iSrcStride,
+                                        uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
+void McHorVer20_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight);
+void McHorVer20Width5Or9Or17_avx2 (const uint8_t* pSrc, int32_t iSrcStride,
+                                   uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
+void McHorVer20Width4U8ToS16_avx2 (const uint8_t* pSrc, int32_t iSrcStride, int16_t* pDst, int32_t iHeight);
+void McHorVer20Width8U8ToS16_avx2 (const uint8_t* pSrc, int32_t iSrcStride, int16_t* pDst, int32_t iHeight);
+void McHorVer20Width16U8ToS16_avx2 (const uint8_t* pSrc, int32_t iSrcStride, int16_t* pDst, int32_t iHeight);
+void McHorVer20Width17U8ToS16_avx2 (const uint8_t* pSrc, int32_t iSrcStride, int16_t* pDst, int32_t iHeight);
+#endif //HAVE_AVX2
 
 #endif //X86_ASM
 
--- a/codec/common/src/mc.cpp
+++ b/codec/common/src/mc.cpp
@@ -44,6 +44,8 @@
 #include "ls_defines.h"
 #include "macros.h"
 
+namespace {
+
 typedef void (*PMcChromaWidthExtFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                                        const uint8_t* kpABCD, int32_t iHeight);
 typedef void (*PWelsSampleWidthAveragingFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*,
@@ -51,8 +53,6 @@
 typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                                         int32_t iWidth, int32_t iHeight);
 
-namespace WelsCommon {
-
 /*------------------weight for chroma fraction pixel interpolation------------------*/
 //iA = (8 - dx) * (8 - dy);
 //iB = dx * (8 - dy);
@@ -442,7 +442,7 @@
   else if (iWidth == 8)
     McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else if (iWidth == 4)
-    McCopyWidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else
     McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
@@ -710,6 +710,183 @@
     McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
 }
 
+//***************************************************************************//
+//                          SSSE3 implementation                             //
+//***************************************************************************//
+
+void PixelAvgWidth4Or8Or16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                                 const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { // forwards to one of three PixelAvgWidthEq* routines chosen by iWidth
+  if (iWidth < 8) { // every width below 8 takes the Eq4 MMX routine
+    PixelAvgWidthEq4_mmx   (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_mmx   (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
+  } else { // every width above 8 takes the Eq16 SSE2 routine; iWidth itself is never forwarded
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
+  }
+}
+
+void McCopy_sse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                  int32_t iWidth, int32_t iHeight) { // selects a McCopyWidthEq* routine by iWidth
+  switch (iWidth) {
+  case 16: return McCopyWidthEq16_sse3 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  case 8:  return McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  case 4:  return McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  } // no default label: unmatched widths fall out of the switch
+  return McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); // reached for every width other than 16, 8 and 4
+}
+
+void McHorVer22_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight) { // two passes: a McHorVer20*U8ToS16 call fills pTmp, a McHorVer02*S16ToU8 call writes pDst
+  ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 16 + 5, 8, 16); // int16_t scratch; macro is defined elsewhere, presumably a 16-byte-aligned [16 + 5][8] array - confirm
+  if (iWidth < 8) {
+    McHorVer20Width4U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5); // first pass is called with height iHeight + 5
+    McHorVer02Width4S16ToU8_ssse3 (&pTmp[0][0], pDst, iDstStride, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5); // sizeof *pTmp is the byte size of one pTmp row
+    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
+  } else { // wider than 8: handled as two 8-column halves that reuse the same pTmp
+    McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
+    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, 8, iHeight);
+    McHorVer20Width8U8ToS16_ssse3 (pSrc + 8, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5); // second half: source advanced by 8
+    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst + 8, iDstStride, 8, iHeight); // second half: destination advanced by 8
+  }
+}
+
+void McHorVer01_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight) { // PixelAvg of pSrc with the McHorVer02 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // uint8_t scratch that receives the McHorVer02 output
+  McHorVer02_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride, // first input: the source itself
+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // second input: pTmp
+}
+
+void McHorVer03_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight) { // PixelAvg of (pSrc + iSrcStride) with the McHorVer02 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // uint8_t scratch that receives the McHorVer02 output
+  McHorVer02_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, // first input: source offset by iSrcStride
+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // second input: pTmp
+}
+
+void McHorVer10_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight) { // PixelAvg of pSrc with the McHorVer20 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // uint8_t scratch that receives the McHorVer20 output
+  McHorVer20_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride, // first input: the source itself
+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // second input: pTmp
+}
+
+void McHorVer11_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer20 result and a McHorVer02 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02 output
+  McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
+  McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // both passes read the unshifted pSrc
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+}
+
+void McHorVer12_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer02 result and a McHorVer22 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // receives the McHorVer22 output
+  McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+  McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight); // both passes read the unshifted pSrc
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
+}
+
+void McHorVer13_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer20 result and a McHorVer02 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02 output
+  McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // McHorVer20 reads the source offset by iSrcStride
+  McHorVer02_ssse3 (pSrc,              iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // McHorVer02 reads the unshifted source
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+}
+
+void McHorVer21_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer20 result and a McHorVer22 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // receives the McHorVer22 output
+  McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
+  McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight); // both passes read the unshifted pSrc
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
+}
+
+void McHorVer23_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer20 result and a McHorVer22 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // receives the McHorVer22 output
+  McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // McHorVer20 reads the source offset by iSrcStride
+  McHorVer22_ssse3 (pSrc,              iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight); // McHorVer22 reads the unshifted source
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
+}
+
+void McHorVer30_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight) { // PixelAvg of (pSrc + 1) with the McHorVer20 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // uint8_t scratch that receives the McHorVer20 output
+  McHorVer20_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // first input is the source advanced by one byte
+}
+
+void McHorVer31_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer20 result and a McHorVer02 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02 output
+  McHorVer20_ssse3 (pSrc,     iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // McHorVer20 reads the unshifted source
+  McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // McHorVer02 reads the source advanced by one byte
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+}
+
+void McHorVer32_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer02 result and a McHorVer22 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // receives the McHorVer22 output
+  McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // McHorVer02 reads the source advanced by one byte
+  McHorVer22_ssse3 (pSrc,     iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight); // McHorVer22 reads the unshifted source
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
+}
+
+void McHorVer33_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer20 result and a McHorVer02 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02 output
+  McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // McHorVer20 reads the source offset by iSrcStride
+  McHorVer02_ssse3 (pSrc + 1,          iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // McHorVer02 reads the source advanced by one byte
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+}
+
+void McHorVer22Width5Or9Or17_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) { // two passes through an int16_t scratch, as in McHorVer22_ssse3, for the odd widths named in the identifier
+  ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, WELS_ALIGN(17, 16 / sizeof (int16_t)), 16) // no trailing ';' here, so the macro presumably supplies its own - confirm
+  if (iWidth > 5) { // every width above 5 takes the 9-or-17 first pass and the Ge8 second pass
+    McHorVer20Width9Or17U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight + 5); // first pass is called with height iHeight + 5
+    McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
+  } else { // width 5 or less: the Width8 first pass feeds the Width5 second pass
+    McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
+    McHorVer02Width5S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iHeight);
+  }
+}
+
+void McLuma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { // dispatches through a 4x4 function table keyed on the low two bits of iMvX and iMvY
+  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { // entry [x][y] is McHorVer<x><y>_ssse3, except [0][0] which is McCopy_sse3
+    {McCopy_sse3,      McHorVer01_ssse3, McHorVer02_ssse3, McHorVer03_ssse3},
+    {McHorVer10_ssse3, McHorVer11_ssse3, McHorVer12_ssse3, McHorVer13_ssse3},
+    {McHorVer20_ssse3, McHorVer21_ssse3, McHorVer22_ssse3, McHorVer23_ssse3},
+    {McHorVer30_ssse3, McHorVer31_ssse3, McHorVer32_ssse3, McHorVer33_ssse3},
+  };
+
+  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); // row index from iMvX, column index from iMvY; only the masked bits of the vector are used here
+}
+
 void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                      int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
   static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
@@ -728,6 +905,169 @@
     McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
 }
 
+//***************************************************************************//
+//                          AVX2 implementation                              //
+//***************************************************************************//
+
+#ifdef HAVE_AVX2
+
+void McHorVer22_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) { // two passes: a McHorVer20*U8ToS16 call fills pTmp, a McHorVer02*S16ToU8 call writes pDst
+  ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 16 + 5, 16, 32); // int16_t scratch; macro is defined elsewhere, presumably a 32-byte-aligned [16 + 5][16] array - confirm
+  if (iWidth < 8) {
+    McHorVer20Width4U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5); // first pass is called with height iHeight + 5 and takes no scratch stride
+    McHorVer02Width4S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20Width8U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
+    McHorVer02Width8S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
+  } else { // wider than 8: one Width16 first pass, no split into halves
+    McHorVer20Width16U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
+    McHorVer02Width16Or17S16ToU8_avx2 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight); // only this second pass receives the scratch row size and iWidth
+  }
+}
+
+void McHorVer01_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) { // PixelAvg of pSrc with the McHorVer02 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // uint8_t scratch that receives the McHorVer02 output
+  McHorVer02_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride, // first input: the source itself; the averaging step is the SSE2 helper
+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // second input: pTmp
+}
+
+void McHorVer03_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) { // PixelAvg of (pSrc + iSrcStride) with the McHorVer02 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // uint8_t scratch that receives the McHorVer02 output
+  McHorVer02_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, // first input: source offset by iSrcStride
+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // second input: pTmp
+}
+
+void McHorVer10_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) { // PixelAvg of pSrc with the McHorVer20 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // uint8_t scratch that receives the McHorVer20 output
+  McHorVer20_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride, // first input: the source itself
+                              &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // second input: pTmp
+}
+
+void McHorVer11_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer20 result and a McHorVer02 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02 output
+  McHorVer20_avx2 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
+  McHorVer02_avx2 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // both passes read the unshifted pSrc
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+}
+
+void McHorVer12_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer02 result and a McHorVer22 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // receives the McHorVer22 output
+  McHorVer02_avx2 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+  McHorVer22_avx2 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight); // both passes read the unshifted pSrc
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
+}
+
+void McHorVer13_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer20 result and a McHorVer02 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02 output
+  McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // McHorVer20 reads the source offset by iSrcStride
+  McHorVer02_avx2 (pSrc,              iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // McHorVer02 reads the unshifted source
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+}
+
+void McHorVer21_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer20 result and a McHorVer22 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // receives the McHorVer22 output
+  McHorVer20_avx2 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
+  McHorVer22_avx2 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight); // both passes read the unshifted pSrc
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
+}
+
+void McHorVer23_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer20 result and a McHorVer22 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // receives the McHorVer22 output
+  McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // McHorVer20 reads the source offset by iSrcStride
+  McHorVer22_avx2 (pSrc,              iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight); // McHorVer22 reads the unshifted source
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
+}
+
+void McHorVer30_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) { // PixelAvg of (pSrc + 1) with the McHorVer20 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16); // uint8_t scratch that receives the McHorVer20 output
+  McHorVer20_avx2 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // first input is the source advanced by one byte
+}
+
+void McHorVer31_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer20 result and a McHorVer02 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02 output
+  McHorVer20_avx2 (pSrc,     iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // McHorVer20 reads the unshifted source
+  McHorVer02_avx2 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // McHorVer02 reads the source advanced by one byte
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+}
+
+void McHorVer32_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer02 result and a McHorVer22 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16); // receives the McHorVer22 output
+  McHorVer02_avx2 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // McHorVer02 reads the source advanced by one byte
+  McHorVer22_avx2 (pSrc,     iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight); // McHorVer22 reads the unshifted source
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
+                              &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
+}
+
+void McHorVer33_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) { // PixelAvg of a McHorVer20 result and a McHorVer02 result
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16); // receives the McHorVer20 output
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16); // receives the McHorVer02 output
+  McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // McHorVer20 reads the source offset by iSrcStride
+  McHorVer02_avx2 (pSrc + 1,          iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // McHorVer02 reads the source advanced by one byte
+  PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+                              &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+}
+
+void McHorVer22Width5Or9Or17_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                   int32_t iWidth, int32_t iHeight) { // two passes through an int16_t scratch; each branch declares its own differently sized pTmp
+  if (iWidth < 9) { // every width below 9: Width8 first pass, Width5 second pass
+    ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 9 + 5, WELS_ALIGN(5, 16 / sizeof (int16_t)), 16) // no trailing ';' here, so the macro presumably supplies its own - confirm
+    McHorVer20Width8U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5); // first pass is called with height iHeight + 5
+    McHorVer02Width5S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
+  } else if (iWidth == 9) { // width 9: Width16 first pass, Width9 second pass
+    ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, 16, 32)
+    McHorVer20Width16U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
+    McHorVer02Width9S16ToU8_avx2 (&pTmp[0][0], pDst, iDstStride, iHeight);
+  } else { // every width above 9: Width17 first pass, Width16Or17 second pass
+    ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, WELS_ALIGN(17, 32 / sizeof (int16_t)), 32)
+    McHorVer20Width17U8ToS16_avx2 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5);
+    McHorVer02Width16Or17S16ToU8_avx2 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight); // only this second pass receives the scratch row size and iWidth
+  }
+}
+
+void McLuma_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { // dispatches through a 4x4 function table keyed on the low two bits of iMvX and iMvY
+  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { // entry [x][y] is McHorVer<x><y>_avx2, except [0][0] which is McCopy_sse3
+    {McCopy_sse3,     McHorVer01_avx2, McHorVer02_avx2, McHorVer03_avx2},
+    {McHorVer10_avx2, McHorVer11_avx2, McHorVer12_avx2, McHorVer13_avx2},
+    {McHorVer20_avx2, McHorVer21_avx2, McHorVer22_avx2, McHorVer23_avx2},
+    {McHorVer30_avx2, McHorVer31_avx2, McHorVer32_avx2, McHorVer33_avx2},
+  };
+
+  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); // row index from iMvX, column index from iMvY; only the masked bits of the vector are used here
+}
+
+#endif //HAVE_AVX2
+
 void PixelAvg_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
                     const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
   static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
@@ -1319,7 +1659,9 @@
 }
 #endif
 
-void InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {
+} // anon ns.
+
+void WelsCommon::InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {
   pMcFuncs->pfLumaHalfpelHor  = McHorVer20_c;
   pMcFuncs->pfLumaHalfpelVer  = McHorVer02_c;
   pMcFuncs->pfLumaHalfpelCen  = McHorVer22_c;
@@ -1338,8 +1680,20 @@
   }
 
   if (uiCpuFlag & WELS_CPU_SSSE3) {
+    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_ssse3;
+    pMcFuncs->pfLumaHalfpelVer  = McHorVer02_ssse3;
+    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17_ssse3;
     pMcFuncs->pMcChromaFunc = McChroma_ssse3;
+    pMcFuncs->pMcLumaFunc   = McLuma_ssse3;
   }
+#ifdef HAVE_AVX2
+  if (uiCpuFlag & WELS_CPU_AVX2) {
+    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_avx2;
+    pMcFuncs->pfLumaHalfpelVer  = McHorVer02_avx2;
+    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17_avx2;
+    pMcFuncs->pMcLumaFunc       = McLuma_avx2;
+  }
+#endif
 #endif //(X86_ASM)
 
 #if defined(HAVE_NEON)
@@ -1363,4 +1717,3 @@
   }
 #endif
 }
-} // namespace WelsCommon
--- a/codec/common/x86/mb_copy.asm
+++ b/codec/common/x86/mb_copy.asm
@@ -44,6 +44,10 @@
 ;*********************************************************************************************/
 %include "asm_inc.asm"
 
+%ifdef __NASM_VER__
+    %use smartalign
+%endif
+
 ;***********************************************************************
 ; Macros and other preprocessor constants
 ;***********************************************************************
@@ -502,39 +506,37 @@
     LOAD_7_PARA_POP
     ret
 
-;*******************************************************************************
-;  void McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
-;                          uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-WELS_EXTERN McCopyWidthEq4_mmx
-    push    r5
-    %assign  push_num 1
-    LOAD_5_PARA
+; Copies four strided rows per loop iteration. load_instr=%1 store_instr=%2 p_dst=%3 i_dststride=%4 p_src=%5 i_srcstride=%6 cnt=%7 r_tmp=%8,%9 mm_tmp=%10,%11
+%macro CopyStrided4N 11
+    lea             %8, [3 * %6] ; %8 = 3 * i_srcstride, used to address source row 3
+    lea             %9, [3 * %4] ; %9 = 3 * i_dststride, used to address destination row 3
+ALIGN 32
+%%loop:
+    %1              %10, [%5] ; load source row 0
+    %1              %11, [%5 + %6] ; load source row 1
+    %2              [%3], %10 ; store destination row 0
+    %2              [%3 + %4], %11 ; store destination row 1
+    %1              %10, [%5 + 2 * %6] ; load source row 2
+    %1              %11, [%5 + %8] ; load source row 3
+    %2              [%3 + 2 * %4], %10 ; store destination row 2
+    %2              [%3 + %9], %11 ; store destination row 3
+    lea             %5, [%5 + 4 * %6] ; advance p_src by four rows
+    lea             %3, [%3 + 4 * %4] ; advance p_dst by four rows
+    sub             %7, 4 ; four rows consumed from cnt
+    jg              %%loop ; loop while cnt is still positive; the body always runs once and always moves four rows
+%endmacro
 
-    SIGN_EXTENSION  r1, r1d
-    SIGN_EXTENSION  r3, r3d
-    SIGN_EXTENSION  r4, r4d
-
-ALIGN 4
-.height_loop:
-    mov r5d, [r0]
-    mov [r2], r5d
-
-    add r0, r1
-    add r2, r3
-    dec r4
-    jnz .height_loop
-    WELSEMMS
-    LOAD_5_PARA_POP
-    pop    r5
-    ret
-
 ;*******************************************************************************
 ;   void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
-;                           uint8_t *pDst, int iDstStride, int iHeight )
+;                            uint8_t *pDst, int iDstStride, int iHeight )
 ;*******************************************************************************
 WELS_EXTERN McCopyWidthEq8_mmx
     %assign  push_num 0
+%ifdef X86_32
+    push            r5
+    push            r6
+    %assign  push_num 2
+%endif
     LOAD_5_PARA
 
     SIGN_EXTENSION  r1, r1d
@@ -541,17 +543,14 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
 
-ALIGN 4
-.height_loop:
-    movq mm0, [r0]
-    movq [r2], mm0
-    add r0, r1
-    add r2, r3
-    dec r4
-    jnz .height_loop
+    CopyStrided4N   movq, movq, r2, r3, r0, r1, r4, r5, r6, mm0, mm1
 
     WELSEMMS
     LOAD_5_PARA_POP
+%ifdef X86_32
+    pop             r6
+    pop             r5
+%endif
     ret
 
 
@@ -588,4 +587,29 @@
     jnz     .height_loop
 
     LOAD_5_PARA_POP
+    ret
+
+
+;*******************************************************************************
+;   void McCopyWidthEq16_sse3( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+WELS_EXTERN McCopyWidthEq16_sse3
+    %assign push_num 0
+%ifdef X86_32
+    push            r5 ; r5 and r6 are handed to CopyStrided4N below as its two r_tmp registers
+    push            r6 ; saved only in the X86_32 build
+    %assign push_num 2
+%endif
+    LOAD_5_PARA
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+
+    CopyStrided4N   lddqu, MOVDQ, r2, r3, r0, r1, r4, r5, r6, xmm0, xmm1
+
+    LOAD_5_PARA_POP
+%ifdef X86_32
+    pop             r6 ; restored in reverse order of the pushes above
+    pop             r5
+%endif
+    ret
--- a/codec/common/x86/mc_luma.asm
+++ b/codec/common/x86/mc_luma.asm
@@ -44,16 +44,61 @@
 ;*******************************************************************************
 ; Local Data (Read Only)
 ;*******************************************************************************
-SECTION .rodata align=16
+SECTION .rodata align=32
 
 ;*******************************************************************************
 ; Various memory constants (trigonometric values or rounding values)
 ;*******************************************************************************
 
+%ifdef HAVE_AVX2
+ALIGN 32
+dwm32768_256:                           ; 16 words, each -32768
+    times 16 dw -32768
+maddubsw_m2p10_m40m40_p10m2_p0p0_256:   ; byte pattern -2,10,-40,-40,10,-2,0,0 repeated over 32 bytes
+    times 4 db -2, 10, -40, -40, 10, -2, 0, 0
+dwm1024_256:                            ; 16 words, each -1024
+    times 16 dw -1024
+dd32768_256:                            ; 8 dwords, each 32768
+    times 8 dd 32768
+maddubsw_p1m5_256:                      ; byte pairs (1, -5) over 32 bytes
+    times 16 db 1, -5
+maddubsw_m5p1_256:                      ; byte pairs (-5, 1) over 32 bytes
+    times 16 db -5, 1
+db20_256:                               ; 32 bytes, each 20
+    times 32 db 20
+maddubsw_m5p20_256:                     ; byte pairs (-5, 20) over 32 bytes
+    times 16 db -5, 20
+maddubsw_p20m5_256:                     ; byte pairs (20, -5) over 32 bytes
+    times 16 db 20, -5
+h264_w0x10_256:                         ; 16 words, each 16
+    times 16 dw 16
+dw32_256:                               ; 16 words, each 32
+    times 16 dw 32
+%endif ; HAVE_AVX2
+
 ALIGN 16
-h264_w0x10:
-    dw 16, 16, 16, 16
-ALIGN 16
+shufb_32435465768798A9:                 ; pshufb mask: word i selects source bytes (i+3, i+2)
+    db 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9
+shufb_011267784556ABBC:                 ; pshufb mask: byte pairs (0,1) (1,2) (6,7) (7,8) (4,5) (5,6) (10,11) (11,12)
+    db 0, 1, 1, 2, 6, 7, 7, 8, 4, 5, 5, 6, 0Ah, 0Bh, 0Bh, 0Ch
+maddubsw_p1m5_p1m5_m5p1_m5p1_128:       ; per 8 bytes: pairs (1,-5) (1,-5) (-5,1) (-5,1); lines up with the pair order of shufb_011267784556ABBC
+    times 2 db 1, -5, 1, -5, -5, 1, -5, 1
+maddubsw_m2p10_m40m40_p10m2_p0p0_128:   ; per 8 bytes: -2,10,-40,-40,10,-2,0,0 (last two source bytes are ignored)
+    times 2 db -2, 10, -40, -40, 10, -2, 0, 0
+dwm1024_128:                            ; 8 words, each -1024
+    times 8 dw -1024
+dd32768_128:                            ; 4 dwords, each 32768
+    times 4 dd 32768
+maddubsw_p1m5_128:                      ; byte pairs (1, -5) over 16 bytes
+    times 8 db 1, -5
+maddubsw_m5p1_128:                      ; byte pairs (-5, 1) over 16 bytes
+    times 8 db -5, 1
+db20_128:                               ; 16 bytes, each 20
+    times 16 db 20
+maddubsw_m5p20_128:                     ; byte pairs (-5, 20) over 16 bytes
+    times 8 db -5, 20
+maddubsw_p20m5_128:                     ; byte pairs (20, -5) over 16 bytes
+    times 8 db 20, -5
 h264_w0x10_1:
     dw 16, 16, 16, 16, 16, 16, 16, 16
 ALIGN 16
@@ -85,7 +130,7 @@
 
     sub r0, 2
     WELS_Zero mm7
-    movq mm6, [h264_w0x10]
+    movq mm6, [h264_w0x10_1]
 .height_loop:
     movd mm0, [r0]
     punpcklbw mm0, mm7
@@ -1746,3 +1791,2559 @@
 LOAD_6_PARA_POP
 ret
 
+; Per 16-bit lane: %1 = (pmaddubsw(%1,%4) + pmaddubsw(%2,%5) + pmaddubsw(%3,%6) + 16) >> 5 (arithmetic shift). %2 and %3 are only read; %7 is clobbered.
+; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7
+%macro SSSE3_FilterVertical_8px 7
+    pmaddubsw       %1, %4              ; unsigned bytes of %1 times signed bytes of %4, adjacent pairs summed into words
+    movdqa          %7, %2              ; work on a copy so %2 survives for the next call
+    pmaddubsw       %7, %5
+    paddw           %1, %7
+    movdqa          %7, %3              ; likewise for %3
+    pmaddubsw       %7, %6
+    paddw           %1, %7
+    paddw           %1, [h264_w0x10_1]  ; rounding term: 16 in every word
+    psraw           %1, 5
+%endmacro
+; Per 16-bit lane: %1 = (zext(low 8 bytes of %1) + zext(low 8 bytes of %2) + pmaddubsw(%3,%5) + pmaddubsw(%4,%6) + 16) >> 5. %2, %3, %4 are only read; %7 and %8 are clobbered.
+; px_a=%1 px_f=%2 px_bc=%3 px_de=%4 maddubsw_bc=%5 maddubsw_de=%6 tmp=%7,%8
+%macro SSSE3_FilterVertical2_8px 8
+    movdqa          %8, %2              ; copy so %2 keeps its packed bytes
+    pxor            %7, %7              ; zero register for the byte -> word widening below
+    punpcklbw       %1, %7              ; widen low 8 bytes of %1 to words
+    punpcklbw       %8, %7              ; widen low 8 bytes of the %2 copy to words
+    paddw           %1, %8
+    movdqa          %7, %3
+    pmaddubsw       %7, %5
+    paddw           %1, %7
+    movdqa          %7, %4
+    pmaddubsw       %7, %6
+    paddw           %1, %7
+    paddw           %1, [h264_w0x10_1]  ; rounding term: 16 in every word
+    psraw           %1, 5
+%endmacro
+; Unrounded horizontal 6-tap sums for 8 pixels: with the masks/coefficients named below, word i of %1 becomes p[i] - 5*p[i+1] + 20*p[i+2] + 20*p[i+3] - 5*p[i+4] + p[i+5], p[] being the input bytes of %1. %5 and %6 are clobbered.
+; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
+%macro SSSE3_FilterHorizontalbw_8px 6
+    movdqa          %5, %1
+    pshufb          %1, %2              ; word i <- byte pair (p[i+3], p[i+2]): the two centre taps
+    pshufb          %5, %3              ; outer-tap pairs, half of them leading (p[i],p[i+1]) and half trailing (p[i+4],p[i+5])
+    pshufd          %6, %1, 10110001b   ; swap adjacent dwords of the centre pairs to obtain the complementary outer-tap pairs
+    pmaddubsw       %1, [db20_128]      ; 20 * (p[i+2] + p[i+3])
+    pmaddubsw       %5, %4
+    pmaddubsw       %6, %4
+    paddw           %1, %5
+    paddw           %1, %6
+%endmacro
+; Rounded variant of SSSE3_FilterHorizontalbw_8px: each word of %1 becomes (sum + 16) >> 5. %5 and %6 are clobbered.
+; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
+%macro SSSE3_FilterHorizontal_8px 6
+    SSSE3_FilterHorizontalbw_8px %1, %2, %3, %4, %5, %6
+    paddw           %1, [h264_w0x10_1]  ; rounding term: 16 in every word
+    psraw           %1, 5
+%endmacro
+; Unrounded horizontal 6-tap sums for two rows of 4 pixels: low four words of %1 come from row %1, high four words from row %2. %2, %6 and %7 are clobbered.
+; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
+%macro SSSE3_FilterHorizontalbw_2x4px 7
+    movdqa          %6, %1
+    movdqa          %7, %2
+    pshufb          %1, %3              ; centre-tap byte pairs of row 0
+    pshufb          %2, %3              ; centre-tap byte pairs of row 1
+    punpcklqdq      %1, %2              ; keep only the first four pairs of each row: row 0 low qword, row 1 high qword
+    pshufb          %6, %4              ; outer-tap byte pairs of row 0
+    pshufb          %7, %4              ; outer-tap byte pairs of row 1
+    punpcklqdq      %6, %7              ; same row packing as above
+    pshufd          %7, %1, 10110001b   ; swap adjacent dwords of the centre pairs to obtain the complementary outer-tap pairs
+    pmaddubsw       %1, [db20_128]      ; 20 * (sum of the two centre taps)
+    pmaddubsw       %6, %5
+    pmaddubsw       %7, %5
+    paddw           %1, %6
+    paddw           %1, %7
+%endmacro
+; Rounded variant of SSSE3_FilterHorizontalbw_2x4px: each word of %1 becomes (sum + 16) >> 5. %2, %6 and %7 are clobbered.
+; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
+%macro SSSE3_FilterHorizontal_2x4px 7
+    SSSE3_FilterHorizontalbw_2x4px %1, %2, %3, %4, %5, %6, %7
+    paddw           %1, [h264_w0x10_1]  ; rounding term: 16 in every word
+    psraw           %1, 5
+%endmacro
+; One 6-tap sum per qword of %1: for qword bytes a..f (last two ignored) both dwords of that qword end up as -2 * %2 * (a - 5b + 20c + 20d - 5e + f), assuming %2 holds one word value in every lane. %3 is clobbered.
+; pixels=%1 -32768>>scale=%2 tmp=%3
+%macro SSSE3_FilterHorizontalbw_2px 3
+    pmaddubsw       %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_128]  ; words per qword: -2a+10b, -40c-40d, 10e-2f, 0
+    pmaddwd         %1, %2              ; scale by %2 and add word pairs into dwords
+    pshufd          %3, %1, 10110001b   ; swap adjacent dwords
+    paddd           %1, %3              ; both dwords of each qword now hold the full scaled sum
+%endmacro
+; Rounded variant of SSSE3_FilterHorizontalbw_2px: each dword becomes 2048 * (sum + 16), so its upper 16 bits equal (sum + 16) >> 5. %2 is clobbered.
+; pixels=%1 tmp=%2
+%macro SSSE3_FilterHorizontal_2px 2
+    SSSE3_FilterHorizontalbw_2px %1, [dwm1024_128], %2
+    paddd           %1, [dd32768_128]   ; 32768 = 16 * 2048, i.e. the rounding term at the scaled magnitude
+%endmacro
+; 6-tap filter over six vectors of 16-bit values, done in stages to stay within 16 bits: t = ((((%1+%6) - (%2+%5)) >> 2) - (%2+%5) + (%3+%4)) >> 2; %1 = (t + (%3+%4) + [h264_mc_hc_32]) >> 6. %2..%6 are only read; %7 is clobbered.
+; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
+%macro SSE2_FilterVerticalw_8px 7
+    paddw           %1, %6              ; outer taps: px0 + px5
+    movdqa          %7, %2
+    paddw           %7, %5              ; px1 + px4
+    psubw           %1, %7
+    psraw           %1, 2
+    psubw           %1, %7              ; second subtraction of (px1 + px4) after the first >> 2
+    movdqa          %7, %3
+    paddw           %7, %4              ; centre taps: px2 + px3
+    paddw           %1, %7
+    psraw           %1, 2
+    paddw           %7, [h264_mc_hc_32] ; NOTE(review): constant defined outside this hunk; presumably the rounding value 32 per word - confirm
+    paddw           %1, %7
+    psraw           %1, 6
+%endmacro
+
+;***********************************************************************
+; void McHorVer02_ssse3(const uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;                       uint8_t *pDst,
+;                       int32_t iDstStride,
+;                       int32_t iWidth,
+;                       int32_t iHeight)
+;***********************************************************************
+; Vertical 6-tap filter (taps 1,-5,20,20,-5,1; +16, >>5, saturated to bytes). Reads start two rows above pSrc. iWidth <= 4 uses an unrolled path producing 4, 5, 8 or 9 rows; wider blocks are processed in 8-pixel columns.
+WELS_EXTERN McHorVer02_ssse3
+%define p_src         r0
+%define i_srcstride   r1
+%define p_dst         r2
+%define i_dststride   r3
+%define i_width       r4
+%define i_height      r5
+%define i_srcstride3  r6
+    %assign push_num 0
+%ifdef X86_32
+    push            r6                  ; r6 holds 3 * stride below; saved here and restored before every ret on 32-bit builds
+    %assign push_num 1
+%endif
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
+    sub             p_src, i_srcstride  ; step back two rows: the first output row needs source rows -2..+3
+    sub             p_src, i_srcstride
+    lea             i_srcstride3, [3 * i_srcstride]
+    cmp             i_width, 4
+    jg              .width8or16
+    movd            xmm0, [p_src]       ; width-4 path: each register packs two interleaved row pairs so one filter call yields two output rows
+    movd            xmm4, [p_src + i_srcstride]
+    punpcklbw       xmm0, xmm4
+    movd            xmm1, [p_src + 2 * i_srcstride]
+    punpcklbw       xmm4, xmm1
+    punpcklqdq      xmm0, xmm4          ; low qword: rows 0/1 interleaved, high qword: rows 1/2 interleaved
+    movd            xmm4, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    punpcklbw       xmm1, xmm4
+    movd            xmm2, [p_src]
+    punpcklbw       xmm4, xmm2
+    punpcklqdq      xmm1, xmm4          ; rows 2/3 and 3/4
+    movd            xmm4, [p_src + i_srcstride]
+    lea             p_src, [p_src + 2 * i_srcstride]
+    punpcklbw       xmm2, xmm4
+    movd            xmm3, [p_src]
+    punpcklbw       xmm4, xmm3
+    punpcklqdq      xmm2, xmm4          ; rows 4/5 and 5/6
+    movdqa          xmm5, [db20_128]
+    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+    packuswb        xmm0, xmm0
+    movd            [p_dst], xmm0       ; output row 0
+    psrlq           xmm0, 32
+    movd            [p_dst + i_dststride], xmm0  ; output row 1
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    movd            xmm4, [p_src + i_srcstride]
+    punpcklbw       xmm3, xmm4
+    movd            xmm0, [p_src + 2 * i_srcstride]
+    punpcklbw       xmm4, xmm0
+    punpcklqdq      xmm3, xmm4
+    SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+    packuswb        xmm1, xmm1
+    movd            [p_dst], xmm1       ; output row 2
+    psrlq           xmm1, 32
+    movd            [p_dst + i_dststride], xmm1  ; output row 3
+    cmp             i_height, 5
+    jl              .width4_height_le5_done  ; height < 5: the four rows above are all that is needed
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    movd            xmm4, [p_src + i_srcstride3]
+    punpcklbw       xmm0, xmm4
+    jg              .width4_height_ge8  ; still testing "cmp i_height, 5": lea/movd/punpcklbw do not modify flags
+    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+    packuswb        xmm2, xmm2
+    movd            [p_dst], xmm2       ; height == 5: fifth and last output row
+.width4_height_le5_done:
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop             r6
+%endif
+    ret
+.width4_height_ge8:
+    lea             p_src, [p_src + 4 * i_srcstride]
+    movd            xmm1, [p_src]
+    punpcklbw       xmm4, xmm1
+    punpcklqdq      xmm0, xmm4
+    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+    packuswb        xmm2, xmm2
+    movd            [p_dst], xmm2       ; output row 4
+    psrlq           xmm2, 32
+    movd            [p_dst + i_dststride], xmm2  ; output row 5
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    movd            xmm4, [p_src + i_srcstride]
+    punpcklbw       xmm1, xmm4
+    movd            xmm2, [p_src + 2 * i_srcstride]
+    punpcklbw       xmm4, xmm2
+    punpcklqdq      xmm1, xmm4
+    SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+    packuswb        xmm3, xmm3
+    movd            [p_dst], xmm3       ; output row 6
+    psrlq           xmm3, 32
+    movd            [p_dst + i_dststride], xmm3  ; output row 7
+    cmp             i_height, 9
+    jl              .width4_height_ge8_done  ; height < 9: stop after eight rows
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    movd            xmm4, [p_src + i_srcstride3]
+    punpcklbw       xmm2, xmm4
+    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+    packuswb        xmm0, xmm0
+    movd            [p_dst], xmm0       ; ninth and last output row
+.width4_height_ge8_done:
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop             r6
+%endif
+    ret
+
+.width8or16:
+    sub             i_height, 1
+    push            i_height            ; keep height-1 on the stack for reloading per column; from here i_ycnt names the register and i_height names [r7] (NOTE(review): r7 presumably aliases the stack pointer - confirm)
+%xdefine i_ycnt i_height
+%define i_height [r7]
+.xloop:
+    push            p_src               ; remember the column's top so the next column can start 8 pixels to the right
+    push            p_dst
+    test            i_ycnt, 1
+    jnz             .yloop_begin_even   ; height-1 odd, i.e. even height: no single leading row needed
+    movq            xmm0, [p_src]       ; odd height: produce one row up front so the unrolled loop handles the rest
+    movq            xmm1, [p_src + i_srcstride]
+    punpcklbw       xmm0, xmm1
+    movq            xmm2, [p_src + 2 * i_srcstride]
+    movq            xmm3, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    punpcklbw       xmm2, xmm3
+    movq            xmm4, [p_src]
+    movq            xmm5, [p_src + i_srcstride]
+    lea             p_src, [p_src + 2 * i_srcstride]
+    punpcklbw       xmm4, xmm5
+    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm7
+    packuswb        xmm0, xmm0
+    movlps          [p_dst], xmm0
+    add             p_dst, i_dststride
+    jmp             .yloop
+.yloop_begin_even:
+    movq            xmm1, [p_src]       ; prime the same register layout the odd path leaves behind: xmm1/xmm3/xmm5 raw rows, xmm2/xmm4 interleaved pairs
+    movq            xmm2, [p_src + i_srcstride]
+    movq            xmm3, [p_src + 2 * i_srcstride]
+    add             p_src, i_srcstride3
+    punpcklbw       xmm2, xmm3
+    movq            xmm4, [p_src]
+    movq            xmm5, [p_src + i_srcstride]
+    lea             p_src, [p_src + 2 * i_srcstride]
+    punpcklbw       xmm4, xmm5
+.yloop:
+    movq            xmm6, [p_src]       ; each pass emits up to 8 rows, two per store pair; register roles rotate between steps
+    SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm0, xmm7
+    movq            xmm7, [p_src + i_srcstride]
+    punpcklbw       xmm6, xmm7
+    SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm0
+    packuswb        xmm1, xmm2
+    movlps          [p_dst], xmm1
+    movhps          [p_dst + i_dststride], xmm1
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    movq            xmm0, [p_src + 2 * i_srcstride]
+    SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm2, xmm1
+    movq            xmm1, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    punpcklbw       xmm0, xmm1
+    SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm2
+    packuswb        xmm3, xmm4
+    movlps          [p_dst], xmm3
+    movhps          [p_dst + i_dststride], xmm3
+    cmp             i_ycnt, 4
+    jle             .yloop_exit         ; at most four rows were left: column finished
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    movq            xmm2, [p_src]
+    SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm4, xmm3
+    movq            xmm3, [p_src + i_srcstride]
+    punpcklbw       xmm2, xmm3
+    SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm4
+    packuswb        xmm5, xmm6
+    movlps          [p_dst], xmm5
+    movhps          [p_dst + i_dststride], xmm5
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    movq            xmm4, [p_src + 2 * i_srcstride]
+    SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm6, xmm5
+    movq            xmm5, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    punpcklbw       xmm4, xmm5
+    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm6
+    packuswb        xmm7, xmm0
+    movlps          [p_dst], xmm7
+    movhps          [p_dst + i_dststride], xmm7
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    sub             i_ycnt, 8
+    jg              .yloop
+.yloop_exit:
+    pop             p_dst
+    pop             p_src
+    sub             i_width, 8
+    jle             .width8or16_done    ; no further 8-pixel column
+    add             p_src, 8
+    add             p_dst, 8
+    mov             i_ycnt, i_height    ; reload height-1 from the stack slot
+    jmp             .xloop
+.width8or16_done:
+    pop             i_ycnt              ; drop the saved height-1 to rebalance the stack
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop             r6
+%endif
+    ret
+%undef p_src
+%undef i_srcstride
+%undef i_srcstride3
+%undef p_dst
+%undef i_dststride
+%undef i_width
+%undef i_height
+%undef i_ycnt
+
+
+;*******************************************************************************
+; void McHorVer20_ssse3(const uint8_t *pSrc,
+;                       int iSrcStride,
+;                       uint8_t *pDst,
+;                       int iDstStride,
+;                       int iWidth,
+;                       int iHeight);
+;*******************************************************************************
+; Horizontal 6-tap filter with rounding, saturated to bytes. iWidth == 8 and iWidth < 8 (4-pixel path) emit two rows per iteration; iWidth > 8 (16-pixel path) emits one row per iteration.
+WELS_EXTERN McHorVer20_ssse3
+%define p_src        r0
+%define i_srcstride  r1
+%define p_dst        r2
+%define i_dststride  r3
+%define i_width      r4
+%define i_height     r5
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 7
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
+    movdqa          xmm4, [shufb_32435465768798A9]  ; shuffle masks and coefficients stay resident for all loops
+    movdqa          xmm5, [shufb_011267784556ABBC]
+    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+    cmp             i_width, 8
+    je              .width8_yloop
+    jg              .width16_yloop
+.width4_yloop:
+    movdqu          xmm0, [p_src - 2]   ; the filter window starts two pixels left of the output position
+    movdqu          xmm1, [p_src + i_srcstride - 2]
+    lea             p_src, [p_src + 2 * i_srcstride]
+    SSSE3_FilterHorizontal_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
+    packuswb        xmm0, xmm0
+    movd            [p_dst], xmm0       ; first row: 4 bytes
+    psrlq           xmm0, 32
+    movd            [p_dst + i_dststride], xmm0  ; second row: 4 bytes
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    sub             i_height, 2         ; two rows per pass; an odd iHeight would write one extra row (callers presumably pass even heights - confirm)
+    jg              .width4_yloop
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+.width8_yloop:
+    movdqu          xmm0, [p_src - 2]
+    movdqu          xmm1, [p_src + i_srcstride - 2]
+    lea             p_src, [p_src + 2 * i_srcstride]
+    SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
+    SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
+    packuswb        xmm0, xmm1          ; low 8 bytes: first row, high 8 bytes: second row
+    movlps          [p_dst], xmm0
+    movhps          [p_dst + i_dststride], xmm0
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    sub             i_height, 2         ; two rows per pass, same even-height assumption as the 4-pixel loop
+    jg              .width8_yloop
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+.width16_yloop:
+    movdqu          xmm0, [p_src - 2]   ; window for output pixels 0..7
+    movdqu          xmm1, [p_src + 6]   ; window for output pixels 8..15
+    add             p_src, i_srcstride
+    SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
+    SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
+    packuswb        xmm0, xmm1
+    MOVDQ           [p_dst], xmm0
+    add             p_dst, i_dststride
+    sub             i_height, 1
+    jg              .width16_yloop
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_width
+%undef i_height
+
+
+;***********************************************************************
+; void McHorVer20Width5Or9Or17_ssse3(const uint8_t *pSrc,
+;                                    int32_t iSrcStride,
+;                                    uint8_t *pDst,
+;                                    int32_t iDstStride,
+;                                    int32_t iWidth,
+;                                    int32_t iHeight);
+;***********************************************************************
+; Horizontal 6-tap filter with rounding for the odd widths: iWidth == 9 and iWidth > 9 (17-pixel path) emit two rows per iteration, anything below 9 takes the 5-pixel path with one row per iteration.
+WELS_EXTERN McHorVer20Width5Or9Or17_ssse3
+%define p_src        r0
+%define i_srcstride  r1
+%define p_dst        r2
+%define i_dststride  r3
+%define i_width      r4
+%define i_height     r5
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
+    movdqa          xmm5, [shufb_32435465768798A9]  ; shuffle masks and coefficients stay resident for all loops
+    movdqa          xmm6, [shufb_011267784556ABBC]
+    movdqa          xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+    cmp             i_width, 9
+    je              .width9_yloop
+    jg              .width17_yloop
+.width5_yloop:
+    movdqu          xmm0, [p_src - 2]
+    add             p_src, i_srcstride
+    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
+    packuswb        xmm0, xmm0
+    movdqa          xmm1, xmm0
+    psrlq           xmm1, 8             ; shift by one byte so the second store covers pixels 1..4
+    movd            [p_dst], xmm0       ; pixels 0..3
+    movd            [p_dst + 1], xmm1   ; pixels 1..4: two overlapping 4-byte stores write exactly 5 bytes
+    add             p_dst, i_dststride
+    sub             i_height, 1
+    jg              .width5_yloop
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+.width9_yloop:
+    movdqu          xmm0, [p_src - 2]
+    movdqu          xmm4, [p_src + i_srcstride - 2]
+    lea             p_src, [p_src + 2 * i_srcstride]
+    movdqa          xmm3, xmm0
+    punpckhqdq      xmm3, xmm4          ; upper halves of both rows: the windows for pixel 8 of each row
+    SSSE3_FilterHorizontal_2px xmm3, xmm2
+    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
+    packuswb        xmm3, xmm0          ; low 8 bytes: packed 2px results (both rows), high 8 bytes: pixels 0..7 of the first row
+    movd            [p_dst + 5], xmm3   ; only the byte landing at p_dst + 8 survives; bytes 5..7 are overwritten by the next store
+    movhps          [p_dst], xmm3       ; pixels 0..7 of the first row
+    add             p_dst, i_dststride
+    SSSE3_FilterHorizontal_8px xmm4, xmm5, xmm6, xmm7, xmm1, xmm2
+    packuswb        xmm4, xmm4
+    psrldq          xmm3, 4             ; bring the second row's 2px result into the low dword
+    movd            [p_dst + 5], xmm3   ; same trick for the second row: byte at p_dst + 8 is the one kept
+    movlps          [p_dst], xmm4       ; pixels 0..7 of the second row
+    add             p_dst, i_dststride
+    sub             i_height, 2
+    jg              .width9_yloop
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+.width17_yloop:
+    movdqu          xmm0, [p_src - 2]
+    movdqu          xmm3, [p_src + 6]
+    add             p_src, i_srcstride
+    movdqa          xmm4, xmm3          ; keep the raw right-hand load: its upper half is the window for pixel 16
+    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
+    SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2
+    packuswb        xmm0, xmm3          ; pixels 0..15 of the first row
+    movdqu          xmm1, [p_src - 2]   ; second row loads
+    movdqu          xmm3, [p_src + 6]
+    add             p_src, i_srcstride
+    punpckhqdq      xmm4, xmm3          ; pixel-16 windows of both rows in one register
+    SSSE3_FilterHorizontal_2px xmm4, xmm2
+    packuswb        xmm4, xmm4
+    movd            [p_dst + 13], xmm4  ; byte landing at p_dst + 16 is pixel 16; the store below covers the earlier bytes (assuming MOVDQ is a 16-byte store)
+    MOVDQ           [p_dst], xmm0
+    add             p_dst, i_dststride
+    psrldq          xmm4, 4             ; second row's pixel-16 result into the low dword
+    movd            [p_dst + 13], xmm4
+    SSSE3_FilterHorizontal_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm2
+    SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm0, xmm2
+    packuswb        xmm1, xmm3
+    MOVDQ           [p_dst], xmm1
+    add             p_dst, i_dststride
+    sub             i_height, 2
+    jg              .width17_yloop
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_width
+%undef i_height
+
+
+;*******************************************************************************
+; void McHorVer20Width4U8ToS16_ssse3(const uint8_t *pSrc,
+;                                    int iSrcStride,
+;                                    int16_t *pDst,
+;                                    int iHeight);
+;*******************************************************************************
+; Unrounded horizontal 6-tap sums for a 4-pixel-wide column, stored as int16 with a fixed 8-byte row pitch. Reading starts two rows above pSrc. pDst must be 16-byte aligned (movdqa store).
+WELS_EXTERN McHorVer20Width4U8ToS16_ssse3
+%define p_src        r0
+%define i_srcstride  r1
+%define p_dst        r2
+%define i_height     r3
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 7
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    sub             p_src, i_srcstride  ; start two rows above pSrc
+    sub             p_src, i_srcstride
+    movdqa          xmm4, [shufb_32435465768798A9]
+    movdqa          xmm5, [shufb_011267784556ABBC]
+    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+    sub             i_height, 1         ; one row is reserved for the unconditional tail after the loop
+.yloop:
+    movdqu          xmm0, [p_src - 2]
+    movdqu          xmm1, [p_src + i_srcstride - 2]
+    lea             p_src, [p_src + 2 * i_srcstride]
+    SSSE3_FilterHorizontalbw_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
+    movdqa          [p_dst], xmm0       ; two rows of four int16 each
+    add             p_dst, 16
+    sub             i_height, 2
+    jg              .yloop
+    ; Height % 2 remainder. NOTE(review): executed unconditionally, so the row total equals iHeight only for odd iHeight; an even iHeight yields iHeight + 1 rows.
+    movdqu          xmm0, [p_src - 2]
+    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
+    movlps          [p_dst], xmm0       ; only the first four sums are stored
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_height
+
+
+;***********************************************************************
+; void McHorVer02Width4S16ToU8_ssse3(const int16_t *pSrc,
+;                                    uint8_t *pDst,
+;                                    int32_t iDstStride,
+;                                    int32_t iHeight);
+;***********************************************************************
+; Vertical 6-tap pass over int16 rows of 4 values (8-byte row pitch), producing 4-pixel-wide byte rows: 4 rows when iHeight <= 4, otherwise 8. pSrc must be 16-byte aligned (movdqa loads).
+WELS_EXTERN McHorVer02Width4S16ToU8_ssse3
+%define p_src        r0
+%define p_dst        r1
+%define i_dststride  r2
+%define i_height     r3
+%define i_srcstride  8
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    movdqa          xmm0, [p_src +  0 * i_srcstride]  ; each 16-byte load spans two consecutive 8-byte rows, so one filter call yields two output rows
+    movdqu          xmm1, [p_src +  1 * i_srcstride]  ; odd row offsets are only 8-byte aligned, hence movdqu
+    movdqa          xmm2, [p_src +  2 * i_srcstride]
+    movdqu          xmm3, [p_src +  3 * i_srcstride]
+    movdqa          xmm4, [p_src +  4 * i_srcstride]
+    movdqu          xmm5, [p_src +  5 * i_srcstride]
+    movdqa          xmm6, [p_src +  6 * i_srcstride]
+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
+    packuswb        xmm0, xmm0
+    movd            [p_dst], xmm0       ; output row 0
+    psrlq           xmm0, 32
+    movd            [p_dst + i_dststride], xmm0  ; output row 1
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    movdqu          xmm7, [p_src +  7 * i_srcstride]
+    movdqa          xmm0, [p_src +  8 * i_srcstride]
+    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm1
+    packuswb        xmm2, xmm2
+    movd            [p_dst], xmm2       ; output row 2
+    psrlq           xmm2, 32
+    movd            [p_dst + i_dststride], xmm2  ; output row 3
+    cmp             i_height, 4
+    jle             .done               ; four rows requested: finished
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    movdqu          xmm1, [p_src +  9 * i_srcstride]
+    movdqa          xmm2, [p_src + 10 * i_srcstride]
+    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm3
+    packuswb        xmm4, xmm4
+    movd            [p_dst], xmm4       ; output row 4
+    psrlq           xmm4, 32
+    movd            [p_dst + i_dststride], xmm4  ; output row 5
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    movdqu          xmm3, [p_src + 11 * i_srcstride]
+    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm5
+    packuswb        xmm6, xmm6
+    movd            [p_dst], xmm6       ; output row 6
+    psrlq           xmm6, 32
+    movd            [p_dst + i_dststride], xmm6  ; output row 7
+.done:
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
+%undef p_src
+%undef p_dst
+%undef i_dststride
+%undef i_height
+%undef i_srcstride
+
+
+;***********************************************************************
+; void McHorVer20Width8U8ToS16_ssse3(const uint8_t *pSrc,
+;                                    int32_t iSrcStride,
+;                                    int16_t *pDst,
+;                                    int32_t iDstStride,
+;                                    int32_t iHeight);
+;***********************************************************************
+; Unrounded horizontal 6-tap sums for an 8-pixel-wide column, stored as eight int16 per row. Reading starts two rows above pSrc; iDstStride is added to pDst as a byte offset.
+WELS_EXTERN McHorVer20Width8U8ToS16_ssse3
+%define p_src        r0
+%define i_srcstride  r1
+%define p_dst        r2
+%define i_dststride  r3
+%define i_height     r4
+    %assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 7
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    sub             p_src, i_srcstride  ; start two rows above pSrc
+    sub             p_src, i_srcstride
+    movdqa          xmm4, [shufb_32435465768798A9]
+    movdqa          xmm5, [shufb_011267784556ABBC]
+    movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+    sub             i_height, 1         ; bias the counter so the flags after the loop tell whether one row is left over
+.yloop:
+    movdqu          xmm0, [p_src - 2]
+    movdqu          xmm1, [p_src + i_srcstride - 2]
+    lea             p_src, [p_src + 2 * i_srcstride]
+    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
+    MOVDQ           [p_dst], xmm0
+    add             p_dst, i_dststride
+    SSSE3_FilterHorizontalbw_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
+    MOVDQ           [p_dst], xmm1
+    add             p_dst, i_dststride
+    sub             i_height, 2
+    jg              .yloop
+    jl              .done               ; counter went negative: iHeight was even and every row is written
+    movdqu          xmm0, [p_src - 2]   ; counter hit zero: iHeight was odd, one final row remains
+    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
+    MOVDQ           [p_dst], xmm0
+.done:
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_height
+
+
+;***********************************************************************
+; void McHorVer02Width5S16ToU8_ssse3(const int16_t *pSrc,
+;                                    int32_t iTapStride,
+;                                    uint8_t *pDst,
+;                                    int32_t iDstStride,
+;                                    int32_t iHeight);
+;***********************************************************************
+; Vertical 6-tap pass over int16 rows (row pitch iTapStride, used as a byte offset), writing 5 bytes per output row: 5 rows when iHeight <= 5, otherwise 9. Every source row is read with movdqa, so rows must be 16-byte aligned.
+WELS_EXTERN McHorVer02Width5S16ToU8_ssse3
+%define p_src        r0
+%define i_srcstride  r1
+%define p_dst        r2
+%define i_dststride  r3
+%define i_height     r4
+%define i_srcstride3 r5
+    %assign  push_num 0
+%ifdef X86_32
+    push            r5                  ; r5 holds 3 * stride below; saved here and restored before ret on 32-bit builds
+    %assign  push_num 1
+%endif
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    lea             i_srcstride3, [3 * i_srcstride]
+    movdqa          xmm0, [p_src]       ; six-row sliding window; registers rotate roles from one output row to the next
+    movdqa          xmm1, [p_src + i_srcstride]
+    movdqa          xmm2, [p_src + 2 * i_srcstride]
+    movdqa          xmm3, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    movdqa          xmm4, [p_src]
+    movdqa          xmm5, [p_src + i_srcstride]
+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+    movdqa          xmm6, [p_src + 2 * i_srcstride]
+    packuswb        xmm0, xmm0
+    movdqa          xmm7, xmm0
+    psrlq           xmm7, 8             ; shift by one byte so the first store covers pixels 1..4
+    movd            [p_dst + 1], xmm7   ; pixels 1..4
+    movd            [p_dst], xmm0       ; pixels 0..3: the two overlapping stores write exactly 5 bytes (output row 0)
+    add             p_dst, i_dststride
+    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+    movdqa          xmm7, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    packuswb        xmm1, xmm1
+    movdqa          xmm0, xmm1
+    psrlq           xmm0, 8
+    movd            [p_dst + 1], xmm0
+    movd            [p_dst], xmm1       ; output row 1
+    add             p_dst, i_dststride
+    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0
+    movdqa          xmm0, [p_src]
+    packuswb        xmm2, xmm2
+    movdqa          xmm1, xmm2
+    psrlq           xmm1, 8
+    movd            [p_dst + 1], xmm1
+    movd            [p_dst], xmm2       ; output row 2
+    add             p_dst, i_dststride
+    SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
+    packuswb        xmm3, xmm3
+    movdqa          xmm2, xmm3
+    psrlq           xmm2, 8
+    movd            [p_dst + 1], xmm2
+    movd            [p_dst], xmm3       ; output row 3
+    add             p_dst, i_dststride
+    movdqa          xmm1, [p_src + i_srcstride]
+    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2
+    packuswb        xmm4, xmm4
+    movdqa          xmm3, xmm4
+    psrlq           xmm3, 8
+    movd            [p_dst + 1], xmm3
+    movd            [p_dst], xmm4       ; output row 4
+    cmp             i_height, 5
+    jle             .done               ; five rows requested: finished
+    add             p_dst, i_dststride
+    movdqa          xmm2, [p_src + 2 * i_srcstride]
+    SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3
+    movdqa          xmm3, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    packuswb        xmm5, xmm5
+    movdqa          xmm4, xmm5
+    psrlq           xmm4, 8
+    movd            [p_dst + 1], xmm4
+    movd            [p_dst], xmm5       ; output row 5
+    add             p_dst, i_dststride
+    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4
+    movdqa          xmm4, [p_src]
+    packuswb        xmm6, xmm6
+    movdqa          xmm5, xmm6
+    psrlq           xmm5, 8
+    movd            [p_dst + 1], xmm5
+    movd            [p_dst], xmm6       ; output row 6
+    add             p_dst, i_dststride
+    SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    packuswb        xmm7, xmm7
+    movdqa          xmm6, xmm7
+    psrlq           xmm6, 8
+    movd            [p_dst + 1], xmm6
+    movd            [p_dst], xmm7       ; output row 7
+    add             p_dst, i_dststride
+    movdqa          xmm5, [p_src + i_srcstride]
+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+    packuswb        xmm0, xmm0
+    movdqa          xmm7, xmm0
+    psrlq           xmm7, 8
+    movd            [p_dst + 1], xmm7
+    movd            [p_dst], xmm0       ; output row 8
+.done:
+    POP_XMM
+    LOAD_5_PARA_POP
+%ifdef X86_32
+    pop             r5
+%endif
+    ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_height
+%undef i_srcstride3
+
+
+;***********************************************************************
+; void McHorVer20Width9Or17U8ToS16_ssse3(const uint8_t *pSrc,
+;                                        int32_t iSrcStride,
+;                                        int16_t *pDst,
+;                                        int32_t iDstStride,
+;                                        int32_t iWidth,
+;                                        int32_t iHeight);
+;***********************************************************************
+; Unrounded horizontal 6-tap sums stored as int16, 9 values per row when iWidth == 9 and 17 otherwise. Reading starts two rows above pSrc; both loops emit two rows per iteration; iDstStride is added to pDst as a byte offset.
+WELS_EXTERN McHorVer20Width9Or17U8ToS16_ssse3
+%define p_src       r0
+%define i_srcstride r1
+%define p_dst       r2
+%define i_dststride r3
+%define i_width     r4
+%define i_height    r5
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
+    sub             p_src, i_srcstride  ; start two rows above pSrc
+    sub             p_src, i_srcstride
+    pcmpeqw         xmm4, xmm4          ; all ones ...
+    psllw           xmm4, 15                                ; dw -32768
+    movdqa          xmm5, [shufb_32435465768798A9]
+    movdqa          xmm6, [shufb_011267784556ABBC]
+    movdqa          xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+    cmp             i_width, 9
+    jne             .width17_yloop
+
+.width9_yloop:
+    movdqu          xmm0, [p_src - 2]
+    movdqa          xmm3, xmm0          ; keep the raw load: its upper half is the window for value 8
+    SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
+    movdqu          xmm2, [p_src + i_srcstride - 2]
+    lea             p_src, [p_src + 2 * i_srcstride]
+    punpckhqdq      xmm3, xmm2          ; value-8 windows of both rows in one register
+    SSSE3_FilterHorizontalbw_2px xmm3, xmm4, xmm1  ; scale -32768 gives 65536 * sum per dword, so the upper word of each dword is the sum itself
+    movlps          [p_dst + 10], xmm3  ; only the word landing at byte offset 16 (value 8) survives the 16-byte store below (assuming MOVDQ stores 16 bytes)
+    MOVDQ           [p_dst], xmm0       ; values 0..7 of the first row
+    add             p_dst, i_dststride
+    movhps          [p_dst + 10], xmm3  ; same trick for the second row, taken from the high qword
+    SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm1, xmm0
+    MOVDQ           [p_dst], xmm2       ; values 0..7 of the second row
+    add             p_dst, i_dststride
+    sub             i_height, 2
+    jg              .width9_yloop
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+
+.width17_yloop:
+    movdqu          xmm0, [p_src - 2]
+    movdqu          xmm3, [p_src + 6]
+    add             p_src, i_srcstride
+    SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
+    MOVDQ           [p_dst], xmm0       ; values 0..7 of the first row
+    movdqa          xmm0, xmm3          ; keep the raw right-hand load: its upper half is the window for value 16
+    SSSE3_FilterHorizontalbw_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2
+    movdqu          xmm2, [p_src + 6]   ; second row, right-hand load (p_src already advanced)
+    punpckhqdq      xmm0, xmm2          ; value-16 windows of both rows in one register
+    SSSE3_FilterHorizontalbw_2px xmm0, xmm4, xmm1
+    movdqu          xmm1, [p_src - 2]   ; second row, left-hand load
+    add             p_src, i_srcstride
+    movlps          [p_dst + 26], xmm0  ; only the word landing at byte offset 32 (value 16) survives the store below
+    MOVDQ           [p_dst + 16], xmm3  ; values 8..15 of the first row
+    add             p_dst, i_dststride
+    movhps          [p_dst + 26], xmm0  ; value 16 of the second row, same overlap trick
+    SSSE3_FilterHorizontalbw_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm3
+    MOVDQ           [p_dst], xmm1       ; values 0..7 of the second row
+    SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm0, xmm3
+    MOVDQ           [p_dst + 16], xmm2  ; values 8..15 of the second row
+    add             p_dst, i_dststride
+    sub             i_height, 2
+    jg              .width17_yloop
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_width
+%undef i_height
+
+
+;***********************************************************************
+; void McHorVer02WidthGe8S16ToU8_ssse3(const int16_t *pSrc,
+;                                      int32_t iSrcStride,
+;                                      uint8_t *pDst,
+;                                      int32_t iDstStride,
+;                                      int32_t iWidth,
+;                                      int32_t iHeight);
+;
+; Vertical pass over rows of 16-bit words, producing 8-bit pixels through
+; SSE2_FilterVerticalw_8px (defined elsewhere in this file) + packuswb.
+;   - iSrcStride is applied to pSrc unscaled, i.e. it is a byte stride.
+;   - The column loop reads each row with movdqa (16 bytes = 8 words) and
+;     advances by 16 source bytes / 8 destination bytes per column group.
+;   - If iWidth is odd, the last column is processed first by the
+;     "unalign" path; iWidth is then decremented and the remaining columns
+;     are processed in groups of 8.
+;   - iHeight - 1 is kept on the stack and reloaded through [r7] for every
+;     column group (r7 is presumably the stack-pointer alias - confirm
+;     against the project's asm include).
+;   - Every height exit of the column loop goes through .x_loop_dec, so all
+;     column groups are processed regardless of which exit is taken.
+;***********************************************************************
+
+WELS_EXTERN McHorVer02WidthGe8S16ToU8_ssse3
+%define p_src        r0
+%define i_srcstride  r1
+%define p_dst        r2
+%define i_dststride  r3
+%define i_width      r4
+%define i_height     r5
+%define i_srcstride3 r6
+    %assign  push_num 0
+%ifdef X86_32
+    push            r6
+    %assign  push_num 1
+%endif
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
+    ; All height comparisons below are against iHeight - 1; the value is
+    ; saved so it can be restored after each column pass consumes it.
+    sub             i_height, 1
+    push            i_height
+    lea             i_srcstride3, [3 * i_srcstride]
+    test            i_width, 1
+    jz              .width_loop
+    ; ---- odd width: handle the last column on its own ----
+    push            p_src
+    push            p_dst
+    ; p_src -> word of column iWidth-1; p_dst -> one past the last column,
+    ; so [p_dst - 1] is the last output column.
+    lea             p_src, [p_src + 2 * i_width - 2]
+    add             p_dst, i_width
+    ; Gather one word per row into xmm0 (rows 0..3 here).
+    movd            xmm0, [p_src]
+    punpcklwd       xmm0, [p_src + i_srcstride]
+    movd            xmm1, [p_src + 2 * i_srcstride]
+    add             p_src, i_srcstride3
+    punpcklwd       xmm1, [p_src]
+    punpckldq       xmm0, xmm1
+    movd            xmm1, [p_src + i_srcstride]
+    cmp             i_height, 4
+    je              .filter5_unalign
+    ; Rows 4..7 complete the 8-word column vector in xmm0.
+    punpcklwd       xmm1, [p_src + 2 * i_srcstride]
+    movd            xmm2, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    punpcklwd       xmm2, [p_src]
+    punpckldq       xmm1, xmm2
+    punpcklqdq      xmm0, xmm1
+.height_loop_unalign:
+    ; Each palignr shifts the column vector down by one row and appends the
+    ; next row's word, giving the six tap vectors xmm0..xmm5 for 8 output rows.
+    movd            xmm1, [p_src + i_srcstride]
+    palignr         xmm1, xmm0, 2
+    movd            xmm2, [p_src + 2 * i_srcstride]
+    palignr         xmm2, xmm1, 2
+    movd            xmm3, [p_src + i_srcstride3]
+    palignr         xmm3, xmm2, 2
+    lea             p_src, [p_src + 4 * i_srcstride]
+    movd            xmm4, [p_src]
+    palignr         xmm4, xmm3, 2
+    movd            xmm5, [p_src + i_srcstride]
+    palignr         xmm5, xmm4, 2
+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
+    packuswb        xmm0, xmm0
+    ; Bytes 0..7 of xmm0 are the results for 8 consecutive rows. For each
+    ; row the wanted byte is shifted into the top byte of its dword, and a
+    ; 4- or 8-byte store ending at [p_dst - 1] places it in the last
+    ; column. The bytes written to the left of it are rewritten by the
+    ; 8-column loop that runs afterwards.
+    movdqa          xmm6, xmm0
+    pslld           xmm6, 24
+    movd            [p_dst - 4], xmm6
+    movlps          [p_dst + 4 * i_dststride - 8], xmm6
+    add             p_dst, i_dststride
+    movdqa          xmm6, xmm0
+    pslld           xmm6, 16
+    movd            [p_dst - 4], xmm6
+    movlps          [p_dst + 4 * i_dststride - 8], xmm6
+    add             p_dst, i_dststride
+    movdqa          xmm6, xmm0
+    pslld           xmm6, 8
+    movd            [p_dst - 4], xmm6
+    movd            [p_dst + i_dststride - 4], xmm0
+    lea             p_dst, [p_dst + 4 * i_dststride]
+    movlps          [p_dst - 8], xmm6
+    movlps          [p_dst + i_dststride - 8], xmm0
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    sub             i_height, 8
+    jle             .height_loop_unalign_exit
+    ; More rows remain: rebuild xmm0 from xmm5 plus three further rows.
+    movd            xmm1, [p_src + 2 * i_srcstride]
+    palignr         xmm1, xmm5, 2
+    movd            xmm0, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    punpcklwd       xmm0, [p_src]
+    palignr         xmm0, xmm1, 4
+    jmp             .height_loop_unalign
+.height_loop_unalign_exit:
+    ; One final row: the 8-byte load at offset -6 puts the word at
+    ; [p_src + 2 * i_srcstride] in word lane 3, and movddup repeats it in lane 7.
+    movddup         xmm6, [p_src + 2 * i_srcstride - 6]
+    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+    packuswb        xmm1, xmm1
+    movlps          [p_dst - 8], xmm1
+    jmp             .unalign_done
+.filter5_unalign:
+    ; iHeight - 1 == 4: five output rows for the last column.
+    pslldq          xmm0, 8
+    palignr         xmm1, xmm0, 2
+    movd            xmm2, [p_src + 2 * i_srcstride]
+    palignr         xmm2, xmm1, 2
+    movd            xmm3, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    palignr         xmm3, xmm2, 2
+    movd            xmm4, [p_src]
+    palignr         xmm4, xmm3, 2
+    movd            xmm5, [p_src + i_srcstride]
+    palignr         xmm5, xmm4, 2
+    movd            xmm6, [p_src + 2 * i_srcstride]
+    palignr         xmm6, xmm5, 2
+    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+    packuswb        xmm1, xmm1
+    movdqa          xmm0, xmm1
+    psrlq           xmm1,  8
+    movdqa          xmm2, xmm0
+    psrlq           xmm2, 16
+    movdqa          xmm3, xmm0
+    psrlq           xmm3, 24
+    movd            [p_dst - 4], xmm0
+    movd            [p_dst + i_dststride - 4], xmm1
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    movd            [p_dst - 4], xmm2
+    movd            [p_dst + i_dststride - 4], xmm3
+    movlps          [p_dst + 2 * i_dststride - 8], xmm0
+.unalign_done:
+    pop             p_dst
+    pop             p_src
+    ; Restore iHeight - 1 (still on the stack) and drop the handled column.
+    mov             i_height, [r7]
+    sub             i_width, 1
+    ; ---- 8 columns at a time ----
+.width_loop:
+    push            p_src
+    push            p_dst
+    movdqa          xmm0, [p_src]
+    movdqa          xmm1, [p_src + i_srcstride]
+    movdqa          xmm2, [p_src + 2 * i_srcstride]
+    movdqa          xmm3, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    movdqa          xmm4, [p_src]
+.height_loop:
+    ; Rolling window of six rows across xmm0..xmm7: each filter call
+    ; overwrites its oldest row with the result, and the freed register is
+    ; reloaded with the next source row.
+    movdqa          xmm5, [p_src + i_srcstride]
+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+    movdqa          xmm6, [p_src + 2 * i_srcstride]
+    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+    movdqa          xmm7, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    packuswb        xmm0, xmm1
+    movlps          [p_dst], xmm0
+    movhps          [p_dst + i_dststride], xmm0
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0
+    movdqa          xmm0, [p_src]
+    SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
+    packuswb        xmm2, xmm3
+    movlps          [p_dst], xmm2
+    movhps          [p_dst + i_dststride], xmm2
+    ; Four rows written so far in this pass.
+    cmp             i_height, 4
+    jl              .x_loop_dec
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    movdqa          xmm1, [p_src + i_srcstride]
+    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2
+    ; Uses the flags of the cmp above (the intervening lea/SIMD
+    ; instructions are assumed not to modify EFLAGS).
+    je              .store_xmm4_exit
+    movdqa          xmm2, [p_src + 2 * i_srcstride]
+    SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3
+    movdqa          xmm3, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    packuswb        xmm4, xmm5
+    movlps          [p_dst], xmm4
+    movhps          [p_dst + i_dststride], xmm4
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4
+    movdqa          xmm4, [p_src]
+    SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    packuswb        xmm6, xmm7
+    movlps          [p_dst], xmm6
+    movhps          [p_dst + i_dststride], xmm6
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    ; Eight rows written in this iteration.
+    sub             i_height, 8
+    jg              .height_loop
+    jl              .x_loop_dec
+    ; Exactly one row left.
+    movdqa          xmm5, [p_src + i_srcstride]
+    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+    packuswb        xmm0, xmm0
+    movlps          [p_dst], xmm0
+.x_loop_dec:
+    ; Advance to the next group of 8 columns, or finish.
+    pop             p_dst
+    pop             p_src
+    sub             i_width, 8
+    jle             .done
+    mov             i_height, [r7]
+    add             p_src, 16
+    add             p_dst, 8
+    jmp             .width_loop
+.store_xmm4_exit:
+    ; Fifth (last) row when iHeight - 1 == 4. Continue through .x_loop_dec
+    ; so that any remaining column groups are processed as well, instead of
+    ; returning after the first group.
+    packuswb        xmm4, xmm4
+    movlps          [p_dst], xmm4
+    jmp             .x_loop_dec
+.done:
+    pop             i_height
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop             r6
+%endif
+    ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_width
+%undef i_height
+%undef i_srcstride3
+
+
+%ifdef HAVE_AVX2
+
+; Horizontal filter on the source bytes in %1. Leaves word sums in %1 with no
+; rounding or shift applied (AVX2_FilterHorizontal_16px adds those).
+; %1 is shuffled two ways (%2 and %3); the %2-shuffled copy is multiplied by
+; [db20_256], while the %3-shuffled copy and a dword-swapped copy of the
+; %2-shuffled data are multiplied by %4. The three products are summed.
+; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
+%macro AVX2_FilterHorizontalbw_16px 6
+    ; %5 must be derived before %1 is overwritten by the next shuffle.
+    vpshufb         %5, %1, %3
+    vpshufb         %1, %1, %2
+    ; Swap the two dwords of every dword pair.
+    vpshufd         %6, %1, 10110001b
+    vpmaddubsw      %1, %1, [db20_256]
+    vpmaddubsw      %5, %5, %4
+    vpmaddubsw      %6, %6, %4
+    vpaddw          %1, %1, %5
+    vpaddw          %1, %1, %6
+%endmacro
+
+; AVX2_FilterHorizontalbw_16px followed by adding [h264_w0x10_256] and an
+; arithmetic right shift by 5; the result words are left in %1.
+; %4 is forwarded unchanged as the maddubsw_p1m5_p1m5_m5p1_m5p1 operand of the
+; bw macro (the db20 multiplier is read from [db20_256] inside that macro).
+; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
+%macro AVX2_FilterHorizontal_16px 6
+    AVX2_FilterHorizontalbw_16px %1, %2, %3, %4, %5, %6
+    vpaddw          %1, %1, [h264_w0x10_256]
+    vpsraw          %1, %1, 5
+%endmacro
+
+; Horizontal filter on two source registers at once. %1 and %2 are each
+; shuffled with %3 and %4; the low qwords of the two results are combined per
+; lane (vpunpcklqdq), then multiplied and summed as in
+; AVX2_FilterHorizontalbw_16px. Word sums are left in %1 without rounding or
+; shift. %2 is clobbered.
+; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
+%macro AVX2_FilterHorizontalbw_4x4px 7
+    ; The %4 shuffles read %1/%2 before the %3 shuffles overwrite them.
+    vpshufb         %6, %1, %4
+    vpshufb         %7, %2, %4
+    vpshufb         %1, %1, %3
+    vpshufb         %2, %2, %3
+    ; Low qword of each lane from px0, high qword from px1.
+    vpunpcklqdq     %1, %1, %2
+    vpunpcklqdq     %6, %6, %7
+    ; Swap the two dwords of every dword pair.
+    vpshufd         %7, %1, 10110001b
+    vpmaddubsw      %1, %1, [db20_256]
+    vpmaddubsw      %6, %6, %5
+    vpmaddubsw      %7, %7, %5
+    vpaddw          %1, %1, %6
+    vpaddw          %1, %1, %7
+%endmacro
+
+; AVX2_FilterHorizontalbw_4x4px followed by adding [h264_w0x10_256] and an
+; arithmetic right shift by 5; the result words are left in %1.
+; %5 is forwarded unchanged as the maddubsw_p1m5_p1m5_m5p1_m5p1 operand of the
+; bw macro (the db20 multiplier is read from [db20_256] inside that macro).
+; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
+%macro AVX2_FilterHorizontal_4x4px 7
+    AVX2_FilterHorizontalbw_4x4px %1, %2, %3, %4, %5, %6, %7
+    vpaddw          %1, %1, [h264_w0x10_256]
+    vpsraw          %1, %1, 5
+%endmacro
+
+; Multiplies the source bytes in %1 by [maddubsw_m2p10_m40m40_p10m2_p0p0_256]
+; (byte pairs -> words), multiplies the resulting word pairs by %2
+; (-> dwords), then adds each dword to its neighbour within a dword pair, so
+; both dwords of a pair end up holding the same sum. Result is left in %1.
+; pixels=%1 -32768>>scale=%2 tmp=%3
+%macro AVX2_FilterHorizontalbw_4px 3
+    vpmaddubsw      %1, %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_256]
+    vpmaddwd        %1, %1, %2
+    ; Swap the two dwords of every dword pair, then add to the original.
+    vpshufd         %3, %1, 10110001b
+    vpaddd          %1, %1, %3
+%endmacro
+
+; AVX2_FilterHorizontalbw_4px with [dwm1024_256] as the word multiplier,
+; followed by adding [dd32768_256] to every dword. Result is left in %1.
+; pixels=%1 tmp=%2
+%macro AVX2_FilterHorizontal_4px 2
+    AVX2_FilterHorizontalbw_4px %1, [dwm1024_256], %2
+    vpaddd          %1, %1, [dd32768_256]
+%endmacro
+
+; Vertical filter on three registers of byte-interleaved row pairs (ab, cd,
+; ef). Each pair register is multiplied by its own coefficient operand with
+; vpmaddubsw, the three word products are summed, [h264_w0x10_256] is added
+; and the sum is shifted right arithmetically by 5. Result words are left in
+; %1; %2 and %3 are not modified.
+; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7
+%macro AVX2_FilterVertical_16px 7
+    vpmaddubsw      %1, %1, %4
+    vpmaddubsw      %7, %2, %5
+    vpaddw          %1, %1, %7
+    vpmaddubsw      %7, %3, %6
+    vpaddw          %1, %1, %7
+    vpaddw          %1, %1, [h264_w0x10_256]
+    vpsraw          %1, %1, 5
+%endmacro
+
+; Variant of the vertical filter for the case where the outer rows (a, f) are
+; available as plain bytes rather than interleaved pairs: a and f are
+; zero-extended to words and added directly, while the interleaved bc and de
+; pairs are multiplied by %5 and %6 with vpmaddubsw. [h264_w0x10_256] is then
+; added and the sum is shifted right arithmetically by 5. Result words are
+; left in %1; %2, %3 and %4 are not modified.
+; px_a=%1 px_f=%2 px_bc=%3 px_de=%4 maddubsw_bc=%5 maddubsw_de=%6 tmp=%7,%8
+%macro AVX2_FilterVertical2_16px 8
+    ; %7 = 0, used to widen the low 8 bytes of each lane to words.
+    vpxor           %7, %7, %7
+    vpunpcklbw      %1, %1, %7
+    vpunpcklbw      %8, %2, %7
+    vpaddw          %1, %1, %8
+    vpmaddubsw      %7, %3, %5
+    vpaddw          %1, %1, %7
+    vpmaddubsw      %7, %4, %6
+    vpaddw          %1, %1, %7
+    vpaddw          %1, %1, [h264_w0x10_256]
+    vpsraw          %1, %1, 5
+%endmacro
+
+; Vertical filter on six registers of 16-bit words (px0..px5). The weighting
+; is built from adds, subtracts and intermediate arithmetic shifts rather
+; than multiplies:
+;   t  = ((px0 + px5) - (px1 + px4)) >> 2
+;   t  = (t - (px1 + px4) + (px2 + px3)) >> 2
+;   %1 = (t + (px2 + px3) + [dw32_256]) >> 6
+; Result words are left in %1; %2..%6 are not modified.
+; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
+%macro AVX2_FilterVerticalw_16px 7
+    vpaddw          %1, %1, %6
+    vpaddw          %7, %2, %5
+    vpsubw          %1, %1, %7
+    vpsraw          %1, %1, 2
+    vpsubw          %1, %1, %7
+    vpaddw          %7, %3, %4
+    vpaddw          %1, %1, %7
+    vpsraw          %1, %1, 2
+    vpaddw          %7, %7, [dw32_256]
+    vpaddw          %1, %1, %7
+    vpsraw          %1, %1, 6
+%endmacro
+
+;***********************************************************************
+; void McHorVer02_avx2(const uint8_t *pSrc,
+;                      int32_t iSrcStride,
+;                      uint8_t *pDst,
+;                      int32_t iDstStride,
+;                      int32_t iWidth,
+;                      int32_t iHeight)
+;
+; Vertical filter over 8-bit pixels with 8-bit output; every output row is
+; computed from six consecutive source rows, starting two rows above pSrc.
+;   - iWidth == 8 selects .width8, iWidth > 8 selects .width16, and
+;     anything smaller falls through to the 4-column path.
+;   - 4-column path: writes 4 rows, then (by comparing iHeight with 5 and
+;     9) optionally a 5th, or rows 5..8, or rows 5..9.
+;   - 8-column path: writes 4 rows per half iteration / 8 per full
+;     iteration, with a single trailing row when iHeight - 1 is exactly
+;     4 or a multiple of 8.
+;   - 16-column path: writes one leading row when iHeight is odd, then 8
+;     rows per loop iteration; stores use vmovdqa, so pDst rows must be
+;     16-byte aligned on this path.
+;***********************************************************************
+
+WELS_EXTERN McHorVer02_avx2
+%define p_src         r0
+%define i_srcstride   r1
+%define p_dst         r2
+%define i_dststride   r3
+%define i_width       r4
+%define i_height      r5
+%define i_srcstride3  r6
+    %assign push_num 0
+%ifdef X86_32
+    push            r6
+    %assign push_num 1
+%endif
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
+    ; Start two rows above the caller's pSrc.
+    sub             p_src, i_srcstride
+    sub             p_src, i_srcstride
+    lea             i_srcstride3, [3 * i_srcstride]
+    cmp             i_width, 8
+    je              .width8
+    jg              .width16
+; .width4:
+    ; Build three registers of byte-interleaved row pairs, one qword per
+    ; output row: ymm0 = rows (0,1)(1,2)(2,3)(3,4), ymm1 = (2,3)..(5,6),
+    ; ymm2 = (4,5)..(7,8). Each 4-byte row is broadcast, interleaved with
+    ; its successor, and the qwords are merged with vpblendd.
+    vmovd           xmm0, [p_src]
+    vpbroadcastd    xmm5, [p_src + i_srcstride]
+    vpunpcklbw      xmm0, xmm0, xmm5
+    vpbroadcastd    ymm1, [p_src + 2 * i_srcstride]
+    vpunpcklbw      xmm5, xmm5, xmm1
+    vpblendd        xmm0, xmm0, xmm5, 1100b
+    vpbroadcastd    ymm5, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    vpunpcklbw      ymm1, ymm1, ymm5
+    vpbroadcastd    ymm2, [p_src]
+    vpunpcklbw      ymm5, ymm5, ymm2
+    vpblendd        ymm1, ymm1, ymm5, 11001100b
+    vpblendd        ymm0, ymm0, ymm1, 11110000b
+    vpbroadcastd    ymm5, [p_src + i_srcstride]
+    lea             p_src, [p_src + 2 * i_srcstride]
+    vpunpcklbw      ymm2, ymm2, ymm5
+    vpbroadcastd    ymm3, [p_src]
+    vpunpcklbw      ymm5, ymm5, ymm3
+    vpblendd        ymm2, ymm2, ymm5, 11001100b
+    vpblendd        ymm1, ymm1, ymm2, 11110000b
+    vpbroadcastd    ymm5, [p_src + i_srcstride]
+    vpunpcklbw      ymm3, ymm3, ymm5
+    vpbroadcastd    ymm4, [p_src + 2 * i_srcstride]
+    vpunpcklbw      ymm5, ymm5, ymm4
+    vpblendd        ymm3, ymm3, ymm5, 11001100b
+    vpblendd        ymm2, ymm2, ymm3, 11110000b
+    vbroadcasti128  ymm6, [db20_128]
+    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5
+    ; After packing, each qword of ymm0 carries one 4-pixel output row.
+    vpackuswb       ymm0, ymm0, ymm0
+    vmovd           [p_dst], xmm0
+    vpsrlq          xmm5, xmm0, 32
+    vmovd           [p_dst + i_dststride], xmm5
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    vextracti128    xmm0, ymm0, 1
+    vmovd           [p_dst], xmm0
+    vpsrlq          xmm5, xmm0, 32
+    vmovd           [p_dst + i_dststride], xmm5
+    cmp             i_height, 5
+    jl              .width4_done
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    vpbroadcastd    ymm5, [p_src + i_srcstride3]
+    vpunpcklbw      ymm4, ymm4, ymm5
+    ; Still testing the flags of "cmp i_height, 5" above.
+    jg              .width4_height_ge8
+    ; iHeight == 5: one extra row from the low qwords.
+    AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5
+    vpackuswb       xmm2, xmm2, xmm2
+    vmovd           [p_dst], xmm2
+    jmp             .width4_done
+.width4_height_ge8:
+    ; Extend the pair registers by four more rows and emit rows 4..7.
+    lea             p_src, [p_src + 4 * i_srcstride]
+    vpbroadcastd    ymm1, [p_src]
+    vpunpcklbw      ymm5, ymm5, ymm1
+    vpblendd        ymm4, ymm4, ymm5, 11001100b
+    vpblendd        ymm3, ymm3, ymm4, 11110000b
+    vpbroadcastd    ymm5, [p_src + i_srcstride]
+    ; Written in three-operand form like every other VEX instruction here.
+    vpunpcklbw      ymm1, ymm1, ymm5
+    vpbroadcastd    ymm0, [p_src + 2 * i_srcstride]
+    vpunpcklbw      ymm5, ymm5, ymm0
+    vpblendd        ymm1, ymm1, ymm5, 11001100b
+    vpblendd        ymm4, ymm4, ymm1, 11110000b
+    AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5
+    vpackuswb       ymm2, ymm2, ymm2
+    vmovd           [p_dst], xmm2
+    vpsrlq          xmm5, xmm2, 32
+    vmovd           [p_dst + i_dststride], xmm5
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    vextracti128    xmm2, ymm2, 1
+    vmovd           [p_dst], xmm2
+    vpsrlq          xmm5, xmm2, 32
+    vmovd           [p_dst + i_dststride], xmm5
+    cmp             i_height, 9
+    jl              .width4_done
+    ; iHeight >= 9: one final row.
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    vmovd           xmm5, [p_src + i_srcstride3]
+    vpunpcklbw      xmm0, xmm0, xmm5
+    AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5
+    vpackuswb       xmm4, xmm4, xmm4
+    vmovd           [p_dst], xmm4
+.width4_done:
+    vzeroupper
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop             r6
+%endif
+    ret
+
+.width8:
+    ; Height comparisons on this path are against iHeight - 1.
+    sub             i_height, 1
+    ; Each ymm holds two interleaved row pairs: low lane = (n, n+1),
+    ; high lane = (n+1, n+2), i.e. the inputs for two output rows.
+    vmovq           xmm0, [p_src]
+    vmovq           xmm4, [p_src + i_srcstride]
+    vpunpcklbw      xmm0, xmm0, xmm4
+    vmovq           xmm1, [p_src + 2 * i_srcstride]
+    vpunpcklbw      xmm4, xmm4, xmm1
+    vinserti128     ymm0, ymm0, xmm4, 1
+    vmovq           xmm4, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    vpunpcklbw      xmm1, xmm1, xmm4
+    vmovq           xmm6, [p_src]
+    vpunpcklbw      xmm4, xmm4, xmm6
+    vinserti128     ymm1, ymm1, xmm4, 1
+.width8_yloop:
+    ; xmm6 carries the most recently loaded row into the next pair.
+    vmovq           xmm4, [p_src + i_srcstride]
+    vpunpcklbw      xmm2, xmm6, xmm4
+    vmovq           xmm3, [p_src + 2 * i_srcstride]
+    vpunpcklbw      xmm4, xmm4, xmm3
+    vinserti128     ymm2, ymm2, xmm4, 1
+    vbroadcasti128  ymm5, [db20_128]
+    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
+    vmovq           xmm4, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    vpunpcklbw      xmm3, xmm3, xmm4
+    vmovq           xmm6, [p_src]
+    vpunpcklbw      xmm4, xmm4, xmm6
+    vinserti128     ymm3, ymm3, xmm4, 1
+    AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
+    ; Packed layout per lane is [first macro | second macro], so the four
+    ; output rows are: low/low, high/low, low/high, high/high.
+    vpackuswb       ymm0, ymm0, ymm1
+    vmovlps         [p_dst], xmm0
+    vextracti128    xmm1, ymm0, 1
+    vmovlps         [p_dst + i_dststride], xmm1
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    vmovhps         [p_dst], xmm0
+    vmovhps         [p_dst + i_dststride], xmm1
+    cmp             i_height, 4
+    jl              .width8_done
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    vmovq           xmm4, [p_src + i_srcstride]
+    vpunpcklbw      xmm0, xmm6, xmm4
+    ; Still testing the flags of "cmp i_height, 4" above.
+    jg              .width8_height_ge8
+    ; iHeight - 1 == 4: one extra row.
+    AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4
+    vpackuswb       xmm2, xmm2, xmm2
+    vmovlps         [p_dst], xmm2
+    jmp             .width8_done
+.width8_height_ge8:
+    vmovq           xmm1, [p_src + 2 * i_srcstride]
+    vpunpcklbw      xmm4, xmm4, xmm1
+    vinserti128     ymm0, ymm0, xmm4, 1
+    AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
+    vmovq           xmm4, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    vpunpcklbw      xmm1, xmm1, xmm4
+    vmovq           xmm6, [p_src]
+    vpunpcklbw      xmm4, xmm4, xmm6
+    vinserti128     ymm1, ymm1, xmm4, 1
+    AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
+    vpackuswb       ymm2, ymm2, ymm3
+    vmovlps         [p_dst], xmm2
+    vextracti128    xmm3, ymm2, 1
+    vmovlps         [p_dst + i_dststride], xmm3
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    vmovhps         [p_dst], xmm2
+    vmovhps         [p_dst + i_dststride], xmm3
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    ; Eight rows written in this iteration.
+    sub             i_height, 8
+    jg              .width8_yloop
+    jl              .width8_done
+    ; Exactly one row left.
+    vmovq           xmm4, [p_src + i_srcstride]
+    vpunpcklbw      xmm2, xmm6, xmm4
+    AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4
+    vpackuswb       xmm0, xmm0, xmm0
+    vmovlps         [p_dst], xmm0
+.width8_done:
+    vzeroupper
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop             r6
+%endif
+    ret
+
+.width16:
+    ; Rows are loaded as [bytes 0..7 | bytes 8..15 in the upper lane] (vmovq +
+    ; vpbroadcastq + vpblendd) so that vpunpcklbw interleaves all 16 columns
+    ; of two rows, 8 columns per lane.
+    sub             i_height, 1
+    test            i_height, 1
+    ; iHeight - 1 odd  <=> iHeight even: go straight to the 8-row loop.
+    jnz             .width16_yloop_begin_even
+    ; Odd iHeight: compute one leading row from rows 0..5, keeping the raw
+    ; rows 1, 3, 5 (ymm1, ymm3, ymm5) and pairs (2,3), (4,5) for the loop.
+    vmovq           xmm0, [p_src]
+    vpbroadcastq    ymm1, [p_src + 8]
+    vpblendd        ymm0, ymm0, ymm1, 11110000b
+    vmovq           xmm1, [p_src + i_srcstride]
+    vpbroadcastq    ymm2, [p_src + i_srcstride + 8]
+    vpblendd        ymm1, ymm1, ymm2, 11110000b
+    vpunpcklbw      ymm0, ymm0, ymm1
+    vmovq           xmm2, [p_src + 2 * i_srcstride]
+    vpbroadcastq    ymm3, [p_src + 2 * i_srcstride + 8]
+    vpblendd        ymm2, ymm2, ymm3, 11110000b
+    vmovq           xmm3, [p_src + i_srcstride3]
+    vpbroadcastq    ymm4, [p_src + i_srcstride3 + 8]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    vpblendd        ymm3, ymm3, ymm4, 11110000b
+    vpunpcklbw      ymm2, ymm2, ymm3
+    vmovq           xmm4, [p_src]
+    vpbroadcastq    ymm5, [p_src + 8]
+    vpblendd        ymm4, ymm4, ymm5, 11110000b
+    vmovq           xmm5, [p_src + i_srcstride]
+    vpbroadcastq    ymm6, [p_src + i_srcstride + 8]
+    lea             p_src, [p_src + 2 * i_srcstride]
+    vpblendd        ymm5, ymm5, ymm6, 11110000b
+    vpunpcklbw      ymm4, ymm4, ymm5
+    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm7
+    vpackuswb       ymm0, ymm0, ymm0
+    ; Gather qwords 0 and 2 (columns 0..7 and 8..15) into the low 128 bits.
+    vpermq          ymm0, ymm0, 1000b
+    vmovdqa         [p_dst], xmm0
+    add             p_dst, i_dststride
+    jmp             .width16_yloop
+.width16_yloop_begin_even:
+    ; Even iHeight: preload raw rows 0, 2, 4 (ymm1, ymm3, ymm5) and pairs
+    ; (1,2), (3,4) in ymm2, ymm4.
+    vmovq           xmm1, [p_src]
+    vpbroadcastq    ymm2, [p_src + 8]
+    vpblendd        ymm1, ymm1, ymm2, 11110000b
+    vmovq           xmm2, [p_src + i_srcstride]
+    vpbroadcastq    ymm3, [p_src + i_srcstride + 8]
+    vpblendd        ymm2, ymm2, ymm3, 11110000b
+    vmovq           xmm3, [p_src + 2 * i_srcstride]
+    vpbroadcastq    ymm4, [p_src + 2 * i_srcstride + 8]
+    add             p_src, i_srcstride3
+    vpblendd        ymm3, ymm3, ymm4, 11110000b
+    vpunpcklbw      ymm2, ymm2, ymm3
+    vmovq           xmm4, [p_src]
+    vpbroadcastq    ymm5, [p_src + 8]
+    vpblendd        ymm4, ymm4, ymm5, 11110000b
+    vmovq           xmm5, [p_src + i_srcstride]
+    vpbroadcastq    ymm6, [p_src + i_srcstride + 8]
+    lea             p_src, [p_src + 2 * i_srcstride]
+    vpblendd        ymm5, ymm5, ymm6, 11110000b
+    vpunpcklbw      ymm4, ymm4, ymm5
+.width16_yloop:
+    ; Four unrolled steps of two output rows each. In every step the first
+    ; row uses AVX2_FilterVertical2_16px (raw a/f rows + two pairs) and the
+    ; second uses AVX2_FilterVertical_16px (three pairs); the registers
+    ; rotate so each newly built pair is reused by the following steps.
+    vmovq           xmm6, [p_src]
+    vpbroadcastq    ymm7, [p_src + 8]
+    vpblendd        ymm6, ymm6, ymm7, 11110000b
+    AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm0, ymm7
+    vmovq           xmm7, [p_src + i_srcstride]
+    vpbroadcastq    ymm0, [p_src + i_srcstride + 8]
+    vpblendd        ymm7, ymm7, ymm0, 11110000b
+    vpunpcklbw      ymm6, ymm6, ymm7
+    AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm0
+    vpackuswb       ymm1, ymm1, ymm2
+    ; Reorder qwords 0,2,1,3 so each 128-bit half is one full 16-pixel row.
+    vpermq          ymm1, ymm1, 11011000b
+    vmovdqa         [p_dst], xmm1
+    vextracti128    [p_dst + i_dststride], ymm1, 1
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    vmovq           xmm0, [p_src + 2 * i_srcstride]
+    vpbroadcastq    ymm1, [p_src + 2 * i_srcstride + 8]
+    vpblendd        ymm0, ymm0, ymm1, 11110000b
+    AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm2, ymm1
+    vmovq           xmm1, [p_src + i_srcstride3]
+    vpbroadcastq    ymm2, [p_src + i_srcstride3 + 8]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    vpblendd        ymm1, ymm1, ymm2, 11110000b
+    vpunpcklbw      ymm0, ymm0, ymm1
+    AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm2
+    vpackuswb       ymm3, ymm3, ymm4
+    vpermq          ymm3, ymm3, 11011000b
+    vmovdqa         [p_dst], xmm3
+    vextracti128    [p_dst + i_dststride], ymm3, 1
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    vmovq           xmm2, [p_src]
+    vpbroadcastq    ymm3, [p_src + 8]
+    vpblendd        ymm2, ymm2, ymm3, 11110000b
+    AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm4, ymm3
+    vmovq           xmm3, [p_src + i_srcstride]
+    vpbroadcastq    ymm4, [p_src + i_srcstride + 8]
+    vpblendd        ymm3, ymm3, ymm4, 11110000b
+    vpunpcklbw      ymm2, ymm2, ymm3
+    AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm4
+    vpackuswb       ymm5, ymm5, ymm6
+    vpermq          ymm5, ymm5, 11011000b
+    vmovdqa         [p_dst], xmm5
+    vextracti128    [p_dst + i_dststride], ymm5, 1
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    vmovq           xmm4, [p_src + 2 * i_srcstride]
+    vpbroadcastq    ymm5, [p_src + 2 * i_srcstride + 8]
+    vpblendd        ymm4, ymm4, ymm5, 11110000b
+    AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm6, ymm5
+    vmovq           xmm5, [p_src + i_srcstride3]
+    vpbroadcastq    ymm6, [p_src + i_srcstride3 + 8]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    vpblendd        ymm5, ymm5, ymm6, 11110000b
+    vpunpcklbw      ymm4, ymm4, ymm5
+    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm6
+    vpackuswb       ymm7, ymm7, ymm0
+    vpermq          ymm7, ymm7, 11011000b
+    vmovdqa         [p_dst], xmm7
+    vextracti128    [p_dst + i_dststride], ymm7, 1
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    ; Eight rows written in this iteration.
+    sub             i_height, 8
+    jg              .width16_yloop
+    vzeroupper
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop             r6
+%endif
+    ret
+%undef p_src
+%undef i_srcstride
+%undef i_srcstride3
+%undef p_dst
+%undef i_dststride
+%undef i_width
+%undef i_height
+; NOTE(review): i_ycnt is not %define'd anywhere in this function; this
+; looks like a leftover. Kept in case an earlier definition is still live.
+%undef i_ycnt
+
+
+;*******************************************************************************
+; void McHorVer20_avx2(const uint8_t *pSrc,
+;                      int iSrcStride,
+;                      uint8_t *pDst,
+;                      int iDstStride,
+;                      int iWidth,
+;                      int iHeight);
+;
+; Horizontal filter over 8-bit pixels with 8-bit output, built on the
+; AVX2_FilterHorizontal_* macros. Every row is read starting 2 bytes to the
+; left of the current pSrc position.
+;   - iWidth == 8 selects .width8, iWidth > 8 selects .width16_yloop, and
+;     anything smaller falls through to the 4-column loop.
+;   - The 4- and 8-column loops emit 4 rows per iteration, the 16-column loop
+;     emits 2; each loops while the remaining row count is > 0.
+;   - The 16-column path stores with vmovdqa, so pDst rows must be 16-byte
+;     aligned on that path.
+;*******************************************************************************
+
+WELS_EXTERN McHorVer20_avx2
+%define p_src        r0
+%define i_srcstride  r1
+%define p_dst        r2
+%define i_dststride  r3
+%define i_width      r4
+%define i_height     r5
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 7
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
+    ; Loop-invariant shuffle masks and multiplier, replicated to both lanes.
+    vbroadcasti128  ymm4, [shufb_32435465768798A9]
+    vbroadcasti128  ymm5, [shufb_011267784556ABBC]
+    vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+    cmp             i_width, 8
+    je              .width8
+    jg              .width16_yloop
+; iWidth is not needed after the dispatch above, so its register is reused
+; to hold 3 * iSrcStride in the 4- and 8-column paths.
+%xdefine i_srcstride3 i_width
+%undef i_width
+    lea             i_srcstride3, [3 * i_srcstride]
+.width4_yloop:
+    ; ymm0 = rows 0 (low lane) and 2 (high lane); ymm1 = rows 1 and 3.
+    vmovdqu         xmm0, [p_src - 2]
+    vmovdqu         xmm1, [p_src + i_srcstride - 2]
+    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
+    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
+    lea             p_src, [p_src + 4 * i_srcstride]
+    AVX2_FilterHorizontal_4x4px ymm0, ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
+    vpackuswb       ymm0, ymm0, ymm0
+    ; Each lane now holds two 4-pixel rows in its low two dwords.
+    vmovd           [p_dst], xmm0
+    vpsrlq          xmm1, xmm0, 32
+    vmovd           [p_dst + i_dststride], xmm1
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    vextracti128    xmm0, ymm0, 1
+    vmovd           [p_dst], xmm0
+    vpsrlq          xmm1, xmm0, 32
+    vmovd           [p_dst + i_dststride], xmm1
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    sub             i_height, 4
+    jg              .width4_yloop
+    vzeroupper
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+.width8:
+    ; This path is entered by a jump, so 3 * iSrcStride is computed here too.
+    lea             i_srcstride3, [3 * i_srcstride]
+.width8_yloop:
+    ; ymm0 = rows 0 (low lane) and 2 (high lane); ymm1 = rows 1 and 3.
+    vmovdqu         xmm0, [p_src - 2]
+    vmovdqu         xmm1, [p_src + i_srcstride - 2]
+    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
+    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
+    lea             p_src, [p_src + 4 * i_srcstride]
+    AVX2_FilterHorizontal_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3
+    AVX2_FilterHorizontal_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
+    ; After packing: low lane = [row 0 | row 1], high lane = [row 2 | row 3].
+    vpackuswb       ymm0, ymm0, ymm1
+    vmovlps         [p_dst], xmm0
+    vmovhps         [p_dst + i_dststride], xmm0
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    vextracti128    xmm0, ymm0, 1
+    vmovlps         [p_dst], xmm0
+    vmovhps         [p_dst + i_dststride], xmm0
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    sub             i_height, 4
+    jg              .width8_yloop
+    vzeroupper
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+%undef i_srcstride3
+.width16_yloop:
+    ; ymm0 = left 8 columns of rows 0/1 (low/high lane); ymm1 = right 8
+    ; columns of rows 0/1, loaded 8 bytes further along each row.
+    vmovdqu         xmm0, [p_src - 2]
+    vmovdqu         xmm1, [p_src + 6]
+    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1
+    vinserti128     ymm1, ymm1, [p_src + i_srcstride + 6], 1
+    lea             p_src, [p_src + 2 * i_srcstride]
+    AVX2_FilterHorizontal_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3
+    AVX2_FilterHorizontal_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
+    ; After packing: low lane = full row 0, high lane = full row 1.
+    vpackuswb       ymm0, ymm0, ymm1
+    vmovdqa         [p_dst], xmm0
+    vextracti128    [p_dst + i_dststride], ymm0, 1
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    sub             i_height, 2
+    jg              .width16_yloop
+    vzeroupper
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_width
+%undef i_height
+
+
+;***********************************************************************
+; void McHorVer20Width5Or9Or17_avx2(const uint8_t *pSrc,   -- source pixels; every row is loaded starting at pSrc - 2
+;                                   int32_t iSrcStride,    -- byte distance between source rows
+;                                   uint8_t *pDst,         -- destination pixels
+;                                   int32_t iDstStride,    -- byte distance between destination rows
+;                                   int32_t iWidth,        -- selects the path: < 9 -> 5 wide, == 9 -> 9 wide, > 9 -> 17 wide
+;                                   int32_t iHeight);      -- row count; each loop runs while the remaining count is > 0
+;***********************************************************************
+; Horizontal filter pass (AVX2_FilterHorizontal_* macros are defined elsewhere) writing 5, 9 or 17 bytes per row; rows are produced 2 (5 wide) or 4 (9/17 wide) per iteration.
+WELS_EXTERN McHorVer20Width5Or9Or17_avx2
+%define p_src        r0
+%define i_srcstride  r1
+%define p_dst        r2
+%define i_dststride  r3
+%define i_width      r4
+%define i_height     r5
+    %assign  push_num 0
+    LOAD_6_PARA  ; project macro; presumably brings the six arguments into r0..r5
+    PUSH_XMM 8  ; project macro; presumably preserves XMM registers where the ABI requires it
+    SIGN_EXTENSION  r1, r1d  ; presumably widens the 32-bit int arguments to full register width
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
+    vbroadcasti128  ymm5, [shufb_32435465768798A9]  ; filter constants, replicated into both 128-bit lanes
+    vbroadcasti128  ymm6, [shufb_011267784556ABBC]
+    vbroadcasti128  ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+    cmp             i_width, 9  ; dispatch on width
+    je              .width9
+    jg              .width17  ; below 9 falls through to the 5-wide loop
+.width5_yloop:
+    vmovdqu         xmm0, [p_src - 2]  ; low lane: row 0
+    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1  ; high lane: row 1
+    lea             p_src, [p_src + 2 * i_srcstride]  ; advance two source rows
+    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2  ; filter both rows in place; ymm1/ymm2 presumably scratch
+    vpackuswb       ymm0, ymm0, ymm0  ; saturate the 16-bit results to unsigned bytes
+    vpsrlq          xmm1, xmm0, 8  ; row 0 bytes shifted down by one
+    vmovd           [p_dst + 1], xmm1  ; row 0: bytes 1..4
+    vmovd           [p_dst], xmm0  ; row 0: bytes 0..3, giving 5 bytes in total
+    add             p_dst, i_dststride
+    vextracti128    xmm0, ymm0, 1  ; high lane: row 1
+    vpsrlq          xmm1, xmm0, 8
+    vmovd           [p_dst + 1], xmm1  ; row 1: bytes 1..4
+    vmovd           [p_dst], xmm0  ; row 1: bytes 0..3
+    add             p_dst, i_dststride
+    sub             i_height, 2  ; two rows finished
+    jg              .width5_yloop
+    vzeroupper  ; zero the upper YMM halves before returning
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+.width9:
+%xdefine i_srcstride3 i_width
+%undef i_width
+    lea             i_srcstride3, [3 * i_srcstride]  ; the width register is free after dispatch and is reused for 3 * stride
+.width9_yloop:
+    vmovdqu         xmm0, [p_src - 2]  ; ymm0 low lane: row 0
+    vmovdqu         xmm4, [p_src + i_srcstride - 2]  ; ymm4 low lane: row 1
+    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1  ; ymm0 high lane: row 2
+    vinserti128     ymm4, ymm4, [p_src + i_srcstride3 - 2], 1  ; ymm4 high lane: row 3
+    lea             p_src, [p_src + 4 * i_srcstride]  ; advance four source rows
+    vpunpckhqdq     ymm3, ymm0, ymm4  ; upper 8 source bytes of rows 0/1 (low lane) and rows 2/3 (high lane)
+    AVX2_FilterHorizontal_4px ymm3, ymm2  ; filter the right-edge pixels of all four rows (macro defined elsewhere)
+    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2  ; filter rows 0 and 2
+    vpackuswb       ymm3, ymm3, ymm0  ; per lane: low qword = edge bytes, high qword = 8 pixels of row 0 / row 2
+    vmovd           [p_dst + 5], xmm3  ; row 0: bytes 5..8 (5..7 are overwritten next, byte 8 is kept)
+    vmovhps         [p_dst], xmm3  ; row 0: bytes 0..7
+    add             p_dst, i_dststride
+    AVX2_FilterHorizontal_16px ymm4, ymm5, ymm6, ymm7, ymm1, ymm2  ; filter rows 1 and 3
+    vpackuswb       ymm4, ymm4, ymm4
+    vpsrlq          xmm2, xmm3, 32  ; row 1 edge dword moved to the bottom
+    vmovd           [p_dst + 5], xmm2  ; row 1: bytes 5..8
+    vmovlps         [p_dst], xmm4  ; row 1: bytes 0..7
+    add             p_dst, i_dststride
+    vextracti128    xmm3, ymm3, 1  ; switch to the high lanes: rows 2 and 3
+    vextracti128    xmm4, ymm4, 1
+    vmovd           [p_dst + 5], xmm3  ; row 2: bytes 5..8
+    vmovhps         [p_dst], xmm3  ; row 2: bytes 0..7
+    add             p_dst, i_dststride
+    vpsrlq          xmm2, xmm3, 32
+    vmovd           [p_dst + 5], xmm2  ; row 3: bytes 5..8
+    vmovlps         [p_dst], xmm4  ; row 3: bytes 0..7
+    add             p_dst, i_dststride
+    sub             i_height, 4  ; four rows finished
+    jg              .width9_yloop
+    vzeroupper
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+.width17:
+    lea             i_srcstride3, [3 * i_srcstride]  ; same alias as under .width9: preprocessor definitions are textual, not per label
+.width17_yloop:
+    vmovdqu         xmm0, [p_src - 2]  ; ymm0: row 0 (low) / row 1 (high), left 16 source bytes
+    vmovdqu         xmm3, [p_src + 6]  ; ymm3: row 0 (low) / row 1 (high), source bytes 8 further right
+    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1
+    vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1
+    vmovdqa         ymm4, ymm3  ; keep the unfiltered right-hand bytes for the edge pixel
+    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
+    AVX2_FilterHorizontal_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
+    vpackuswb       ymm0, ymm0, ymm3  ; 16 pixels of row 0 (low lane) and row 1 (high lane)
+    vmovdqu         xmm1, [p_src + 2 * i_srcstride - 2]  ; ymm1 / ymm3: the same layout for rows 2 and 3
+    vmovdqu         xmm3, [p_src + 2 * i_srcstride + 6]
+    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
+    vinserti128     ymm3, ymm3, [p_src + i_srcstride3 + 6], 1
+    lea             p_src, [p_src + 4 * i_srcstride]  ; advance four source rows
+    vpunpckhqdq     ymm4, ymm4, ymm3  ; upper 8 source bytes of rows 0/2 (low lane) and rows 1/3 (high lane)
+    AVX2_FilterHorizontal_4px ymm4, ymm2  ; filter the right-edge pixels of all four rows
+    vpackuswb       ymm4, ymm4, ymm4
+    vmovd           [p_dst + 13], xmm4  ; row 0: bytes 13..16 (13..15 are overwritten next, byte 16 is kept)
+    vmovdqa         [p_dst], xmm0  ; row 0: bytes 0..15; aligned store, so p_dst must be 16-byte aligned
+    add             p_dst, i_dststride
+    vextracti128    xmm2, ymm4, 1  ; high lane: edge dwords of rows 1 and 3
+    vmovd           [p_dst + 13], xmm2  ; row 1: bytes 13..16
+    vextracti128    [p_dst], ymm0, 1  ; row 1: bytes 0..15
+    add             p_dst, i_dststride
+    vpsrlq          xmm4, xmm4, 32  ; row 2 edge dword moved to the bottom
+    vmovd           [p_dst + 13], xmm4  ; row 2: bytes 13..16, written before the 16-byte store below
+    AVX2_FilterHorizontal_16px ymm1, ymm5, ymm6, ymm7, ymm0, ymm4  ; rows 2 and 3; ymm0/ymm4 are free to be scratch now
+    AVX2_FilterHorizontal_16px ymm3, ymm5, ymm6, ymm7, ymm0, ymm4
+    vpackuswb       ymm1, ymm1, ymm3
+    vmovdqa         [p_dst], xmm1  ; row 2: bytes 0..15
+    add             p_dst, i_dststride
+    vpsrlq          xmm2, xmm2, 32
+    vmovd           [p_dst + 13], xmm2  ; row 3: bytes 13..16
+    vextracti128    [p_dst], ymm1, 1  ; row 3: bytes 0..15
+    add             p_dst, i_dststride
+    sub             i_height, 4  ; four rows finished
+    jg              .width17_yloop
+    vzeroupper
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+%undef i_srcstride3
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_width
+%undef i_height
+
+
+;*******************************************************************************
+; void McHorVer20Width4U8ToS16_avx2(const uint8_t *pSrc,   -- source pixels; reading starts two rows above pSrc and 2 bytes to the left
+;                                   int iSrcStride,        -- byte distance between source rows
+;                                   int16_t *pDst,         -- output rows, 8 bytes apart; stored with 32-byte aligned stores
+;                                   int iHeight);          -- row count; 4 rows per loop pass plus one final row
+;*******************************************************************************
+; Horizontal filter pass that keeps 16-bit results: 8 bytes per output row (i_dststride), four rows per iteration, then one trailing row.
+WELS_EXTERN McHorVer20Width4U8ToS16_avx2
+%define p_src        r0
+%define i_srcstride  r1
+%define p_dst        r2
+%define i_height     r3
+%define i_srcstride3 r4
+%define i_dststride   8
+    %assign  push_num 0
+%ifdef X86_32
+    push            r4  ; r4 is not an argument register here, so it is saved by hand on 32-bit builds
+    %assign  push_num 1
+%endif
+    LOAD_4_PARA  ; project macro; presumably brings the four arguments into r0..r3
+    PUSH_XMM 7
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    sub             p_src, i_srcstride  ; step back two source rows
+    sub             p_src, i_srcstride
+    lea             i_srcstride3, [3 * i_srcstride]
+    vbroadcasti128  ymm4, [shufb_32435465768798A9]  ; filter constants, replicated into both 128-bit lanes
+    vbroadcasti128  ymm5, [shufb_011267784556ABBC]
+    vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+    sub             i_height, 3  ; bias the counter so the loop leaves the final row to the tail code
+.yloop:
+    vmovdqu         xmm0, [p_src - 2]  ; ymm0 low lane: row 0
+    vmovdqu         xmm1, [p_src + i_srcstride - 2]  ; ymm1 low lane: row 1
+    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1  ; ymm0 high lane: row 2
+    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1  ; ymm1 high lane: row 3
+    lea             p_src, [p_src + 4 * i_srcstride]  ; advance four source rows
+    AVX2_FilterHorizontalbw_4x4px ymm0, ymm1, ymm4, ymm5, ymm6, ymm2, ymm3  ; macro defined elsewhere; result for all four rows is left in ymm0
+    vmovdqa         [p_dst], ymm0  ; 32 bytes = four 8-byte output rows; requires 32-byte aligned p_dst
+    add             p_dst, 4 * i_dststride
+    sub             i_height, 4
+    jg              .yloop
+    ; One more row is always produced after the 4-row loop (callers presumably pass iHeight with iHeight % 4 == 1).
+    vmovdqu         xmm0, [p_src - 2]
+    AVX2_FilterHorizontalbw_16px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3  ; 128-bit form of the filter on the single row
+    vmovlps         [p_dst], xmm0  ; only the low 8 bytes (one output row) are stored
+    vzeroupper  ; zero the upper YMM halves before returning
+    POP_XMM
+    LOAD_4_PARA_POP
+%ifdef X86_32
+    pop             r4
+%endif
+    ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_height
+%undef i_srcstride3
+%undef i_dststride
+
+
+;***********************************************************************
+; void McHorVer02Width4S16ToU8_avx2(const int16_t *pSrc,   -- 16-bit input rows, 8 bytes apart (i_srcstride)
+;                                   uint8_t *pDst,         -- destination pixels, 4 bytes written per row
+;                                   int32_t iDstStride,    -- byte distance between destination rows
+;                                   int32_t iHeight);      -- <= 4: four rows are written; otherwise eight
+;***********************************************************************
+; Vertical filter pass from 16-bit rows to bytes; each 32-byte load spans four consecutive 8-byte input rows, so one filter call yields four output rows.
+WELS_EXTERN McHorVer02Width4S16ToU8_avx2
+%define p_src        r0
+%define p_dst        r1
+%define i_dststride  r2
+%define i_height     r3
+%define i_dststride3 r4
+%define i_srcstride  8
+    %assign  push_num 0
+%ifdef X86_32
+    push            r4  ; r4 is not an argument register here, so it is saved by hand on 32-bit builds
+    %assign  push_num 1
+%endif
+    LOAD_4_PARA  ; project macro; presumably brings the four arguments into r0..r3
+    PUSH_XMM 8
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    lea             i_dststride3, [3 * i_dststride]
+    vmovdqu         ymm0, [p_src +  0 * i_srcstride]  ; input rows 0..3
+    vmovdqu         ymm1, [p_src +  1 * i_srcstride]  ; input rows 1..4
+    vmovdqu         ymm2, [p_src +  2 * i_srcstride]  ; input rows 2..5
+    vmovdqu         ymm3, [p_src +  3 * i_srcstride]  ; input rows 3..6
+    vmovdqu         ymm4, [p_src +  4 * i_srcstride]  ; input rows 4..7
+    vmovdqu         ymm5, [p_src +  5 * i_srcstride]  ; input rows 5..8
+    vmovdqu         ymm6, [p_src +  6 * i_srcstride]  ; input rows 6..9; only used by the second group but loaded unconditionally
+    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7  ; macro defined elsewhere; result in ymm0, ymm7 presumably scratch
+    vpackuswb       ymm0, ymm0, ymm0  ; saturate to unsigned bytes
+    vmovd           [p_dst], xmm0  ; output row 0
+    vpsrlq          xmm7, xmm0, 32
+    vmovd           [p_dst + i_dststride], xmm7  ; output row 1
+    vextracti128    xmm0, ymm0, 1  ; high lane: output rows 2 and 3
+    vmovd           [p_dst + 2 * i_dststride], xmm0  ; output row 2
+    vpsrlq          xmm7, xmm0, 32
+    vmovd           [p_dst + i_dststride3], xmm7  ; output row 3
+    cmp             i_height, 4
+    jle             .done  ; only four rows requested
+    lea             p_dst, [p_dst + 4 * i_dststride]
+    vmovdqu         ymm7, [p_src +  7 * i_srcstride]  ; input rows 7..10
+    vmovdqu         ymm0, [p_src +  8 * i_srcstride]  ; input rows 8..11
+    vmovdqu         ymm1, [p_src +  9 * i_srcstride]  ; input rows 9..12
+    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3  ; output rows 4..7 in ymm4
+    vpackuswb       ymm4, ymm4, ymm4
+    vmovd           [p_dst], xmm4  ; output row 4
+    vpsrlq          xmm3, xmm4, 32
+    vmovd           [p_dst + i_dststride], xmm3  ; output row 5
+    vextracti128    xmm4, ymm4, 1
+    vmovd           [p_dst + 2 * i_dststride], xmm4  ; output row 6
+    vpsrlq          xmm3, xmm4, 32
+    vmovd           [p_dst + i_dststride3], xmm3  ; output row 7
+.done:
+    vzeroupper  ; zero the upper YMM halves before returning
+    POP_XMM
+    LOAD_4_PARA_POP
+%ifdef X86_32
+    pop             r4
+%endif
+    ret
+%undef p_src
+%undef p_dst
+%undef i_dststride
+%undef i_height
+%undef i_srcstride
+%undef i_dststride3
+
+
+;*******************************************************************************
+; void McHorVer20Width8U8ToS16_avx2(const uint8_t *pSrc,   -- source pixels; reading starts two rows above pSrc and 2 bytes to the left
+;                                   int iSrcStride,        -- byte distance between source rows
+;                                   int16_t *pDst,         -- output rows, 16 bytes apart (i_dststride)
+;                                   int iHeight);          -- number of rows to produce
+;*******************************************************************************
+; Horizontal filter pass that keeps 16-bit results: 16 bytes per output row, two rows per iteration, plus one trailing row when iHeight is odd.
+WELS_EXTERN McHorVer20Width8U8ToS16_avx2
+%define p_src        r0
+%define i_srcstride  r1
+%define p_dst        r2
+%define i_height     r3
+%define i_dststride  16
+    %assign  push_num 0
+    LOAD_4_PARA  ; project macro; presumably brings the four arguments into r0..r3
+    PUSH_XMM 6
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    sub             p_src, i_srcstride  ; step back two source rows
+    sub             p_src, i_srcstride
+    vbroadcasti128  ymm3, [shufb_32435465768798A9]  ; filter constants, replicated into both 128-bit lanes
+    vbroadcasti128  ymm4, [shufb_011267784556ABBC]
+    vbroadcasti128  ymm5, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+    sub             i_height, 1  ; bias so the flags after the loop tell odd from even heights
+.yloop:
+    vmovdqu         xmm0, [p_src - 2]  ; low lane: row 0
+    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1  ; high lane: row 1
+    lea             p_src, [p_src + 2 * i_srcstride]  ; advance two source rows
+    AVX2_FilterHorizontalbw_16px ymm0, ymm3, ymm4, ymm5, ymm1, ymm2  ; macro defined elsewhere; result in ymm0, ymm1/ymm2 presumably scratch
+    vmovdqu         [p_dst], ymm0  ; 32 bytes = two 16-byte output rows
+    add             p_dst, 2 * i_dststride
+    sub             i_height, 2
+    jg              .yloop
+    jl              .done  ; negative: the height was even, nothing is left
+    vmovdqu         xmm0, [p_src - 2]  ; zero: exactly one row remains
+    AVX2_FilterHorizontalbw_16px xmm0, xmm3, xmm4, xmm5, xmm1, xmm2
+    vmovdqa         [p_dst], xmm0  ; aligned 16-byte store of the last row
+.done:
+    vzeroupper  ; zero the upper YMM halves before returning
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_height
+%undef i_dststride
+
+
+;***********************************************************************
+; void McHorVer02Width5S16ToU8_avx2(const int16_t *pSrc,   -- 16-bit input rows, 16 bytes apart (i_srcstride)
+;                                   uint8_t *pDst,         -- destination pixels, 5 bytes written per row
+;                                   int32_t iDstStride,    -- byte distance between destination rows
+;                                   int32_t iHeight);      -- <= 5: five rows are written; otherwise nine
+;***********************************************************************
+; Vertical filter pass from 16-bit rows to bytes, fully unrolled; each 32-byte load spans two consecutive 16-byte input rows, so one filter call yields two output rows.
+WELS_EXTERN McHorVer02Width5S16ToU8_avx2
+%define p_src        r0
+%define p_dst        r1
+%define i_dststride  r2
+%define i_height     r3
+%define i_srcstride  16
+    %assign  push_num 0
+    LOAD_4_PARA  ; project macro; presumably brings the four arguments into r0..r3
+    PUSH_XMM 8
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    vmovdqu         ymm0, [p_src +  0 * i_srcstride]  ; input rows 0,1
+    vmovdqu         ymm2, [p_src +  2 * i_srcstride]  ; input rows 2,3
+    vmovdqu         ymm4, [p_src +  4 * i_srcstride]  ; input rows 4,5
+    vmovdqu         ymm6, [p_src +  6 * i_srcstride]  ; input rows 6,7
+    vperm2i128      ymm1, ymm0, ymm2, 00100001b  ; [ymm0 high lane, ymm2 low lane] = rows 1,2
+    vperm2i128      ymm3, ymm2, ymm4, 00100001b  ; rows 3,4
+    vperm2i128      ymm5, ymm4, ymm6, 00100001b  ; rows 5,6
+    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7  ; macro defined elsewhere; output rows 0,1 in ymm0, ymm7 presumably scratch
+    vpackuswb       ymm0, ymm0, ymm0  ; saturate to unsigned bytes
+    vpsrlq          xmm7, xmm0, 8
+    vmovd           [p_dst + 1], xmm7  ; output row 0: bytes 1..4
+    vmovd           [p_dst], xmm0  ; output row 0: bytes 0..3, giving 5 bytes in total
+    add             p_dst, i_dststride
+    vextracti128    xmm0, ymm0, 1  ; high lane: output row 1
+    vpsrlq          xmm7, xmm0, 8
+    vmovd           [p_dst + 1], xmm7
+    vmovd           [p_dst], xmm0
+    add             p_dst, i_dststride
+    vmovdqu         ymm7, [p_src +  7 * i_srcstride]  ; input rows 7,8
+    vmovdqu         ymm0, [p_src +  8 * i_srcstride]  ; input rows 8,9
+    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm1  ; output rows 2,3 in ymm2
+    vpackuswb       ymm2, ymm2, ymm2
+    vpsrlq          xmm1, xmm2, 8
+    vmovd           [p_dst + 1], xmm1  ; output row 2
+    vmovd           [p_dst], xmm2
+    add             p_dst, i_dststride
+    vextracti128    xmm2, ymm2, 1
+    vpsrlq          xmm1, xmm2, 8
+    vmovd           [p_dst + 1], xmm1  ; output row 3
+    vmovd           [p_dst], xmm2
+    add             p_dst, i_dststride
+    vmovdqu         ymm1, [p_src +  9 * i_srcstride]  ; input rows 9,10
+    vmovdqu         ymm2, [p_src + 10 * i_srcstride]  ; input rows 10,11; read before the height check, so also read when only 5 rows are output
+    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3  ; output rows 4,5 in ymm4
+    vpackuswb       ymm4, ymm4, ymm4
+    vpsrlq          xmm3, xmm4, 8
+    vmovd           [p_dst + 1], xmm3  ; output row 4
+    vmovd           [p_dst], xmm4
+    cmp             i_height, 5
+    jle             .done  ; only five rows requested; row 5 in the high lane is discarded
+    add             p_dst, i_dststride
+    vextracti128    xmm4, ymm4, 1
+    vpsrlq          xmm3, xmm4, 8
+    vmovd           [p_dst + 1], xmm3  ; output row 5
+    vmovd           [p_dst], xmm4
+    add             p_dst, i_dststride
+    vmovdqu         ymm3, [p_src + 11 * i_srcstride]  ; input rows 11,12
+    vmovdqu         xmm4, [p_src + 12 * i_srcstride]  ; input row 12 only (128-bit load)
+    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm5  ; output rows 6,7 in ymm6
+    vpackuswb       ymm6, ymm6, ymm6
+    vpsrlq          xmm5, xmm6, 8
+    vmovd           [p_dst + 1], xmm5  ; output row 6
+    vmovd           [p_dst], xmm6
+    add             p_dst, i_dststride
+    vextracti128    xmm6, ymm6, 1
+    vpsrlq          xmm5, xmm6, 8
+    vmovd           [p_dst + 1], xmm5  ; output row 7
+    vmovd           [p_dst], xmm6
+    add             p_dst, i_dststride
+    vmovdqu         xmm5, [p_src + 13 * i_srcstride]  ; input row 13
+    AVX2_FilterVerticalw_16px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7  ; 128-bit form on input rows 8..13 (low lanes): output row 8
+    vpackuswb       xmm0, xmm0, xmm0
+    vpsrlq          xmm7, xmm0, 8
+    vmovd           [p_dst + 1], xmm7  ; output row 8
+    vmovd           [p_dst], xmm0
+.done:
+    vzeroupper  ; zero the upper YMM halves before returning
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
+%undef p_src
+%undef p_dst
+%undef i_dststride
+%undef i_height
+%undef i_srcstride
+
+
+;***********************************************************************
+; void McHorVer02Width8S16ToU8_avx2(const int16_t *pSrc,   -- 16-bit input rows, 16 bytes apart (i_srcstride); even row offsets use 32-byte aligned loads
+;                                   uint8_t *pDst,         -- destination pixels, 8 bytes written per row
+;                                   int32_t iDstStride,    -- byte distance between destination rows
+;                                   int32_t iHeight);      -- rows are written in groups of 4; the loop body covers 8
+;***********************************************************************
+; Vertical filter pass from 16-bit rows to bytes; each 32-byte load spans two consecutive input rows and every pair of filter calls yields four output rows.
+WELS_EXTERN McHorVer02Width8S16ToU8_avx2
+%define p_src        r0
+%define p_dst        r1
+%define i_dststride  r2
+%define i_height     r3
+%define i_dststride3 r4
+%define i_srcstride  16
+    %assign  push_num 0
+%ifdef X86_32
+    push            r4  ; r4 is not an argument register here, so it is saved by hand on 32-bit builds
+    %assign  push_num 1
+%endif
+    LOAD_4_PARA  ; project macro; presumably brings the four arguments into r0..r3
+    PUSH_XMM 8
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    lea             i_dststride3, [3 * i_dststride]
+    vmovdqa         ymm0, [p_src +  0 * i_srcstride]  ; input rows 0,1
+    vmovdqa         ymm2, [p_src +  2 * i_srcstride]  ; input rows 2,3
+    vmovdqa         ymm4, [p_src +  4 * i_srcstride]  ; input rows 4,5
+    vperm2i128      ymm1, ymm0, ymm2, 00100001b  ; [ymm0 high lane, ymm2 low lane] = rows 1,2
+    vperm2i128      ymm3, ymm2, ymm4, 00100001b  ; rows 3,4
+.yloop:
+    vmovdqa         ymm6, [p_src +  6 * i_srcstride]  ; input rows 6,7
+    vperm2i128      ymm5, ymm4, ymm6, 00100001b  ; rows 5,6
+    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7  ; macro defined elsewhere; output rows 0,1 in ymm0
+    vmovdqu         ymm7, [p_src +  7 * i_srcstride]  ; input rows 7,8 (odd row offset, hence the unaligned load)
+    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm1  ; output rows 2,3 in ymm2
+    vpackuswb       ymm1, ymm0, ymm2  ; low lane: rows 0 and 2, high lane: rows 1 and 3, 8 bytes each
+    vmovdqa         ymm0, [p_src +  8 * i_srcstride]  ; input rows 8,9, loaded ahead of the height check
+    vextracti128    xmm2, ymm1, 1
+    vmovlps         [p_dst], xmm1  ; output row 0
+    vmovlps         [p_dst + i_dststride], xmm2  ; output row 1
+    vmovhps         [p_dst + 2 * i_dststride], xmm1  ; output row 2
+    vmovhps         [p_dst + i_dststride3], xmm2  ; output row 3
+    cmp             i_height, 4
+    jle             .done  ; at most four rows remained
+    lea             p_dst, [p_dst + 4 * i_dststride]
+    vmovdqu         ymm1, [p_src +  9 * i_srcstride]  ; input rows 9,10
+    vmovdqa         ymm2, [p_src + 10 * i_srcstride]  ; input rows 10,11
+    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3  ; output rows 4,5 in ymm4
+    vmovdqu         ymm3, [p_src + 11 * i_srcstride]  ; input rows 11,12
+    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm5  ; output rows 6,7 in ymm6
+    vpackuswb       ymm5, ymm4, ymm6  ; low lane: rows 4 and 6, high lane: rows 5 and 7
+    vmovdqa         ymm4, [p_src + 12 * i_srcstride]  ; input rows 12,13; ymm0..ymm4 now match the loop-entry layout shifted by 8 rows
+    add             p_src, 8 * i_srcstride  ; advance eight input rows
+    vextracti128    xmm6, ymm5, 1
+    vmovlps         [p_dst], xmm5  ; output row 4
+    vmovlps         [p_dst + i_dststride], xmm6  ; output row 5
+    vmovhps         [p_dst + 2 * i_dststride], xmm5  ; output row 6
+    vmovhps         [p_dst + i_dststride3], xmm6  ; output row 7
+    lea             p_dst, [p_dst + 4 * i_dststride]
+    sub             i_height, 8  ; eight rows finished
+    jg              .yloop
+.done:
+    vzeroupper  ; zero the upper YMM halves before returning
+    POP_XMM
+    LOAD_4_PARA_POP
+%ifdef X86_32
+    pop             r4
+%endif
+    ret
+%undef p_src
+%undef p_dst
+%undef i_dststride
+%undef i_height
+%undef i_dststride3
+%undef i_srcstride
+
+
+;*******************************************************************************
+; void McHorVer20Width16U8ToS16_avx2(const uint8_t *pSrc,   -- source pixels; reading starts two rows above pSrc and 2 bytes to the left
+;                                    int32_t iSrcStride,    -- byte distance between source rows
+;                                    int16_t *pDst,         -- output rows, 32 bytes apart (i_dststride); 32-byte aligned stores
+;                                    int32_t iHeight);      -- number of rows to produce
+;*******************************************************************************
+; Horizontal filter pass that keeps 16-bit results: 32 bytes per output row, two rows per iteration, plus one trailing row when iHeight is odd.
+WELS_EXTERN McHorVer20Width16U8ToS16_avx2
+%define p_src        r0
+%define i_srcstride  r1
+%define p_dst        r2
+%define i_height     r3
+%define i_dststride  32
+    %assign  push_num 0
+    LOAD_4_PARA  ; project macro; presumably brings the four arguments into r0..r3
+    PUSH_XMM 7
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    sub             p_src, i_srcstride  ; step back two source rows
+    sub             p_src, i_srcstride
+    vbroadcasti128  ymm4, [shufb_32435465768798A9]  ; filter constants, replicated into both 128-bit lanes
+    vbroadcasti128  ymm5, [shufb_011267784556ABBC]
+    vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+    sub             i_height, 1  ; bias so the flags after the loop tell odd from even heights
+.yloop:
+    vmovdqu         xmm0, [p_src - 2]  ; row 0: low lane from the left 16 source bytes
+    vinserti128     ymm0, ymm0, [p_src + 6], 1  ; row 0: high lane from 8 bytes further right
+    vmovdqu         xmm1, [p_src + i_srcstride - 2]  ; row 1, same layout
+    vinserti128     ymm1, ymm1, [p_src + i_srcstride + 6], 1
+    lea             p_src, [p_src + 2 * i_srcstride]  ; advance two source rows
+    AVX2_FilterHorizontalbw_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3  ; macro defined elsewhere; result in place, ymm2/ymm3 presumably scratch
+    vmovdqa         [p_dst], ymm0  ; output row 0 (32 bytes)
+    AVX2_FilterHorizontalbw_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
+    vmovdqa         [p_dst + i_dststride], ymm1  ; output row 1
+    add             p_dst, 2 * i_dststride
+    sub             i_height, 2
+    jg              .yloop
+    jl              .done  ; negative: the height was even, nothing is left
+    vmovdqu         xmm0, [p_src - 2]  ; zero: exactly one row remains
+    vinserti128     ymm0, ymm0, [p_src + 6], 1
+    AVX2_FilterHorizontalbw_16px ymm0, ymm4, ymm5, ymm6, ymm1, ymm2
+    vmovdqa         [p_dst], ymm0  ; last output row
+.done:
+    vzeroupper  ; zero the upper YMM halves before returning
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_height
+%undef i_dststride
+
+
+;***********************************************************************
+; void McHorVer02Width9S16ToU8_avx2(const int16_t *pSrc,   -- 16-bit input rows, 32 bytes apart (i_srcstride); 32-byte aligned loads
+;                                   uint8_t *pDst,         -- destination pixels, 9 bytes written per row
+;                                   int32_t iDstStride,    -- byte distance between destination rows
+;                                   int32_t iHeight);      -- 5 rows when iHeight - 1 <= 4, otherwise 8 rows per loop pass plus one final row
+;***********************************************************************
+; Vertical filter pass from 16-bit rows to bytes, one input row per 32-byte load; the registers ymm0..ymm7 rotate through the roles of the six input rows.
+WELS_EXTERN McHorVer02Width9S16ToU8_avx2
+%define p_src        r0
+%define p_dst        r1
+%define i_dststride  r2
+%define i_height     r3
+%define i_srcstride  32
+    %assign  push_num 0
+    LOAD_4_PARA  ; project macro; presumably brings the four arguments into r0..r3
+    PUSH_XMM 8
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    vmovdqa         ymm0, [p_src + 0 * i_srcstride]  ; input rows 0..4 are preloaded
+    vmovdqa         ymm1, [p_src + 1 * i_srcstride]
+    vmovdqa         ymm2, [p_src + 2 * i_srcstride]
+    vmovdqa         ymm3, [p_src + 3 * i_srcstride]
+    vmovdqa         ymm4, [p_src + 4 * i_srcstride]
+    sub             i_height, 1  ; the odd last row is handled outside the 8-row loop
+.height_loop:
+    vmovdqa         ymm5, [p_src + 5 * i_srcstride]  ; input row 5
+    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6  ; macro defined elsewhere; output row 0 in ymm0, ymm6 presumably scratch
+    vmovdqa         ymm6, [p_src + 6 * i_srcstride]  ; input row 6
+    AVX2_FilterVerticalw_16px ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7  ; output row 1 in ymm1
+    vmovdqa         ymm7, [p_src + 7 * i_srcstride]  ; input row 7
+    vpackuswb       ymm0, ymm0, ymm1  ; low lane: columns 0..7 of rows 0|1, high lane: columns 8..15 of rows 0|1
+    vextracti128    xmm1, ymm0, 1
+    vpsllq          xmm1, xmm1, 56  ; move column 8 of each row into the top byte of its qword
+    vmovlps         [p_dst + 1], xmm1  ; output row 0: column 8 lands at byte 8 (bytes 1..7 are overwritten next)
+    vmovlps         [p_dst], xmm0  ; output row 0: bytes 0..7
+    add             p_dst, i_dststride
+    vmovhps         [p_dst + 1], xmm1  ; output row 1, same scheme using the high qwords
+    vmovhps         [p_dst], xmm0
+    add             p_dst, i_dststride
+    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm0  ; output row 2 in ymm2
+    vmovdqa         ymm0, [p_src + 8 * i_srcstride]  ; input row 8
+    AVX2_FilterVerticalw_16px ymm3, ymm4, ymm5, ymm6, ymm7, ymm0, ymm1  ; output row 3 in ymm3
+    vpackuswb       ymm2, ymm2, ymm3
+    vextracti128    xmm3, ymm2, 1
+    vpsllq          xmm3, xmm3, 56
+    vmovlps         [p_dst + 1], xmm3  ; output row 2
+    vmovlps         [p_dst], xmm2
+    add             p_dst, i_dststride
+    vmovhps         [p_dst + 1], xmm3  ; output row 3
+    vmovhps         [p_dst], xmm2
+    add             p_dst, i_dststride
+    vmovdqa         ymm1, [p_src + 9 * i_srcstride]  ; input row 9
+    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm2  ; output row 4 in ymm4
+    vmovdqa         ymm2, [p_src + 10 * i_srcstride]  ; input row 10, read before the height check below
+    AVX2_FilterVerticalw_16px ymm5, ymm6, ymm7, ymm0, ymm1, ymm2, ymm3  ; output row 5 in ymm5
+    vmovdqa         ymm3, [p_src + 11 * i_srcstride]  ; input row 11, also read before the height check
+    vpackuswb       ymm4, ymm4, ymm5
+    vextracti128    xmm5, ymm4, 1
+    vpsllq          xmm5, xmm5, 56
+    vmovlps         [p_dst + 1], xmm5  ; output row 4
+    vmovlps         [p_dst], xmm4
+    cmp             i_height, 4
+    jle             .done  ; biased height <= 4: stop after five rows, row 5 is discarded
+    add             p_dst, i_dststride
+    vmovhps         [p_dst + 1], xmm5  ; output row 5
+    vmovhps         [p_dst], xmm4
+    add             p_dst, i_dststride
+    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm4  ; output row 6 in ymm6
+    vmovdqa         ymm4, [p_src + 12 * i_srcstride]  ; input row 12
+    add             p_src, 8 * i_srcstride  ; advance eight input rows; ymm0..ymm4 now hold rows 0..4 relative to the new p_src
+    AVX2_FilterVerticalw_16px ymm7, ymm0, ymm1, ymm2, ymm3, ymm4, ymm5  ; output row 7 in ymm7
+    vpackuswb       ymm6, ymm6, ymm7
+    vextracti128    xmm7, ymm6, 1
+    vpsllq          xmm7, xmm7, 56
+    vmovlps         [p_dst + 1], xmm7  ; output row 6
+    vmovlps         [p_dst], xmm6
+    add             p_dst, i_dststride
+    vmovhps         [p_dst + 1], xmm7  ; output row 7
+    vmovhps         [p_dst], xmm6
+    add             p_dst, i_dststride
+    sub             i_height, 8  ; eight rows finished
+    jg              .height_loop
+    vmovdqa         ymm5, [p_src + 5 * i_srcstride]  ; tail: one more output row after the last full pass
+    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
+    vpackuswb       ymm0, ymm0, ymm0
+    vextracti128    xmm1, ymm0, 1
+    vpsllq          xmm1, xmm1, 56
+    vmovlps         [p_dst + 1], xmm1  ; final row: column 8 at byte 8
+    vmovlps         [p_dst], xmm0  ; final row: bytes 0..7
+.done:
+    vzeroupper  ; zero the upper YMM halves before returning
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_height
+
+
+;*******************************************************************************
+; void McHorVer20Width17U8ToS16_avx2(const uint8_t *pSrc,   -- source pixels; reading starts two rows above pSrc and 2 bytes to the left
+;                                    int32_t iSrcStride,    -- byte distance between source rows
+;                                    int16_t *pDst,         -- output rows, 64 bytes apart (i_dststride); aligned stores are used
+;                                    int32_t iHeight);      -- row count; 4 rows per loop pass plus two final rows
+;*******************************************************************************
+; Horizontal filter pass that keeps 16-bit results for 17 columns: words 0..15 come from the 16px macro, word 16 from the 4px macro (stored at byte 26 so its last word lands at byte 32).
+WELS_EXTERN McHorVer20Width17U8ToS16_avx2
+%define p_src        r0
+%define i_srcstride  r1
+%define p_dst        r2
+%define i_height     r3
+%define i_srcstride3 r4
+%define i_dststride  64
+    %assign  push_num 0
+%ifdef X86_32
+    push            r4  ; r4 is not an argument register here, so it is saved by hand on 32-bit builds
+    %assign  push_num 1
+%endif
+    LOAD_4_PARA  ; project macro; presumably brings the four arguments into r0..r3
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    sub             p_src, i_srcstride  ; step back two source rows
+    sub             p_src, i_srcstride
+    lea             i_srcstride3, [3 * i_srcstride]
+    vbroadcasti128  ymm5, [shufb_32435465768798A9]  ; filter constants, replicated into both 128-bit lanes
+    vbroadcasti128  ymm6, [shufb_011267784556ABBC]
+    vbroadcasti128  ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+    sub             i_height, 3  ; bias the counter so the loop leaves the final two rows to the tail code
+.yloop:
+    vmovdqu         xmm0, [p_src - 2]  ; ymm0: row 0 (low) / row 1 (high), left 16 source bytes
+    vmovdqu         xmm3, [p_src + 6]  ; ymm3: row 0 (low) / row 1 (high), 8 bytes further right
+    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1
+    vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1
+    vmovdqa         ymm4, ymm3  ; keep the unfiltered right-hand bytes for the 17th column
+    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2  ; macro defined elsewhere; columns 0..7 of rows 0 and 1
+    vmovdqa         [p_dst], xmm0  ; row 0: words 0..7
+    vextracti128    [p_dst + i_dststride], ymm0, 1  ; row 1: words 0..7
+    AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2  ; columns 8..15 of rows 0 and 1 (stored further down)
+    vmovdqu         xmm1, [p_src + 2 * i_srcstride - 2]  ; ymm1 / ymm0: the same layout for rows 2 and 3
+    vmovdqu         xmm0, [p_src + 2 * i_srcstride + 6]
+    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
+    vinserti128     ymm0, ymm0, [p_src + i_srcstride3 + 6], 1
+    lea             p_src, [p_src + 4 * i_srcstride]  ; advance four source rows
+    vpunpckhqdq     ymm4, ymm4, ymm0  ; upper 8 source bytes of rows 0/2 (low lane) and rows 1/3 (high lane)
+    AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2  ; right-edge words for all four rows (macro defined elsewhere)
+    vmovlps         [p_dst + 26], xmm4  ; row 0: words 13..16 (13..15 are overwritten next, word 16 is kept)
+    vmovdqa         [p_dst + 16], xmm3  ; row 0: words 8..15
+    vextracti128    xmm2, ymm4, 1  ; high lane: edge words of rows 1 and 3
+    vmovlps         [p_dst + i_dststride + 26], xmm2  ; row 1: words 13..16
+    vextracti128    [p_dst + i_dststride + 16], ymm3, 1  ; row 1: words 8..15
+    vmovhps         [p_dst + 2 * i_dststride + 26], xmm4  ; row 2: words 13..16, written before the stores that overlap it
+    AVX2_FilterHorizontalbw_16px ymm1, ymm5, ymm6, ymm7, ymm3, ymm4  ; columns 0..7 of rows 2 and 3; ymm3/ymm4 are free to be scratch now
+    vmovdqa         [p_dst + 2 * i_dststride], xmm1  ; row 2: words 0..7
+    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm3, ymm4  ; columns 8..15 of rows 2 and 3
+    vmovdqa         [p_dst + 2 * i_dststride + 16], xmm0  ; row 2: words 8..15
+    vextracti128    [p_dst + 3 * i_dststride], ymm1, 1  ; row 3: words 0..7
+    vmovhps         [p_dst + 3 * i_dststride + 26], xmm2  ; row 3: words 13..16
+    vextracti128    [p_dst + 3 * i_dststride + 16], ymm0, 1  ; row 3: words 8..15
+    add             p_dst, 4 * i_dststride
+    sub             i_height, 4  ; four rows finished
+    jg              .yloop
+    ; Two more rows are always produced after the 4-row loop; here each register holds one whole row (left bytes low, right bytes high).
+    vmovdqu         xmm0, [p_src - 2]  ; ymm0: row 0, left 16 bytes (low) and right 16 bytes (high)
+    vinserti128     ymm0, ymm0, [p_src + 6], 1
+    vmovdqu         xmm3, [p_src + i_srcstride - 2]  ; ymm3: row 1, same layout
+    vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1
+    vpunpckhqdq     ymm4, ymm0, ymm3  ; high lane: upper 8 right-hand source bytes of rows 0 and 1
+    AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2
+    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2  ; row 0: columns 0..15
+    AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2  ; row 1: columns 0..15
+    vextracti128    xmm4, ymm4, 1  ; only the high lane of the edge result is used
+    vmovlps         [p_dst + 26], xmm4  ; row 0: words 13..16
+    vmovdqa         [p_dst], ymm0  ; row 0: words 0..15 (32-byte aligned store)
+    vmovhps         [p_dst + i_dststride + 26], xmm4  ; row 1: words 13..16
+    vmovdqa         [p_dst + i_dststride], ymm3  ; row 1: words 0..15
+    vzeroupper  ; zero the upper YMM halves before returning
+    POP_XMM
+    LOAD_4_PARA_POP
+%ifdef X86_32
+    pop             r4
+%endif
+    ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_height
+%undef i_srcstride3
+
+
+;***********************************************************************
+; void McHorVer02Width16Or17S16ToU8_avx2(const int16_t *pSrc,   -- 16-bit input rows; the main pass uses 32-byte aligned loads
+;                                        int32_t iSrcStride,    -- byte distance between input rows
+;                                        uint8_t *pDst,         -- destination pixels; the main pass uses 16-byte aligned stores
+;                                        int32_t iDstStride,    -- byte distance between destination rows
+;                                        int32_t iWidth,        -- an odd value enables the extra last-column pass
+;                                        int32_t iHeight);      -- 8 rows per loop pass, plus one final row when iHeight - 1 is a multiple of 8
+;***********************************************************************
+; Vertical filter pass from 16-bit rows to bytes: for odd iWidth the last column (index iWidth - 1) is filtered first, then columns 0..15 are filtered and overwrite the scratch bytes that pass left behind.
+WELS_EXTERN McHorVer02Width16Or17S16ToU8_avx2
+%define p_src        r0
+%define i_srcstride  r1
+%define p_dst        r2
+%define i_dststride  r3
+%define i_width      r4
+%define i_height     r5
+%define i_srcstride3 r6
+    %assign  push_num 0
+%ifdef X86_32
+    push            r6  ; r6 is not an argument register here, so it is saved by hand on 32-bit builds
+    %assign  push_num 1
+%endif
+    LOAD_6_PARA  ; project macro; presumably brings the six arguments into r0..r5
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
+    sub             i_height, 1  ; bias so both passes can tell whether a single trailing row remains
+    lea             i_srcstride3, [3 * i_srcstride]
+    test            i_width, 1
+    jz              .align_begin  ; even width: no extra column
+    push            i_height  ; the column pass consumes these three; they are restored for the main pass
+    push            p_src
+    push            p_dst
+    lea             p_src, [p_src + 2 * i_width - 2]  ; point at the last 16-bit column of input row 0
+    add             p_dst, i_width  ; one past the last output byte, so the column sits at p_dst - 1
+    vmovd           xmm0, [p_src]  ; gather the column words of input rows 0..7 into xmm0, one row per word
+    vpunpcklwd      xmm0, xmm0, [p_src + i_srcstride]  ; rows 0,1 interleaved
+    vmovd           xmm1, [p_src + 2 * i_srcstride]
+    add             p_src, i_srcstride3  ; p_src -> row 3
+    vpunpcklwd      xmm1, xmm1, [p_src]  ; rows 2,3
+    vpunpckldq      xmm0, xmm0, xmm1  ; rows 0..3
+    vmovd           xmm1, [p_src + i_srcstride]
+    vpunpcklwd      xmm1, xmm1, [p_src + 2 * i_srcstride]  ; rows 4,5
+    vmovd           xmm2, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]  ; p_src -> row 7
+    vpunpcklwd      xmm2, xmm2, [p_src]  ; rows 6,7
+    vpunpckldq      xmm1, xmm1, xmm2  ; rows 4..7
+    vpunpcklqdq     xmm0, xmm0, xmm1  ; xmm0 = column words of rows 0..7
+.height_loop_unalign:
+    vmovd           xmm1, [p_src + i_srcstride]  ; each step shifts in the next row's word: xmm1 = rows 1..8
+    vpalignr        xmm1, xmm1, xmm0, 2
+    vmovd           xmm2, [p_src + 2 * i_srcstride]  ; xmm2 = rows 2..9
+    vpalignr        xmm2, xmm2, xmm1, 2
+    vmovd           xmm3, [p_src + i_srcstride3]  ; xmm3 = rows 3..10
+    vpalignr        xmm3, xmm3, xmm2, 2
+    lea             p_src, [p_src + 4 * i_srcstride]  ; advance four input rows
+    vmovd           xmm4, [p_src]  ; xmm4 = rows 4..11
+    vpalignr        xmm4, xmm4, xmm3, 2
+    vmovd           xmm5, [p_src + i_srcstride]  ; xmm5 = rows 5..12
+    vpalignr        xmm5, xmm5, xmm4, 2
+    AVX2_FilterVerticalw_16px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7  ; macro defined elsewhere; eight column outputs (rows 0..7) in xmm0
+    vpackuswb       xmm0, xmm0, xmm0  ; bytes 0..7 = output rows 0..7
+    vpslld          xmm6, xmm0, 24  ; top byte of dword 0 = row 0, of dword 1 = row 4
+    vmovd           [p_dst - 4], xmm6  ; row 0: column byte at p_dst - 1; the bytes below it are scratch
+    vmovlps         [p_dst + 4 * i_dststride - 8], xmm6  ; row 4: column byte at p_dst - 1
+    add             p_dst, i_dststride
+    vpslld          xmm6, xmm0, 16  ; top bytes = rows 1 and 5
+    vmovd           [p_dst - 4], xmm6  ; row 1
+    vmovlps         [p_dst + 4 * i_dststride - 8], xmm6  ; row 5
+    add             p_dst, i_dststride
+    vpslld          xmm6, xmm0, 8  ; top bytes = rows 2 and 6; xmm0 unshifted has rows 3 and 7 on top
+    vmovd           [p_dst - 4], xmm6  ; row 2
+    vmovd           [p_dst + i_dststride - 4], xmm0  ; row 3
+    lea             p_dst, [p_dst + 4 * i_dststride]
+    vmovlps         [p_dst - 8], xmm6  ; row 6
+    vmovlps         [p_dst + i_dststride - 8], xmm0  ; row 7
+    lea             p_dst, [p_dst + 2 * i_dststride]  ; p_dst -> row 8
+    sub             i_height, 8  ; eight rows finished
+    jle             .height_loop_unalign_exit
+    vmovd           xmm1, [p_src + 2 * i_srcstride]  ; rebuild xmm0 = column words of the next eight rows
+    vpalignr        xmm1, xmm1, xmm5, 2
+    vmovd           xmm0, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    vpunpcklwd      xmm0, xmm0, [p_src]
+    vpalignr        xmm0, xmm0, xmm1, 4
+    jmp             .height_loop_unalign
+.height_loop_unalign_exit:
+    vpbroadcastq    xmm6, [p_src + 2 * i_srcstride - 6]  ; the qword's last word is the column word of the one extra input row; it ends up in word 7
+    AVX2_FilterVerticalw_16px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7  ; only word 7 of the result is used: the final output row
+    vpackuswb       xmm1, xmm1, xmm1
+    vmovlps         [p_dst - 8], xmm1  ; final row: byte 7 of the result lands at p_dst - 1
+    pop             p_dst  ; restore the arguments for the main pass
+    pop             p_src
+    pop             i_height
+.align_begin:
+    vmovdqa         ymm0, [p_src]  ; input rows 0..4 are preloaded, one 32-byte row per register
+    vmovdqa         ymm1, [p_src + i_srcstride]
+    vmovdqa         ymm2, [p_src + 2 * i_srcstride]
+    vmovdqa         ymm3, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]
+    vmovdqa         ymm4, [p_src]
+.height_loop:
+    vmovdqa         ymm5, [p_src + i_srcstride]  ; next input row
+    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6  ; output row 0 in ymm0, ymm6 presumably scratch
+    vmovdqa         ymm6, [p_src + 2 * i_srcstride]
+    AVX2_FilterVerticalw_16px ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7  ; output row 1 in ymm1
+    vmovdqa         ymm7, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]  ; advance four input rows
+    vpackuswb       ymm0, ymm0, ymm1  ; per-lane pack leaves the two rows interleaved by qword
+    vpermq          ymm0, ymm0, 11011000b  ; qword order 0,2,1,3: low lane = row 0, high lane = row 1
+    vmovdqa         [p_dst], xmm0  ; output row 0 (16 bytes, aligned)
+    vextracti128    [p_dst + i_dststride], ymm0, 1  ; output row 1
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm0  ; output row 2
+    vmovdqa         ymm0, [p_src]
+    AVX2_FilterVerticalw_16px ymm3, ymm4, ymm5, ymm6, ymm7, ymm0, ymm1  ; output row 3
+    vpackuswb       ymm2, ymm2, ymm3
+    vpermq          ymm2, ymm2, 11011000b
+    vmovdqa         [p_dst], xmm2  ; output row 2
+    vextracti128    [p_dst + i_dststride], ymm2, 1  ; output row 3
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    vmovdqa         ymm1, [p_src + i_srcstride]
+    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm2  ; output row 4
+    vmovdqa         ymm2, [p_src + 2 * i_srcstride]
+    AVX2_FilterVerticalw_16px ymm5, ymm6, ymm7, ymm0, ymm1, ymm2, ymm3  ; output row 5
+    vmovdqa         ymm3, [p_src + i_srcstride3]
+    lea             p_src, [p_src + 4 * i_srcstride]  ; advance four input rows
+    vpackuswb       ymm4, ymm4, ymm5
+    vpermq          ymm4, ymm4, 11011000b
+    vmovdqa        [p_dst], xmm4  ; output row 4
+    vextracti128   [p_dst + i_dststride], ymm4, 1  ; output row 5
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm4  ; output row 6
+    vmovdqa         ymm4, [p_src]  ; ymm0..ymm4 again hold the five most recent input rows, as at loop entry
+    AVX2_FilterVerticalw_16px ymm7, ymm0, ymm1, ymm2, ymm3, ymm4, ymm5  ; output row 7
+    vpackuswb       ymm6, ymm6, ymm7
+    vpermq          ymm6, ymm6, 11011000b
+    vmovdqa         [p_dst], xmm6  ; output row 6
+    vextracti128    [p_dst + i_dststride], ymm6, 1  ; output row 7
+    lea             p_dst, [p_dst + 2 * i_dststride]
+    sub             i_height, 8  ; eight rows finished
+    jg              .height_loop
+    jl              .done  ; negative: no trailing row
+    vmovdqa         ymm5, [p_src + i_srcstride]  ; zero: exactly one trailing row remains
+    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
+    vpackuswb       ymm0, ymm0, ymm0
+    vpermq          ymm0, ymm0, 11011000b
+    vmovdqa         [p_dst], xmm0  ; final output row
+.done:
+    vzeroupper  ; zero the upper YMM halves before returning
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop             r6
+%endif
+    ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_width
+%undef i_height
+%undef i_srcstride3
+
+%endif ; HAVE_AVX2
--- a/test/encoder/EncUT_MotionCompensation.cpp
+++ b/test/encoder/EncUT_MotionCompensation.cpp
@@ -168,8 +168,8 @@
 DEF_MCCOPYTEST (8, 16)
 DEF_MCCOPYTEST (16, 16)
 
-#define DEF_LUMA_MCTEST(iW,iH) \
-TEST(McHorVer,iW##x##iH)  \
+#define DEF_LUMA_MCTEST(iW, iH, cpu_flags, name_suffix) /* defines one McHorVer test named iWxiH_name_suffix; cpu_flags masks the detected CPU features */ \
+TEST(McHorVer, iW##x##iH##_##name_suffix) \
 {                       \
     for (int32_t a = 0; a < 4; a++) { \
     for (int32_t b = 0; b < 4; b++) { \
@@ -191,43 +191,41 @@
         uSrcAnchor[0][j][i] = uSrcTest[j][i] = rand()%256;  \
       }\
     }\
-    int32_t iCpuCores = 1; \
-    uint32_t uiCpuFlag;\
-    for(int32_t k =0; k<2; k++)\
-    {\
-      if(k==0)\
-      {\
-        uiCpuFlag = 0;\
-      }else \
-      {\
-        uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores); \
-      }\
-      InitMcFunc(&sMcFunc,uiCpuFlag);\
-      memset(uDstAnchor,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \
-      memset(uDstTest,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \
-      MCHalfPelFilterAnchor(uSrcInputAnchor[1],uSrcInputAnchor[2],uSrcInputAnchor[3],uSrcInputAnchor[0],MC_BUFF_SRC_STRIDE,iW+1,iH+1,pBuf+4); \
-      MCLumaAnchor(uDstAnchor[0],MC_BUFF_DST_STRIDE,uSrcInputAnchor,MC_BUFF_SRC_STRIDE,a,b,iW,iH); \
-      sMcFunc.pMcLumaFunc(&uSrcTest[4][4],MC_BUFF_SRC_STRIDE,uDstTest[0],MC_BUFF_DST_STRIDE,a,b,iW,iH);\
-      for(int32_t j=0;j<MC_BUFF_HEIGHT;j++)   \
-      {                                                                             \
-          for(int32_t i=0;i<MC_BUFF_DST_STRIDE;i++)                                  \
-          {                                                                           \
-              ASSERT_EQ(uDstAnchor[j][i],uDstTest[j][i]);                              \
-          }                                                                             \
-      }                                                                                \
-    }\
+    InitMcFunc(&sMcFunc, WelsCPUFeatureDetect (0) & (cpu_flags)); /* only detected feature bits that are also set in cpu_flags reach InitMcFunc */ \
+    memset(uDstAnchor,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); /* both destination buffers start zeroed so they can be compared in full */ \
+    memset(uDstTest,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \
+    MCHalfPelFilterAnchor(uSrcInputAnchor[1],uSrcInputAnchor[2],uSrcInputAnchor[3],uSrcInputAnchor[0],MC_BUFF_SRC_STRIDE,iW+1,iH+1,pBuf+4); \
+    MCLumaAnchor(uDstAnchor[0],MC_BUFF_DST_STRIDE,uSrcInputAnchor,MC_BUFF_SRC_STRIDE,a,b,iW,iH); /* anchor result for arguments (a, b) */ \
+    sMcFunc.pMcLumaFunc(&uSrcTest[4][4],MC_BUFF_SRC_STRIDE,uDstTest[0],MC_BUFF_DST_STRIDE,a,b,iW,iH); /* function under test, same (a, b) and block size */\
+    for(int32_t j=0;j<MC_BUFF_HEIGHT;j++)   \
+    {                                                                             \
+        for(int32_t i=0;i<MC_BUFF_DST_STRIDE;i++)                                  \
+        {                                                                           \
+            ASSERT_EQ(uDstAnchor[j][i],uDstTest[j][i]); /* every byte of the two destination buffers is compared, not just the iW x iH block */ \
+        }                                                                             \
+    }                                                                                \
     }\
     }\
 }
 
+#define DEF_LUMA_MCTESTS(cpu_flags, name_suffix) /* expands DEF_LUMA_MCTEST once per block size listed below, all sharing one CPU-flag mask and name suffix */ \
+    DEF_LUMA_MCTEST ( 4,  4, cpu_flags, name_suffix) \
+    DEF_LUMA_MCTEST ( 4,  8, cpu_flags, name_suffix) \
+    DEF_LUMA_MCTEST ( 8,  4, cpu_flags, name_suffix) \
+    DEF_LUMA_MCTEST ( 8,  8, cpu_flags, name_suffix) \
+    DEF_LUMA_MCTEST (16,  8, cpu_flags, name_suffix) \
+    DEF_LUMA_MCTEST ( 8, 16, cpu_flags, name_suffix) \
+    DEF_LUMA_MCTEST (16, 16, cpu_flags, name_suffix)
 
-DEF_LUMA_MCTEST (4, 4)
-DEF_LUMA_MCTEST (4, 8)
-DEF_LUMA_MCTEST (8, 4)
-DEF_LUMA_MCTEST (8, 8)
-DEF_LUMA_MCTEST (16, 8)
-DEF_LUMA_MCTEST (8, 16)
-DEF_LUMA_MCTEST (16, 16)
+DEF_LUMA_MCTESTS(0, c) /* mask 0: no detected feature bit is passed to InitMcFunc */
+DEF_LUMA_MCTESTS(~0, native) /* mask ~0: every detected feature bit is passed to InitMcFunc */
+#ifdef X86_ASM /* the masks below use x86 feature flags, so they are only instantiated for x86 asm builds */
+DEF_LUMA_MCTESTS(WELS_CPU_SSE2, sse2) /* keeps at most the SSE2 bit of the detected features */
+DEF_LUMA_MCTESTS(WELS_CPU_SSE2 | WELS_CPU_SSSE3, ssse3) /* keeps at most the SSE2 and SSSE3 bits */
+#ifdef HAVE_AVX2
+DEF_LUMA_MCTESTS(WELS_CPU_SSE2 | WELS_CPU_SSSE3 | WELS_CPU_AVX2, avx2) /* keeps at most the SSE2, SSSE3 and AVX2 bits; bits the host lacks stay cleared by the mask */
+#endif
+#endif
 
 #define DEF_CHROMA_MCTEST(iW,iH) \
 TEST(McChroma,iW##x##iH)  \
@@ -315,61 +313,89 @@
   }
 }
 
-#define DEF_HALFPEL_MCTEST(iW,iH) \
-TEST (EncMcHalfpel, iW##x##iH) { \
+#define DEF_HALFPEL_MCTEST(iW, iH, cpu_flags, name_suffix) /* defines one EncMcHalfpel test named iWxiH_name_suffix; checks the Hor/Ver/Cen half-pel functions against the anchor and checks that nothing outside the requested area is written */ \
+TEST (EncMcHalfpel, iW##x##iH##_##name_suffix) { \
     SMcFunc sMcFunc; \
-    for (int32_t k = 0; k < 2; k++) { \
-        for (int32_t w = 0; w < 2; w++) { \
-            int32_t width = iW ; \
-            int32_t height = iH; \
-            uint8_t uAnchor[4][MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \
-            uint8_t uSrcTest[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \
-            ENFORCE_STACK_ALIGN_2D (uint8_t, uDstTest, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
-            uint8_t* uAnchors[4]; \
-            int16_t pBuf[MC_BUFF_DST_STRIDE]; \
-            uAnchors[0] = &uAnchor[0][4][4]; \
-            uAnchors[1] = &uAnchor[1][4][4]; \
-            uAnchors[2] = &uAnchor[2][4][4]; \
-            uAnchors[3] = &uAnchor[3][4][4]; \
-             \
-            memset (uAnchor, 0, 4 * sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_SRC_STRIDE); \
-            memset (uDstTest, 0, sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_DST_STRIDE); \
-            for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \
-                for (int32_t i = 0; i < MC_BUFF_SRC_STRIDE; i++) { \
-                    uAnchor[0][j][i] = uSrcTest[j][i] = rand() % 256; \
-                } \
+    for (int32_t w = 0; w < 2; w++) { /* w is not referenced in the body; the whole check simply runs twice with fresh random data */ \
+        int32_t width = iW ; \
+        int32_t height = iH; \
+        uint8_t uAnchor[4][MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \
+        uint8_t uSrcTest[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \
+        uint8_t uRand[MC_BUFF_HEIGHT][MC_BUFF_DST_STRIDE]; /* random pre-fill for uDstTest, kept so untouched bytes can be verified afterwards */ \
+        ENFORCE_STACK_ALIGN_2D (uint8_t, uDstTest, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
+        uint8_t* uAnchors[4]; \
+        int16_t pBuf[MC_BUFF_DST_STRIDE]; \
+        uAnchors[0] = &uAnchor[0][4][4]; /* each anchor plane is addressed at a (4, 4) offset, matching &uSrcTest[4][4] below */ \
+        uAnchors[1] = &uAnchor[1][4][4]; \
+        uAnchors[2] = &uAnchor[2][4][4]; \
+        uAnchors[3] = &uAnchor[3][4][4]; \
+         \
+        memset (uAnchor, 0, 4 * sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_SRC_STRIDE); \
+        memset (uDstTest, 0, sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_DST_STRIDE); \
+        for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \
+            for (int32_t i = 0; i < MC_BUFF_SRC_STRIDE; i++) { \
+                uAnchor[0][j][i] = uSrcTest[j][i] = rand() % 256; /* anchor plane 0 and the test source share identical random pixels */ \
+                if (i < MC_BUFF_DST_STRIDE) uRand[j][i] = rand() % 256; /* this loop is bounded by the source stride; the guard keeps the store inside uRand's destination-stride rows */ \
             } \
-             \
-            uint32_t uiCpuFlag = k == 0 ? 0 : WelsCPUFeatureDetect (NULL); \
-            InitMcFunc (&sMcFunc, uiCpuFlag); \
-             \
-            MCHalfPelFilterAnchor (uAnchors[1], uAnchors[2], uAnchors[3], uAnchors[0], MC_BUFF_SRC_STRIDE, width + 1, height + 1, pBuf + 4); \
-            sMcFunc.pfLumaHalfpelHor (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height); \
-            for (int32_t j = 0; j < height; j++) { \
-                for (int32_t i = 0; i < width + 1; i++) { \
-                    ASSERT_EQ (uAnchor[1][4 + j][4 + i], uDstTest[j][i]); \
-                } \
+        } \
+         \
+        InitMcFunc (&sMcFunc, WelsCPUFeatureDetect (0) & (cpu_flags)); /* only detected feature bits that are also set in cpu_flags reach InitMcFunc */ \
+         \
+        MCHalfPelFilterAnchor (uAnchors[1], uAnchors[2], uAnchors[3], uAnchors[0], MC_BUFF_SRC_STRIDE, width + 1, height + 1, pBuf + 4); \
+        memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); /* reset the destination to the known random pattern before each call */ \
+        sMcFunc.pfLumaHalfpelHor (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height); \
+        for (int32_t j = 0; j < height; j++) { \
+            for (int32_t i = 0; i < width + 1; i++) { \
+                ASSERT_EQ (uAnchor[1][4 + j][4 + i], uDstTest[j][i]); /* Hor output is compared with anchor plane 1 */ \
             } \
-            sMcFunc.pfLumaHalfpelVer (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width, height + 1); \
-            for (int32_t j = 0; j < height + 1; j++) { \
-                for (int32_t i = 0; i < width; i++) { \
-                    ASSERT_EQ (uAnchor[2][4 + j][4 + i], uDstTest[j][i]); \
-                } \
+        } \
+        for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \
+            for (int32_t i = j < height ? width + 1 : 0; i < MC_BUFF_DST_STRIDE; i++) { /* columns right of the block on written rows, and whole rows below it */ \
+                ASSERT_EQ (uRand[j][i], uDstTest[j][i]); /* bytes outside the (width + 1) x height area must still hold the pre-fill */ \
             } \
-            sMcFunc.pfLumaHalfpelCen (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height + 1); \
-            for (int32_t j = 0; j < height + 1; j++) { \
-                for (int32_t i = 0; i < width + 1; i++) { \
-                    ASSERT_EQ (uAnchor[3][4 + j][4 + i], uDstTest[j][i]); \
-                } \
+        } \
+        memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); \
+        sMcFunc.pfLumaHalfpelVer (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width, height + 1); \
+        for (int32_t j = 0; j < height + 1; j++) { \
+            for (int32_t i = 0; i < width; i++) { \
+                ASSERT_EQ (uAnchor[2][4 + j][4 + i], uDstTest[j][i]); /* Ver output is compared with anchor plane 2 */ \
             } \
         } \
+        for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \
+            for (int32_t i = j < height + 1 ? width : 0; i < MC_BUFF_DST_STRIDE; i++) { \
+                ASSERT_EQ (uRand[j][i], uDstTest[j][i]); /* bytes outside the width x (height + 1) area must still hold the pre-fill */ \
+            } \
+        } \
+        memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); \
+        sMcFunc.pfLumaHalfpelCen (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height + 1); \
+        for (int32_t j = 0; j < height + 1; j++) { \
+            for (int32_t i = 0; i < width + 1; i++) { \
+                ASSERT_EQ (uAnchor[3][4 + j][4 + i], uDstTest[j][i]); /* Cen output is compared with anchor plane 3 */ \
+            } \
+        } \
+        for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \
+            for (int32_t i = j < height + 1 ? width + 1 : 0; i < MC_BUFF_DST_STRIDE; i++) { \
+                ASSERT_EQ (uRand[j][i], uDstTest[j][i]); /* bytes outside the (width + 1) x (height + 1) area must still hold the pre-fill */ \
+            } \
+        } \
     } \
 }
 
-DEF_HALFPEL_MCTEST(4,4)
-DEF_HALFPEL_MCTEST(4,8)
-DEF_HALFPEL_MCTEST(8,4)
-DEF_HALFPEL_MCTEST(8,8)
-DEF_HALFPEL_MCTEST(8,16)
-DEF_HALFPEL_MCTEST(16,8)
-DEF_HALFPEL_MCTEST(16,16)
+#define DEF_HALFPEL_MCTESTS(cpu_flags, name_suffix) /* expands DEF_HALFPEL_MCTEST once per block size listed below, all sharing one CPU-flag mask and name suffix */ \
+    DEF_HALFPEL_MCTEST( 4 , 4, cpu_flags, name_suffix) \
+    DEF_HALFPEL_MCTEST( 4,  8, cpu_flags, name_suffix) \
+    DEF_HALFPEL_MCTEST( 8,  4, cpu_flags, name_suffix) \
+    DEF_HALFPEL_MCTEST( 8,  8, cpu_flags, name_suffix) \
+    DEF_HALFPEL_MCTEST( 8, 16, cpu_flags, name_suffix) \
+    DEF_HALFPEL_MCTEST(16,  8, cpu_flags, name_suffix) \
+    DEF_HALFPEL_MCTEST(16, 16, cpu_flags, name_suffix)
+
+DEF_HALFPEL_MCTESTS(0, c) /* mask 0: no detected feature bit is passed to InitMcFunc */
+DEF_HALFPEL_MCTESTS(~0, native) /* mask ~0: every detected feature bit is passed to InitMcFunc */
+#ifdef X86_ASM /* the masks below use x86 feature flags, so they are only instantiated for x86 asm builds */
+DEF_HALFPEL_MCTESTS(WELS_CPU_SSE2, sse2) /* keeps at most the SSE2 bit of the detected features */
+DEF_HALFPEL_MCTESTS(WELS_CPU_SSE2 | WELS_CPU_SSSE3, ssse3) /* keeps at most the SSE2 and SSSE3 bits */
+#ifdef HAVE_AVX2
+DEF_HALFPEL_MCTESTS(WELS_CPU_SSE2 | WELS_CPU_SSSE3 | WELS_CPU_AVX2, avx2) /* keeps at most the SSE2, SSSE3 and AVX2 bits; bits the host lacks stay cleared by the mask */
+#endif
+#endif