shithub: openh264

Download patch

ref: 4390f83cecc820beea75016b9aa74544d9d5d3e4
parent: e9980927f7ddd347981839aec144441aa78258d4
author: gxw <guxiwei-hf@loongson.cn>
date: Wed Aug 1 05:42:14 EDT 2018

Add support for the Loongson platform

Add optimized file codec/common/mips64/deblock_mmi.c and
corresponding unit tests for the Loongson platform

Change-Id: Icfbdd1f5f58d5e4a1abfb6150c7135ee9f227ba2

diff: cannot open b/codec/common/mips64//null: file does not exist: 'b/codec/common/mips64//null'
--- a/build/arch.mk
+++ b/build/arch.mk
@@ -29,3 +29,15 @@
 CFLAGS += -DHAVE_NEON_AARCH64
 endif
 endif
+
+#for loongson
+ifneq ($(filter mips64, $(ARCH)),)
+ifeq ($(USE_ASM), Yes)
+ASM_ARCH = mips64
+ASMFLAGS += -I$(SRC_PATH)codec/common/mips64/
+LOONGSON3A = $(shell g++ -dM -E - < /dev/null | grep '_MIPS_TUNE ' | cut -f 3 -d " ")
+ifeq ($(LOONGSON3A), "loongson3a")
+CFLAGS += -DHAVE_MMI
+endif
+endif
+endif
--- /dev/null
+++ b/codec/common/inc/asmdefs_mmi.h
@@ -1,0 +1,339 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Loongson Technology Co.,Ltd.
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef ASMDEFS_MMI_H_
+#define ASMDEFS_MMI_H_
+
+#define CACHE_LINE_SIZE 32
+
+#if defined(__mips64) && defined(__LP64__)
+# define mips_reg       int64_t
+# define PTR_ADDU       "daddu "
+# define PTR_ADDIU      "daddiu "
+# define PTR_ADDI       "daddi "
+# define PTR_SUBU       "dsubu "
+# define PTR_L          "ld "
+# define PTR_SRA        "dsra "
+# define PTR_SRL        "dsrl "
+# define PTR_SLL        "dsll "
+# define PTR_MTC1       "dmtc1 "
+# define PTR_LI         "dli "
+#else
+# define mips_reg       int32_t
+# define PTR_ADDU       "addu "
+# define PTR_ADDIU      "addiu "
+# define PTR_ADDI       "addi "
+# define PTR_SUBU       "subu "
+# define PTR_L          "lw "
+# define PTR_SRA        "sra "
+# define PTR_SRL        "srl "
+# define PTR_SLL        "sll "
+# define PTR_MTC1       "mtc1 "
+# define PTR_LI         "li "
+#endif
+
+#define MMI_XSawp_BH(f0, f2, f4, f6, f8, f10) \
+           "mov.d      "#f8", "#f2"                \n\t" \
+           "punpckhbh  "#f2", "#f0", "#f4"         \n\t" \
+           "punpcklbh  "#f0", "#f0", "#f4"         \n\t" \
+           "punpckhbh  "#f10", "#f8", "#f6"        \n\t" \
+           "punpcklbh  "#f8", "#f8", "#f6"         \n\t"
+
+#define MMI_XSawp_HW(f0, f2, f4, f6, f8, f10) \
+           "mov.d      "#f8", "#f2"                \n\t" \
+           "punpckhhw  "#f2", "#f0", "#f4"         \n\t" \
+           "punpcklhw  "#f0", "#f0", "#f4"         \n\t" \
+           "punpckhhw  "#f10", "#f8", "#f6"        \n\t" \
+           "punpcklhw  "#f8", "#f8", "#f6"         \n\t"
+
+#define MMI_XSawp_WD(f0, f2, f4, f6, f8, f10) \
+           "mov.d      "#f8", "#f2"                \n\t" \
+           "punpckhwd  "#f2", "#f0", "#f4"         \n\t" \
+           "punpcklwd  "#f0", "#f0", "#f4"         \n\t" \
+           "punpckhwd  "#f10", "#f8", "#f6"        \n\t" \
+           "punpcklwd  "#f8", "#f8", "#f6"         \n\t"
+
+#define MMI_XSawp_DQ(f0, f2, f4, f6, f8, f10) \
+           "mov.d      "#f8", "#f2"                \n\t" \
+           "mov.d      "#f2", "#f4"                \n\t" \
+           "mov.d      "#f10", "#f6"               \n\t"
+
+#define WELS_AbsH(f0, f2, f4, f6, f8, f10) \
+           "xor        "#f8", "#f8", "#f8"         \n\t" \
+           "psubh      "#f10", "#f8", "#f6"        \n\t" \
+           "psubh      "#f8", "#f8", "#f4"         \n\t" \
+           "pmaxsh     "#f0", "#f4", "#f8"         \n\t" \
+           "pmaxsh     "#f2", "#f6", "#f10"        \n\t"
+
+#define MMI_SumSub(f0, f2, f4, f6, f8, f10) \
+           "mov.d      "#f8", "#f4"                    \n\t" \
+           "mov.d      "#f10", "#f6"                   \n\t" \
+           "paddh      "#f4", "#f4", "#f0"             \n\t" \
+           "paddh      "#f6", "#f6", "#f2"             \n\t" \
+           "psubh      "#f0", "#f0", "#f8"             \n\t" \
+           "psubh      "#f2", "#f2", "#f10"            \n\t"
+
+#define MMI_LoadDiff8P(f0, f2, f4, f6, f8, r0, r1) \
+           "gsldlc1    "#f0", 0x7("#r0")               \n\t" \
+           "gsldlc1    "#f4", 0x7("#r1")               \n\t" \
+           "gsldrc1    "#f0", 0x0("#r0")               \n\t" \
+           "gsldrc1    "#f4", 0x0("#r1")               \n\t" \
+           "punpckhbh  "#f2", "#f0", "#f8"             \n\t" \
+           "punpcklbh  "#f0", "#f0", "#f8"             \n\t" \
+           "punpckhbh  "#f6", "#f4", "#f8"             \n\t" \
+           "punpcklbh  "#f4", "#f4", "#f8"             \n\t" \
+           "psubh      "#f0", "#f0", "#f4"             \n\t" \
+           "psubh      "#f2", "#f2", "#f6"             \n\t"
+
+#define MMI_TransTwo4x4H(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
+           MMI_XSawp_HW(f0, f2, f4, f6, f16, f18)  \
+           MMI_XSawp_HW(f8, f10, f12, f14, f4, f6) \
+           MMI_XSawp_WD(f0, f2, f8, f10, f12, f14) \
+           MMI_XSawp_WD(f16, f18, f4, f6, f8, f10) \
+           MMI_XSawp_DQ(f0, f2, f16, f18, f4, f6)  \
+           MMI_XSawp_DQ(f12, f14, f8, f10, f16, f18)
+
+#define MMI_TransTwo8x8B(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26, f28, f30, r0, r1) \
+           "dmfc1      "#r0", "#f28"                   \n\t" \
+           "dmfc1      "#r1", "#f30"                   \n\t" \
+           MMI_XSawp_BH(f0, f2, f4, f6, f28, f30)            \
+           MMI_XSawp_BH(f8, f10, f12, f14, f4, f6)           \
+           MMI_XSawp_BH(f16, f18, f20, f22, f12, f14)        \
+           "dmtc1      "#r0", "#f20"                   \n\t" \
+           "dmtc1      "#r1", "#f22"                   \n\t" \
+           "dmfc1      "#r0", "#f12"                   \n\t" \
+           "dmfc1      "#r1", "#f14"                   \n\t" \
+           MMI_XSawp_BH(f24, f26, f20, f22, f12, f14)        \
+           MMI_XSawp_HW(f0, f2, f8, f10, f20, f22)           \
+           MMI_XSawp_HW(f28, f30, f4, f6, f8, f10)           \
+           MMI_XSawp_HW(f16, f18, f24, f26, f4, f6)          \
+           "dmtc1      "#r0", "#f24"                   \n\t" \
+           "dmtc1      "#r1", "#f26"                   \n\t" \
+           "dmfc1      "#r0", "#f8"                    \n\t" \
+           "dmfc1      "#r1", "#f10"                   \n\t" \
+           MMI_XSawp_HW(f24, f26, f12, f14, f8, f10)         \
+           MMI_XSawp_WD(f0, f2, f16, f18, f12, f14)          \
+           MMI_XSawp_WD(f20, f22, f4, f6, f16, f18)          \
+           MMI_XSawp_WD(f28, f30, f24, f26, f4, f6)          \
+           "dmtc1      "#r0", "#f24"                   \n\t" \
+           "dmtc1      "#r1", "#f26"                   \n\t" \
+           "dmfc1      "#r0", "#f16"                   \n\t" \
+           "dmfc1      "#r1", "#f18"                   \n\t" \
+           MMI_XSawp_WD(f24, f26, f8, f10, f16, f18)         \
+           MMI_XSawp_DQ(f0, f2, f28, f30, f8, f10)           \
+           MMI_XSawp_DQ(f12, f14, f4, f6, f28, f30)          \
+           MMI_XSawp_DQ(f20, f22, f24, f26, f4, f6)          \
+           "dmtc1      "#r0", "#f24"                   \n\t" \
+           "dmtc1      "#r1", "#f26"                   \n\t" \
+           "dmfc1      "#r0", "#f0"                    \n\t" \
+           "dmfc1      "#r1", "#f2"                    \n\t" \
+           MMI_XSawp_DQ(f24, f26, f16, f18, f0, f2)          \
+           "dmtc1      "#r0", "#f16"                   \n\t" \
+           "dmtc1      "#r1", "#f18"                   \n\t"
+
+#define MMI_XSwap_HW_SINGLE(f0, f2, f4) \
+           "mov.d      "#f4", "#f0"                    \n\t" \
+           "punpckhhw  "#f4", "#f4", "#f2"             \n\t" \
+           "punpcklhw  "#f0", "#f0", "#f2"             \n\t"
+
+#define MMI_XSwap_WD_SINGLE(f0, f2, f4) \
+           "mov.d      "#f4", "#f0"                    \n\t" \
+           "punpckhwd  "#f4", "#f4", "#f2"             \n\t" \
+           "punpcklwd  "#f0", "#f0", "#f2"             \n\t"
+
+#define MMI_Trans4x4H_SINGLE(f0, f2, f4, f6, f8) \
+           MMI_XSwap_HW_SINGLE(f0, f2, f8)              \
+           MMI_XSwap_HW_SINGLE(f4, f6, f2)              \
+           MMI_XSwap_WD_SINGLE(f0, f4, f6)              \
+           MMI_XSwap_WD_SINGLE(f8, f2, f4)
+
+#define MMI_SumSub_SINGLE(f0, f2, f4) \
+           "mov.d      "#f4", "#f2"                    \n\t" \
+           "psubh      "#f2", "#f2", "#f0"             \n\t" \
+           "paddh      "#f0", "#f0", "#f4"             \n\t"
+
+#define MMI_SumSubMul2_SINGLE(f0, f2, f4, f6) \
+           "mov.d      "#f4", "#f0"                    \n\t" \
+           "psllh      "#f0", "#f0", "#f6"             \n\t" \
+           "paddh      "#f0", "#f0", "#f2"             \n\t" \
+           "psllh      "#f2", "#f2", "#f6"             \n\t" \
+           "psubh      "#f4", "#f4", "#f2"             \n\t"
+
+//f4 should be 0x0
+#define MMI_Copy8Times(f0, f2, f4, r0) \
+           "dmtc1      "#r0", "#f0"                    \n\t" \
+           "pshufh     "#f0", "#f0", "#f4"             \n\t" \
+           "mov.d      "#f2", "#f0"                    \n\t"
+
+//f4 should be 0x0
+#define MMI_Copy16Times(f0, f2, f4, r0) \
+           "dmtc1      "#r0", "#f0"                    \n\t" \
+           "punpcklbh  "#f0", "#f0", "#f0"             \n\t" \
+           "pshufh     "#f0", "#f0", "#f4"             \n\t" \
+           "mov.d      "#f2", "#f0"                    \n\t"
+
+#define MMI_SumSubDiv2_SINGLE(f0, f2, f4, f6) \
+           "psrah      "#f4", "#f2", "#f6"             \n\t" \
+           "paddh      "#f4", "#f4", "#f0"             \n\t" \
+           "psrah      "#f0", "#f0", "#f6"             \n\t" \
+           "psubh      "#f0", "#f0", "#f2"             \n\t"
+
+#define MMI_IDCT_SINGLE(f0, f2, f4, f6, f8, f10, f12) \
+           MMI_SumSub_SINGLE(f6, f8, f10)             \
+           MMI_SumSubDiv2_SINGLE(f4, f2, f0, f12)     \
+           MMI_SumSub_SINGLE(f0, f6, f10)             \
+           MMI_SumSub_SINGLE(f4, f8, f10)
+
+#define MMI_StoreDiff4P_SINGLE(f0, f2, f4, f6, r0, r1, f8) \
+           "gsldlc1    "#f2", 0x7("#r1")               \n\t" \
+           "gsldrc1    "#f2", 0x0("#r1")               \n\t" \
+           "punpcklbh  "#f2", "#f2", "#f6"             \n\t" \
+           "paddh      "#f0", "#f0", "#f4"             \n\t" \
+           "psrah      "#f0", "#f0", "#f8"             \n\t" \
+           "paddsh     "#f0", "#f0", "#f2"             \n\t" \
+           "packushb   "#f0", "#f0", "#f2"             \n\t" \
+           "gsswlc1    "#f0", 0x3("#r0")               \n\t" \
+           "gsswrc1    "#f0", 0x0("#r0")               \n\t"
+
+#define SUMH_HORIZON(f0, f2, f4, f6, f8) \
+           "paddh      "#f0", "#f0", "#f2"                       \n\t" \
+           "punpckhhw  "#f2", "#f0", "#f8"                       \n\t" \
+           "punpcklhw  "#f0", "#f0", "#f8"                       \n\t" \
+           "paddw      "#f0", "#f0", "#f2"                       \n\t" \
+           "punpckhwd  "#f2", "#f0", "#f0"                       \n\t" \
+           "paddw      "#f0", "#f0", "#f2"                       \n\t"
+
+#define LOAD_COLUMN(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
+           "daddu      "#r2", "#r0", "#r1"                       \n\t" \
+           "gsldlc1    "#f0", 0x7("#r0")                         \n\t" \
+           "gsldlc1    "#f4", 0x7("#r2")                         \n\t" \
+           "gsldrc1    "#f0", 0x0("#r0")                         \n\t" \
+           "gsldrc1    "#f4", 0x0("#r2")                         \n\t" \
+           "punpcklbh  "#f0", "#f0", "#f4"                       \n\t" \
+           "daddu      "#r0", "#r2", "#r1"                       \n\t" \
+           "daddu      "#r2", "#r0", "#r1"                       \n\t" \
+           "gsldlc1    "#f8", 0x7("#r0")                         \n\t" \
+           "gsldlc1    "#f4", 0x7("#r2")                         \n\t" \
+           "gsldrc1    "#f8", 0x0("#r0")                         \n\t" \
+           "gsldrc1    "#f4", 0x0("#r2")                         \n\t" \
+           "punpcklbh  "#f8", "#f8", "#f4"                       \n\t" \
+           "punpckhhw  "#f2", "#f0", "#f8"                       \n\t" \
+           "punpcklhw  "#f0", "#f0", "#f8"                       \n\t" \
+           "daddu      "#r0", "#r2", "#r1"                       \n\t" \
+           "daddu      "#r2", "#r0", "#r1"                       \n\t" \
+           "gsldlc1    "#f12", 0x7("#r0")                        \n\t" \
+           "gsldlc1    "#f4", 0x7("#r2")                         \n\t" \
+           "gsldrc1    "#f12", 0x0("#r0")                        \n\t" \
+           "gsldrc1    "#f4", 0x0("#r2")                         \n\t" \
+           "punpcklbh  "#f12", "#f12", "#f4"                     \n\t" \
+           "daddu      "#r0", "#r2", "#r1"                       \n\t" \
+           "daddu      "#r2", "#r0", "#r1"                       \n\t" \
+           "gsldlc1    "#f8", 0x7("#r0")                         \n\t" \
+           "gsldlc1    "#f4", 0x7("#r2")                         \n\t" \
+           "gsldrc1    "#f8", 0x0("#r0")                         \n\t" \
+           "gsldrc1    "#f4", 0x0("#r2")                         \n\t" \
+           "punpcklbh  "#f8", "#f8", "#f4"                       \n\t" \
+           "punpckhhw  "#f14", "#f12", "#f8"                     \n\t" \
+           "punpcklhw  "#f12", "#f12", "#f8"                     \n\t" \
+           "daddu      "#r0", "#r2", "#r1"                       \n\t" \
+           "punpcklwd  "#f0", "#f2", "#f14"                      \n\t" \
+           "punpckhwd  "#f2", "#f2", "#f14"                      \n\t"
+
+#define LOAD_COLUMN_C(f0, f2, f4, f6, r0, r1, r2) \
+           "daddu      "#r2", "#r0", "#r1"                       \n\t" \
+           "gsldlc1    "#f0", 0x7("#r0")                         \n\t" \
+           "gsldlc1    "#f2", 0x7("#r2")                         \n\t" \
+           "gsldrc1    "#f0", 0x0("#r0")                         \n\t" \
+           "gsldrc1    "#f2", 0x0("#r2")                         \n\t" \
+           "punpcklbh  "#f0", "#f0", "#f2"                       \n\t" \
+           "daddu      "#r0", "#r2", "#r1"                       \n\t" \
+           "daddu      "#r2", "#r0", "#r1"                       \n\t" \
+           "gsldlc1    "#f4", 0x7("#r0")                         \n\t" \
+           "gsldlc1    "#f2", 0x7("#r2")                         \n\t" \
+           "gsldrc1    "#f4", 0x0("#r0")                         \n\t" \
+           "gsldrc1    "#f2", 0x0("#r2")                         \n\t" \
+           "punpcklbh  "#f4", "#f4", "#f2"                       \n\t" \
+           "punpckhhw  "#f0", "#f0", "#f4"                       \n\t" \
+           "daddu      "#r0", "#r2", "#r1"                       \n\t"
+/**
+ * backup register
+ */
+#define BACKUP_REG \
+           double __back_temp[8];                                      \
+           if (_MIPS_SIM == _ABI64)                                    \
+           __asm__ volatile (                                          \
+             "gssqc1       $f25,      $f24,       0x00(%[temp])  \n\t" \
+             "gssqc1       $f27,      $f26,       0x10(%[temp])  \n\t" \
+             "gssqc1       $f29,      $f28,       0x20(%[temp])  \n\t" \
+             "gssqc1       $f31,      $f30,       0x30(%[temp])  \n\t" \
+             :                                                         \
+             : [temp]"r"(__back_temp)                                  \
+             : "memory"                                                \
+           );                                                          \
+          else                                                         \
+           __asm__ volatile (                                          \
+             "gssqc1       $f22,      $f20,       0x00(%[temp])  \n\t" \
+             "gssqc1       $f26,      $f24,       0x10(%[temp])  \n\t" \
+             "gssqc1       $f30,      $f28,       0x20(%[temp])  \n\t" \
+             :                                                         \
+             : [temp]"r"(__back_temp)                                  \
+             : "memory"                                                \
+           );
+
+/**
+ * recover register
+ */
+#define RECOVER_REG \
+           if (_MIPS_SIM == _ABI64)                                    \
+           __asm__ volatile (                                          \
+             "gslqc1       $f25,      $f24,       0x00(%[temp])  \n\t" \
+             "gslqc1       $f27,      $f26,       0x10(%[temp])  \n\t" \
+             "gslqc1       $f29,      $f28,       0x20(%[temp])  \n\t" \
+             "gslqc1       $f31,      $f30,       0x30(%[temp])  \n\t" \
+             :                                                         \
+             : [temp]"r"(__back_temp)                                  \
+             : "memory"                                                \
+           );                                                          \
+           else                                                        \
+           __asm__ volatile (                                          \
+             "gslqc1       $f22,      $f20,       0x00(%[temp])  \n\t" \
+             "gslqc1       $f26,      $f24,       0x10(%[temp])  \n\t" \
+             "gslqc1       $f30,      $f28,       0x20(%[temp])  \n\t" \
+             :                                                         \
+             : [temp]"r"(__back_temp)                                  \
+             : "memory"                                                \
+           );
+
+# define OK             1
+# define NOTOK          0
+
+#endif  /* ASMDEFS_MMI_H_ */
--- a/codec/common/inc/cpu_core.h
+++ b/codec/common/inc/cpu_core.h
@@ -84,6 +84,9 @@
 #define WELS_CPU_VFPv3      0x000002    /* VFPv3 */
 #define WELS_CPU_NEON       0x000004    /* NEON */
 
+/* For loongson */
+#define WELS_CPU_MMI        0x00000001  /* mmi */
+
 /*
  *  Interfaces for CPU core feature detection as below
  */
--- a/codec/common/inc/deblocking_common.h
+++ b/codec/common/inc/deblocking_common.h
@@ -75,6 +75,22 @@
 void DeblockChromaEq4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
 void WelsNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
 #endif
+
+#if defined(HAVE_MMI)
+void DeblockLumaLt4V_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockLumaTransposeH2V_mmi (uint8_t* pPixY, int32_t iStride, uint8_t* pDst);
+void DeblockLumaTransposeV2H_mmi (uint8_t* pPixY, int32_t iStride, uint8_t* pSrc);
+void DeblockLumaLt4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaEq4V_mmi (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4V_mmi (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                            int8_t* pTC);
+void DeblockChromaEq4H_mmi (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4H_mmi (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                            int8_t* pTC);
+void WelsNonZeroCount_mmi (int8_t* pNonZeroCount);
+#endif//HAVE_MMI
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- /dev/null
+++ b/codec/common/mips64/deblock_mmi.c
@@ -1,0 +1,2826 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2018, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file    deblock_mmi.c
+ *
+ * \brief   Loongson optimize
+ *
+ * \date    20/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+void DeblockLumaLt4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
+                         int32_t iBeta, int8_t *pTC) {
+  unsigned char tmp[512] __attribute__((aligned(32)));
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "dsll       $8, %[iStride], 0x1                       \n\t"
+    "daddu      $8, $8, %[iStride]                        \n\t"
+    "dsubu      $14, %[pPix], $8                          \n\t"
+
+    "dsll       $8, %[iStride], 0x1                       \n\t"
+    "dsubu      $9, %[pPix], $8                           \n\t"
+
+    "dmtc1      %[iAlpha], $f0                            \n\t"
+    "dsubu      $13, %[pPix], %[iStride]                  \n\t"
+    "daddu      %[iStride], %[iStride], %[pPix]           \n\t"
+    "daddu      $12, $8, %[pPix]                          \n\t"
+
+    "punpcklhw  $f0, $f0, $f0                             \n\t"
+    "lb         $8, 0x0(%[pTC])                           \n\t"
+    "punpcklwd  $f0, $f0, $f0                             \n\t"
+    "mov.d      $f2, $f0                                  \n\t"
+    "gssqc1     $f2, $f0, 432-112(%[tmp])                 \n\t"
+    "dmtc1      %[iBeta], $f0                             \n\t"
+    "lb         %[iAlpha], 0x1(%[pTC])                    \n\t"
+    "dli        %[iBeta], 0xFFFF                          \n\t"
+    "punpcklhw  $f0, $f0, $f0                             \n\t"
+    "and        $10, %[iAlpha], %[iBeta]                  \n\t"
+    "punpcklwd  $f0, $f0, $f0                             \n\t"
+    "mov.d      $f2, $f0                                  \n\t"
+    "and        %[iAlpha], %[iAlpha], %[iBeta]            \n\t"
+    "dmtc1      $10, $f4                                  \n\t"
+    "mov.d      $f8, $f4                                  \n\t"
+    "dmtc1      %[iAlpha], $f16                           \n\t"
+    "and        %[iAlpha], $8, %[iBeta]                   \n\t"
+    "dmtc1      %[iAlpha], $f20                           \n\t"
+    "mov.d      $f24, $f20                                \n\t"
+    "mov.d      $f28, $f20                                \n\t"
+    "gssqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
+    "dmtc1      %[iAlpha], $f0                            \n\t"
+
+    "lb         %[iAlpha], 0x3(%[pTC])                    \n\t"
+    "lb         %[pTC], 0x2(%[pTC])                       \n\t"
+    "dmtc1      $10, $f12                                 \n\t"
+    "punpcklhw  $f0, $f0, $f16                            \n\t"
+    "and        $8, %[iAlpha], %[iBeta]                   \n\t"
+    "punpcklhw  $f24, $f24, $f8                           \n\t"
+    "punpcklhw  $f20, $f20, $f4                           \n\t"
+    "punpcklhw  $f0, $f0, $f24                            \n\t"
+    "punpcklhw  $f28, $f28, $f12                          \n\t"
+    "punpcklhw  $f28, $f28, $f20                          \n\t"
+    "punpckhhw  $f2, $f0, $f28                            \n\t"
+    "punpcklhw  $f0, $f0, $f28                            \n\t"
+    "gssqc1     $f2, $f0, 432-400(%[tmp])                 \n\t"
+    "dmtc1      $8, $f0                                   \n\t"
+    "and        %[iAlpha], %[iAlpha], %[iBeta]            \n\t"
+    "mov.d      $f8, $f0                                  \n\t"
+    "dmtc1      %[iAlpha], $f16                           \n\t"
+    "and        %[iAlpha], %[pTC], %[iBeta]               \n\t"
+    "dmtc1      $8, $f12                                  \n\t"
+    "dmtc1      %[iAlpha], $f20                           \n\t"
+    "punpcklhw  $f20, $f20, $f0                           \n\t"
+
+    "xor        $f0, $f0, $f0                             \n\t"
+    "dmtc1      %[iAlpha], $f24                           \n\t"
+    "and        %[pTC], %[pTC], %[iBeta]                  \n\t"
+    "punpcklhw  $f24, $f24, $f8                           \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "dmtc1      %[pTC], $f4                               \n\t"
+
+    "gslqc1     $f10, $f8, 0x0($9)                        \n\t"
+    "punpckhbh  $f10, $f8, $f0                            \n\t"
+    "punpcklbh  $f8, $f8, $f0                             \n\t"
+
+    "dli        %[iAlpha], 0x4                            \n\t"
+    "seh        %[pTC], %[iAlpha]                         \n\t"
+    "punpcklhw  $f28, $f28, $f12                          \n\t"
+    "punpcklhw  $f28, $f28, $f20                          \n\t"
+    "gslqc1     $f22, $f20, 0x0(%[iStride])               \n\t"
+    "gslqc1     $f14, $f12, 0x0($13)                      \n\t"
+    "gsldxc1    $f2, 0x0($12, $0)                         \n\t"
+    "punpckhbh  $f22, $f20, $f0                           \n\t"
+    "punpcklbh  $f20, $f20, $f0                           \n\t"
+    "gssqc1     $f22, $f20, 432-240(%[tmp])               \n\t"
+    "punpckhbh  $f22, $f2, $f0                            \n\t"
+    "punpcklbh  $f20, $f2, $f0                            \n\t"
+    "gssqc1     $f22, $f20, 432-352(%[tmp])               \n\t"
+    "punpcklhw  $f4, $f4, $f16                            \n\t"
+    "gslqc1     $f18, $f16, 0x0($14)                      \n\t"
+    "punpcklhw  $f4, $f4, $f24                            \n\t"
+    "gslqc1     $f26, $f24, 0x0(%[pPix])                  \n\t"
+    "punpckhhw  $f6, $f4, $f28                            \n\t"
+    "punpcklhw  $f4, $f4, $f28                            \n\t"
+    "punpckhbh  $f26, $f24, $f0                           \n\t"
+    "punpcklbh  $f24, $f24, $f0                           \n\t"
+    "punpckhbh  $f14, $f12, $f0                           \n\t"
+    "punpcklbh  $f12, $f12, $f0                           \n\t"
+    "punpckhbh  $f18, $f16, $f0                           \n\t"
+    "punpcklbh  $f16, $f16, $f0                           \n\t"
+    "psubh      $f28, $f12, $f16                          \n\t"
+    "psubh      $f30, $f14, $f18                          \n\t"
+    "gssqc1     $f18, $f16, 432-272(%[tmp])               \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f16, $f18)
+    "gslqc1     $f18, $f16, 432-336(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
+    "pcmpgth    $f20, $f16, $f28                          \n\t"
+    "pcmpgth    $f22, $f18, $f30                          \n\t"
+    "gssqc1     $f22, $f20, 432-288(%[tmp])               \n\t"
+    "psubh      $f28, $f24, $f0                           \n\t"
+    "psubh      $f30, $f26, $f2                           \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+    "pcmpgth    $f20, $f16, $f28                          \n\t"
+    "pcmpgth    $f22, $f18, $f30                          \n\t"
+    "gssqc1     $f22, $f20, 432-256(%[tmp])               \n\t"
+    "pavgh      $f20, $f12, $f24                          \n\t"
+    "pavgh      $f22, $f14, $f26                          \n\t"
+    "gssqc1     $f22, $f20, 432-304(%[tmp])               \n\t"
+    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 432-288(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 432-256(%[tmp])                 \n\t"
+    "psubh      $f20, $f20, $f28                          \n\t"
+    "psubh      $f22, $f22, $f30                          \n\t"
+    "psubh      $f20, $f20, $f0                           \n\t"
+    "psubh      $f22, $f22, $f2                           \n\t"
+    "gssqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 432-240(%[tmp])                 \n\t"
+    "psubh      $f20, $f24, $f12                          \n\t"
+    "psubh      $f22, $f26, $f14                          \n\t"
+    "gssqc1     $f26, $f24, 432-32(%[tmp])                \n\t"
+    "psubh      $f24, $f24, $f0                           \n\t"
+    "psubh      $f26, $f26, $f2                           \n\t"
+    "gssqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
+    WELS_AbsH($f28, $f30, $f20, $f22, $f28, $f30)
+    "gslqc1     $f22, $f20, 432-112(%[tmp])               \n\t"
+    "pcmpgth    $f20, $f20, $f28                          \n\t"
+    "pcmpgth    $f22, $f22, $f30                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+    "pcmpgth    $f28, $f16, $f24                          \n\t"
+    "pcmpgth    $f30, $f18, $f26                          \n\t"
+
+    "xor        $f0, $f0, $f0                             \n\t"
+    "and        $f20, $f20, $f28                          \n\t"
+    "and        $f22, $f22, $f30                          \n\t"
+    "psubh      $f24, $f12, $f8                           \n\t"
+    "psubh      $f26, $f14, $f10                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+    "pcmpgth    $f28, $f16, $f24                          \n\t"
+    "pcmpgth    $f30, $f18, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 432-400(%[tmp])               \n\t"
+    "and        $f20, $f20, $f28                          \n\t"
+    "and        $f22, $f22, $f30                          \n\t"
+    "pcmpgth    $f28, $f24, $f0                           \n\t"
+    "pcmpgth    $f30, $f26, $f0                           \n\t"
+    "pcmpeqh    $f24, $f24, $f0                           \n\t"
+    "pcmpeqh    $f26, $f26, $f0                           \n\t"
+    "or         $f28, $f28, $f24                          \n\t"
+    "or         $f30, $f30, $f26                          \n\t"
+    "and        $f20, $f20, $f28                          \n\t"
+    "and        $f22, $f22, $f30                          \n\t"
+    "gssqc1     $f22, $f20, 432-320(%[tmp])               \n\t"
+    "dmtc1      %[pTC], $f20                              \n\t"
+    "punpckhhw  $f26, $f20, $f20                          \n\t"
+    "punpcklhw  $f24, $f20, $f20                          \n\t"
+    "punpcklwd  $f20, $f24, $f24                          \n\t"
+    "mov.d      $f22, $f20                                \n\t"
+    "gssqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
+    "gslqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
+    "psubh      $f24, $f0, $f20                           \n\t"
+    "dli        $11, 0x2                                  \n\t"
+    "psubh      $f26, $f0, $f22                           \n\t"
+    "dmtc1      $11, $f28                                 \n\t"
+    "gslqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 432-240(%[tmp])                 \n\t"
+    "psllh      $f20, $f20, $f28                          \n\t"
+    "psllh      $f22, $f22, $f28                          \n\t"
+    "psubh      $f28, $f8, $f0                            \n\t"
+    "psubh      $f30, $f10, $f2                           \n\t"
+    "paddh      $f28, $f28, $f20                          \n\t"
+    "paddh      $f30, $f30, $f22                          \n\t"
+    "gslqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
+    "paddh      $f28, $f28, $f20                          \n\t"
+    "paddh      $f30, $f30, $f22                          \n\t"
+    "dli        $11, 0x3                                  \n\t"
+    "dmtc1      $11, $f20                                 \n\t"
+    "psrah      $f28, $f28, $f20                          \n\t"
+    "psrah      $f30, $f30, $f20                          \n\t"
+    "gslqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
+    "pmaxsh     $f24, $f24, $f28                          \n\t"
+    "pmaxsh     $f26, $f26, $f30                          \n\t"
+    "gslqc1     $f2, $f0, 432-320(%[tmp])                 \n\t"
+    "pminsh     $f20, $f20, $f24                          \n\t"
+    "pminsh     $f22, $f22, $f26                          \n\t"
+
+    "and        $f20, $f20, $f0                           \n\t"
+    "and        $f22, $f22, $f2                           \n\t"
+    "gslqc1     $f26, $f24, 432-400(%[tmp])               \n\t"
+    "gssqc1     $f22, $f20, 432-64(%[tmp])                \n\t"
+    "xor        $f0, $f0, $f0                             \n\t"
+    "gssqc1     $f26, $f24, 432-384(%[tmp])               \n\t"
+    "psubh      $f20, $f0, $f24                           \n\t"
+    "psubh      $f22, $f0, $f26                           \n\t"
+    "gssqc1     $f22, $f20, 432-368(%[tmp])               \n\t"
+    "mov.d      $f24, $f20                                \n\t"
+    "mov.d      $f26, $f22                                \n\t"
+    "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
+    "paddh      $f20, $f20, $f28                          \n\t"
+    "paddh      $f22, $f22, $f30                          \n\t"
+    "paddh      $f28, $f8, $f8                            \n\t"
+    "paddh      $f30, $f10, $f10                          \n\t"
+    "psubh      $f20, $f20, $f28                          \n\t"
+    "psubh      $f22, $f22, $f30                          \n\t"
+    "dli        $11, 0x1                                  \n\t"
+    "dmtc1      $11, $f28                                 \n\t"
+    "psrah      $f20, $f20, $f28                          \n\t"
+    "psrah      $f22, $f22, $f28                          \n\t"
+    "pmaxsh     $f24, $f24, $f20                          \n\t"
+    "pmaxsh     $f26, $f26, $f22                          \n\t"
+    "gslqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
+    "pminsh     $f20, $f20, $f24                          \n\t"
+    "pminsh     $f22, $f22, $f26                          \n\t"
+
+    "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 432-288(%[tmp])               \n\t"
+    "and        $f20, $f20, $f24                          \n\t"
+    "and        $f22, $f22, $f26                          \n\t"
+    "and        $f20, $f20, $f28                          \n\t"
+    "and        $f22, $f22, $f30                          \n\t"
+    "gslqc1     $f26, $f24, 432-240(%[tmp])               \n\t"
+    "gssqc1     $f22, $f20, 432-96(%[tmp])                \n\t"
+    "gslqc1     $f22, $f20, 432-352(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
+    "paddh      $f20, $f20, $f28                          \n\t"
+    "paddh      $f22, $f22, $f30                          \n\t"
+    "paddh      $f28, $f24, $f24                          \n\t"
+    "paddh      $f30, $f26, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 432-368(%[tmp])               \n\t"
+    "dli        $11, 0x1                                  \n\t"
+    "psubh      $f20, $f20, $f28                          \n\t"
+    "dmtc1      $11, $f28                                 \n\t"
+    "psubh      $f22, $f22, $f30                          \n\t"
+
+    "psrah      $f20, $f20, $f28                          \n\t"
+    "psrah      $f22, $f22, $f28                          \n\t"
+    "gslqc1     $f30, $f28, 0x0(%[iStride])               \n\t"
+    "pmaxsh     $f24, $f24, $f20                          \n\t"
+    "pmaxsh     $f26, $f26, $f22                          \n\t"
+    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
+    "pminsh     $f20, $f20, $f24                          \n\t"
+    "pminsh     $f22, $f22, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
+    "and        $f20, $f20, $f24                          \n\t"
+    "and        $f22, $f22, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 432-256(%[tmp])               \n\t"
+    "and        $f20, $f20, $f24                          \n\t"
+    "and        $f22, $f22, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 0x0($9)                       \n\t"
+    "punpcklbh  $f28, $f30, $f0                           \n\t"
+    "punpckhbh  $f30, $f30, $f0                           \n\t"
+    "gssqc1     $f30, $f28, 432-352(%[tmp])               \n\t"
+
+    "gslqc1     $f30, $f28, 0x0($12)                      \n\t"
+    "punpcklbh  $f24, $f26, $f0                           \n\t"
+    "punpckhbh  $f26, $f26, $f0                           \n\t"
+    "gssqc1     $f22, $f20, 432-48(%[tmp])                \n\t"
+    "gslqc1     $f22, $f20, 0x0($14)                      \n\t"
+    "gssqc1     $f26, $f24, 432-368(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 0x0($13)                      \n\t"
+    "punpcklbh  $f28, $f30, $f0                           \n\t"
+    "punpckhbh  $f30, $f30, $f0                           \n\t"
+    "punpcklbh  $f20, $f22, $f0                           \n\t"
+    "punpckhbh  $f22, $f22, $f0                           \n\t"
+    "gssqc1     $f30, $f28, 432-384(%[tmp])               \n\t"
+    "punpcklbh  $f24, $f26, $f0                           \n\t"
+    "punpckhbh  $f26, $f26, $f0                           \n\t"
+    "gssqc1     $f26, $f24, 432-400(%[tmp])               \n\t"
+
+    "gslqc1     $f30, $f28, 432-400(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 0x0(%[pPix])                  \n\t"
+    "psubh      $f28, $f28, $f20                          \n\t"
+    "psubh      $f30, $f30, $f22                          \n\t"
+    "gssqc1     $f22, $f20, 432-16(%[tmp])                \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+    "punpcklbh  $f24, $f26, $f0                           \n\t"
+    "punpckhbh  $f26, $f26, $f0                           \n\t"
+    "pcmpgth    $f20, $f16, $f28                          \n\t"
+    "pcmpgth    $f22, $f18, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 432-384(%[tmp])               \n\t"
+    "gssqc1     $f22, $f20, 432-288(%[tmp])               \n\t"
+
+    "psubh      $f28, $f24, $f28                          \n\t"
+    "psubh      $f30, $f26, $f30                          \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+    "pcmpgth    $f20, $f16, $f28                          \n\t"
+    "pcmpgth    $f22, $f18, $f30                          \n\t"
+    "gssqc1     $f22, $f20, 432-256(%[tmp])               \n\t"
+
+    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
+    "gssqc1     $f26, $f24, 432-80(%[tmp])                \n\t"
+    "pavgh      $f20, $f20, $f24                          \n\t"
+    "pavgh      $f22, $f22, $f26                          \n\t"
+    "gssqc1     $f22, $f20, 432-304(%[tmp])               \n\t"
+
+    "gslqc1     $f22, $f20, 432-288(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 432-256(%[tmp])               \n\t"
+    "psubh      $f20, $f4, $f20                           \n\t"
+    "psubh      $f22, $f6, $f22                           \n\t"
+    "psubh      $f20, $f20, $f28                          \n\t"
+    "psubh      $f22, $f22, $f30                          \n\t"
+    "gssqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
+    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 432-352(%[tmp])               \n\t"
+    "psubh      $f20, $f24, $f20                          \n\t"
+    "psubh      $f22, $f26, $f22                          \n\t"
+    "psubh      $f24, $f24, $f28                          \n\t"
+    "psubh      $f26, $f26, $f30                          \n\t"
+    "gssqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
+    "mov.d      $f28, $f20                                \n\t"
+    "mov.d      $f30, $f22                                \n\t"
+    WELS_AbsH($f28, $f30, $f20, $f22, $f0, $f2)
+    "gslqc1     $f22, $f20, 432-112(%[tmp])               \n\t"
+    "pcmpgth    $f20, $f20, $f28                          \n\t"
+    "pcmpgth    $f22, $f22, $f30                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+    "pcmpgth    $f28, $f16, $f24                          \n\t"
+    "pcmpgth    $f30, $f18, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 432-368(%[tmp])               \n\t"
+
+    "and        $f20, $f20, $f28                          \n\t"
+    "and        $f22, $f22, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 432-400(%[tmp])               \n\t"
+    "psubh      $f28, $f28, $f24                          \n\t"
+    "psubh      $f30, $f30, $f26                          \n\t"
+    "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
+    "psubh      $f24, $f24, $f0                           \n\t"
+    "psubh      $f26, $f26, $f2                           \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f0, $f2)
+    "pcmpgth    $f16, $f16, $f28                          \n\t"
+    "pcmpgth    $f18, $f18, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 432-96(%[tmp])                \n\t"
+    "and        $f20, $f20, $f16                          \n\t"
+    "and        $f22, $f22, $f18                          \n\t"
+    "xor        $f0, $f0, $f0                             \n\t"
+
+    "paddh      $f8, $f8, $f28                            \n\t"
+    "paddh      $f10, $f10, $f30                          \n\t"
+    "pcmpgth    $f16, $f4, $f0                            \n\t"
+    "pcmpgth    $f18, $f6, $f0                            \n\t"
+    "pcmpeqh    $f28, $f4, $f0                            \n\t"
+    "pcmpeqh    $f30, $f6, $f0                            \n\t"
+    "or         $f16, $f16, $f28                          \n\t"
+    "or         $f18, $f18, $f30                          \n\t"
+    "and        $f20, $f20, $f16                          \n\t"
+    "and        $f22, $f22, $f18                          \n\t"
+    "gslqc1     $f18, $f16, 432-224(%[tmp])               \n\t"
+    "gssqc1     $f22, $f20, 432-320(%[tmp])               \n\t"
+    "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
+    "dli        $11, 0x2                                  \n\t"
+    "psubh      $f28, $f0, $f16                           \n\t"
+    "psubh      $f30, $f0, $f18                           \n\t"
+    "psubh      $f2, $f0, $f6                             \n\t"
+    "psubh      $f0, $f0, $f4                             \n\t"
+    "dmfc1      %[iAlpha], $f28                           \n\t"
+    "dmtc1      $11, $f28                                 \n\t"
+    "psllh      $f20, $f20, $f28                          \n\t"
+    "psllh      $f22, $f22, $f28                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "paddh      $f24, $f24, $f20                          \n\t"
+    "paddh      $f26, $f26, $f22                          \n\t"
+    "gslqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
+    "paddh      $f24, $f24, $f20                          \n\t"
+    "paddh      $f26, $f26, $f22                          \n\t"
+    "gslqc1     $f22, $f20, 432-368(%[tmp])               \n\t"
+    "dli        $11, 0x3                                  \n\t"
+    "gssqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
+    "dmfc1      %[iAlpha], $f0                            \n\t"
+    "dmtc1      $11, $f0                                  \n\t"
+    "psrah      $f24, $f24, $f0                           \n\t"
+    "psrah      $f26, $f26, $f0                           \n\t"
+    "dmtc1      %[iAlpha], $f0                            \n\t"
+    "pmaxsh     $f28, $f28, $f24                          \n\t"
+    "pmaxsh     $f30, $f30, $f26                          \n\t"
+    "pminsh     $f16, $f16, $f28                          \n\t"
+    "pminsh     $f18, $f18, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 432-320(%[tmp])               \n\t"
+    "and        $f16, $f16, $f28                          \n\t"
+    "and        $f18, $f18, $f30                          \n\t"
+    "mov.d      $f24, $f0                                 \n\t"
+    "mov.d      $f26, $f2                                 \n\t"
+    "gslqc1     $f2, $f0, 432-16(%[tmp])                  \n\t"
+    "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
+    "paddh      $f0, $f0, $f28                            \n\t"
+    "paddh      $f2, $f2, $f30                            \n\t"
+    "gssqc1     $f18, $f16, 432-272(%[tmp])               \n\t"
+    "gslqc1     $f18, $f16, 432-368(%[tmp])               \n\t"
+    "dli        $11, 0x1                                  \n\t"
+    "paddh      $f16, $f16, $f16                          \n\t"
+    "paddh      $f18, $f18, $f18                          \n\t"
+    "psubh      $f0, $f0, $f16                            \n\t"
+    "psubh      $f2, $f2, $f18                            \n\t"
+
+    "dmtc1      $11, $f28                                 \n\t"
+    "gslqc1     $f18, $f16, 432-64(%[tmp])                \n\t"
+    "psrah      $f0, $f0, $f28                            \n\t"
+    "psrah      $f2, $f2, $f28                            \n\t"
+    "pmaxsh     $f24, $f24, $f0                           \n\t"
+    "pmaxsh     $f26, $f26, $f2                           \n\t"
+    "gslqc1     $f2, $f0, 432-400(%[tmp])                 \n\t"
+    "pminsh     $f28, $f4, $f24                           \n\t"
+    "pminsh     $f30, $f6, $f26                           \n\t"
+    "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
+    "and        $f28, $f28, $f24                          \n\t"
+    "and        $f30, $f30, $f26                          \n\t"
+    "dmfc1      %[iAlpha], $f24                           \n\t"
+    "dmfc1      %[iBeta], $f26                            \n\t"
+    "gslqc1     $f26, $f24, 432-288(%[tmp])               \n\t"
+    "and        $f28, $f28, $f24                          \n\t"
+    "and        $f30, $f30, $f26                          \n\t"
+    "paddh      $f20, $f20, $f28                          \n\t"
+    "paddh      $f22, $f22, $f30                          \n\t"
+    "packushb   $f8, $f8, $f10                            \n\t"
+    "packushb   $f10, $f20, $f22                          \n\t"
+    "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
+    "paddh      $f0, $f0, $f20                            \n\t"
+    "paddh      $f2, $f2, $f22                            \n\t"
+    "paddh      $f12, $f12, $f16                          \n\t"
+    "paddh      $f14, $f14, $f18                          \n\t"
+    "packushb   $f12, $f12, $f14                          \n\t"
+    "packushb   $f14, $f0, $f2                            \n\t"
+
+    "gslqc1     $f2, $f0, 432-32(%[tmp])                  \n\t"
+    "psubh      $f0, $f0, $f16                            \n\t"
+    "psubh      $f2, $f2, $f18                            \n\t"
+    "gslqc1     $f18, $f16, 432-80(%[tmp])                \n\t"
+    "psubh      $f16, $f16, $f20                          \n\t"
+    "gslqc1     $f26, $f24, 432-48(%[tmp])                \n\t"
+    "psubh      $f18, $f18, $f22                          \n\t"
+
+    "gslqc1     $f22, $f20, 432-240(%[tmp])               \n\t"
+    "paddh      $f20, $f20, $f24                          \n\t"
+    "paddh      $f22, $f22, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 432-304(%[tmp])               \n\t"
+    "packushb   $f0, $f0, $f2                             \n\t"
+    "packushb   $f2, $f16, $f18                           \n\t"
+    "gslqc1     $f18, $f16, 432-384(%[tmp])               \n\t"
+    "paddh      $f16, $f16, $f24                          \n\t"
+    "paddh      $f18, $f18, $f26                          \n\t"
+    "gssqc1     $f2, $f0, 480-208(%[tmp])                 \n\t"
+    "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
+    "mov.d      $f28, $f0                                 \n\t"
+    "mov.d      $f30, $f2                                 \n\t"
+    "paddh      $f0, $f0, $f0                             \n\t"
+    "paddh      $f2, $f2, $f2                             \n\t"
+
+    "dmtc1      %[iAlpha], $f24                           \n\t"
+    "dmtc1      %[iBeta], $f26                            \n\t"
+
+    "psubh      $f16, $f16, $f0                           \n\t"
+    "psubh      $f18, $f18, $f2                           \n\t"
+    "dli        $11, 0x1                                  \n\t"
+    "gslqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
+    "gssqc1     $f10, $f8, 0x0($9)                        \n\t"
+    "dmtc1      $11, $f8                                  \n\t"
+    "psrah      $f16, $f16, $f8                           \n\t"
+    "psrah      $f18, $f18, $f8                           \n\t"
+    "pmaxsh     $f0, $f0, $f16                            \n\t"
+    "pmaxsh     $f2, $f2, $f18                            \n\t"
+    "pminsh     $f4, $f4, $f0                             \n\t"
+    "pminsh     $f6, $f6, $f2                             \n\t"
+    "gslqc1     $f2, $f0, 480-208(%[tmp])                 \n\t"
+
+    "gslqc1     $f10, $f8, 428-256+4(%[tmp])              \n\t"
+    "and        $f4, $f4, $f24                            \n\t"
+    "and        $f6, $f6, $f26                            \n\t"
+    "and        $f4, $f4, $f8                             \n\t"
+    "and        $f6, $f6, $f10                            \n\t"
+    "gssqc1     $f14, $f12, 0x0($13)                      \n\t"
+    "paddh      $f28, $f28, $f4                           \n\t"
+    "paddh      $f30, $f30, $f6                           \n\t"
+    "packushb   $f20, $f20, $f22                          \n\t"
+    "packushb   $f22, $f28, $f30                          \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPix])                    \n\t"
+    "gssqc1     $f22, $f20, 0x0(%[iStride])               \n\t"
+    : [pPix]"+&r"((unsigned char *)pPix)
+    : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
+      [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+      "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+/*!
+ * \brief  Gather a 16-row x 8-column block of luma pixels from a strided
+ *         picture and store it transposed into a contiguous 128-byte
+ *         buffer, so the horizontal-edge deblocking filter can process
+ *         the columns as rows.
+ *
+ * \param  pPixY    top-left byte of the source block; rows may be
+ *                  unaligned (fetched with the gsldlc1/gsldrc1
+ *                  unaligned 64-bit load pair)
+ * \param  iStride  byte distance between successive source rows
+ * \param  pDst     destination buffer, written as eight 16-byte gssqc1
+ *                  quad stores (128 bytes total); presumably must be
+ *                  16-byte aligned for the quad stores -- confirm
+ *                  against callers
+ */
+void DeblockLumaTransposeH2V_mmi(uint8_t *pPixY, int32_t iStride,
+                                 uint8_t *pDst) {
+  BACKUP_REG;
+  __asm__ volatile(
+    ".set       arch=loongson3a                           \n\t"
+    /* $8 = pPixY + 8*iStride: start of the lower 8-row half.  Rows from
+     * the upper and lower halves are loaded alternately so the two 8x8
+     * sub-blocks end up in even/odd register pairs for the transpose. */
+    "dsll       $8, %[iStride], 0x3                       \n\t"
+    "daddu      $8, $8, %[pPixY]                          \n\t"
+
+    /* Rows 0,8,1,9: each gsldlc1(addr+7)/gsldrc1(addr) pair is one
+     * unaligned 8-byte load. */
+    "daddu      $9, %[pPixY], %[iStride]                  \n\t"
+    "daddu      $10, $8, %[iStride]                       \n\t"
+    "gsldlc1    $f0, 0x7(%[pPixY])                        \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldlc1    $f4, 0x7($9)                              \n\t"
+    "gsldlc1    $f6, 0x7($10)                             \n\t"
+    "gsldrc1    $f0, 0x0(%[pPixY])                        \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "gsldrc1    $f4, 0x0($9)                              \n\t"
+    "gsldrc1    $f6, 0x0($10)                             \n\t"
+    /* Advance both half-block cursors by 2 rows and load rows 2,10,3,11. */
+    "daddu      %[pPixY], $9, %[iStride]                  \n\t"
+    "daddu      $8, $10, %[iStride]                       \n\t"
+    "daddu      $9, %[pPixY], %[iStride]                  \n\t"
+    "daddu      $10, $8, %[iStride]                       \n\t"
+    "gsldlc1    $f8, 0x7(%[pPixY])                        \n\t"
+    "gsldlc1    $f10, 0x7($8)                             \n\t"
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7($10)                            \n\t"
+    "gsldrc1    $f8, 0x0(%[pPixY])                        \n\t"
+    "gsldrc1    $f10, 0x0($8)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0($10)                            \n\t"
+
+    /* Rows 4,12,5,13. */
+    "daddu      %[pPixY], $9, %[iStride]                  \n\t"
+    "daddu      $8, $10, %[iStride]                       \n\t"
+    "daddu      $9, %[pPixY], %[iStride]                  \n\t"
+    "daddu      $10, $8, %[iStride]                       \n\t"
+    "gsldlc1    $f16, 0x7(%[pPixY])                       \n\t"
+    "gsldlc1    $f18, 0x7($8)                             \n\t"
+    "gsldlc1    $f20, 0x7($9)                             \n\t"
+    "gsldlc1    $f22, 0x7($10)                            \n\t"
+    "gsldrc1    $f16, 0x0(%[pPixY])                       \n\t"
+    "gsldrc1    $f18, 0x0($8)                             \n\t"
+    "gsldrc1    $f20, 0x0($9)                             \n\t"
+    "gsldrc1    $f22, 0x0($10)                            \n\t"
+    /* Rows 6,14,7,15. */
+    "daddu      %[pPixY], $9, %[iStride]                  \n\t"
+    "daddu      $8, $10, %[iStride]                       \n\t"
+    "daddu      $9, %[pPixY], %[iStride]                  \n\t"
+    "daddu      $10, $8, %[iStride]                       \n\t"
+    "gsldlc1    $f24, 0x7(%[pPixY])                       \n\t"
+    "gsldlc1    $f26, 0x7($8)                             \n\t"
+
+    "gsldlc1    $f28, 0x7($9)                             \n\t"
+    "gsldlc1    $f30, 0x7($10)                            \n\t"
+    "gsldrc1    $f24, 0x0(%[pPixY])                       \n\t"
+    "gsldrc1    $f26, 0x0($8)                             \n\t"
+    "gsldrc1    $f28, 0x0($9)                             \n\t"
+    "gsldrc1    $f30, 0x0($10)                            \n\t"
+
+    /* In-register byte transpose of the two 8x8 sub-blocks held in
+     * $f0..$f30; $9/$10 are scratch GPRs (macro from asmdefs_mmi.h). */
+    MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
+                     $f14, $f16, $f18, $f20, $f22, $f24,
+                     $f26, $f28, $f30, $9, $10)
+
+    /* Store the transposed block contiguously; the register ordering
+     * follows MMI_TransTwo8x8B's output layout. */
+    "gssqc1     $f18, $f16, 0x0(%[pDst])                  \n\t"
+    "gssqc1     $f10, $f8, 0x10(%[pDst])                  \n\t"
+    "gssqc1     $f14, $f12, 0x20(%[pDst])                 \n\t"
+    "gssqc1     $f30, $f28, 0x30(%[pDst])                 \n\t"
+    "gssqc1     $f22, $f20, 0x40(%[pDst])                 \n\t"
+    "gssqc1     $f6, $f4, 0x50(%[pDst])                   \n\t"
+    "gssqc1     $f26, $f24, 0x60(%[pDst])                 \n\t"
+    "gssqc1     $f2, $f0, 0x70(%[pDst])                   \n\t"
+    : [pPixY] "+&r"((unsigned char *)pPixY)
+    : [iStride] "r"((int)iStride), [pDst] "r"((unsigned char *)pDst)
+    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+      "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
+      "$f30"
+  );
+  RECOVER_REG;
+}
+
+/*!
+ * \brief  Inverse of DeblockLumaTransposeH2V_mmi: read a 128-byte
+ *         contiguous (transposed) block from pSrc, transpose it back
+ *         with MMI_TransTwo8x8B, and scatter the resulting 16 rows of
+ *         8 bytes each into the strided picture at pPixY.
+ *
+ * \param  pPixY    top-left byte of the destination block; rows are
+ *                  written with the gssdlc1/gssdrc1 unaligned 64-bit
+ *                  store pair, so no alignment is required
+ * \param  iStride  byte distance between successive destination rows
+ * \param  pSrc     source buffer of 128 bytes, read with 16-byte gslqc1
+ *                  quad loads; presumably must be 16-byte aligned --
+ *                  confirm against callers
+ */
+void DeblockLumaTransposeV2H_mmi(uint8_t *pPixY, int32_t iStride,
+                                 uint8_t *pSrc) {
+  BACKUP_REG;
+  __asm__ volatile(
+    ".set       arch=loongson3a                           \n\t"
+    /* Load the whole 128-byte block into $f0..$f30 (8 quad loads). */
+    "gslqc1     $f2, $f0, 0x0(%[pSrc])                    \n\t"
+    "gslqc1     $f6, $f4, 0x10(%[pSrc])                   \n\t"
+    "gslqc1     $f10, $f8, 0x20(%[pSrc])                  \n\t"
+    "gslqc1     $f14, $f12, 0x30(%[pSrc])                 \n\t"
+    "gslqc1     $f18, $f16, 0x40(%[pSrc])                 \n\t"
+    "gslqc1     $f22, $f20, 0x50(%[pSrc])                 \n\t"
+    "gslqc1     $f26, $f24, 0x60(%[pSrc])                 \n\t"
+    "gslqc1     $f30, $f28, 0x70(%[pSrc])                 \n\t"
+
+    /* In-register byte transpose of the two 8x8 sub-blocks; $9/$10 are
+     * scratch GPRs (macro from asmdefs_mmi.h). */
+    MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
+                     $f14, $f16, $f18, $f20, $f22, $f24,
+                     $f26, $f28, $f30, $9, $10)
+
+    /* Scatter 16 rows, two per step: each gssdlc1(addr+7)/gssdrc1(addr)
+     * pair is one unaligned 8-byte store.  The register order mirrors
+     * the output layout of MMI_TransTwo8x8B. */
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
+    "gssdlc1    $f16, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f8, 0x7($8)                              \n\t"
+    "gssdrc1    $f16, 0x0(%[pPixY])                       \n\t"
+    "gssdrc1    $f8, 0x0($8)                              \n\t"
+    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
+    "gssdlc1    $f12, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f28, 0x7($8)                             \n\t"
+    "gssdrc1    $f12, 0x0(%[pPixY])                       \n\t"
+    "gssdrc1    $f28, 0x0($8)                             \n\t"
+
+    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
+    "gssdlc1    $f20, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f4, 0x7($8)                              \n\t"
+    "gssdrc1    $f20, 0x0(%[pPixY])                       \n\t"
+    "gssdrc1    $f4, 0x0($8)                              \n\t"
+    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
+    "gssdlc1    $f24, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f0, 0x7($8)                              \n\t"
+    "gssdrc1    $f24, 0x0(%[pPixY])                       \n\t"
+    "gssdrc1    $f0, 0x0($8)                              \n\t"
+
+    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
+    "gssdlc1    $f18, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f10, 0x7($8)                             \n\t"
+    "gssdrc1    $f18, 0x0(%[pPixY])                       \n\t"
+    "gssdrc1    $f10, 0x0($8)                             \n\t"
+    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
+    "gssdlc1    $f14, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f30, 0x7($8)                             \n\t"
+    "gssdrc1    $f14, 0x0(%[pPixY])                       \n\t"
+    "gssdrc1    $f30, 0x0($8)                             \n\t"
+
+    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
+    "gssdlc1    $f22, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f6, 0x7($8)                              \n\t"
+    "gssdrc1    $f22, 0x0(%[pPixY])                       \n\t"
+    "gssdrc1    $f6, 0x0($8)                              \n\t"
+    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
+    "gssdlc1    $f26, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f2, 0x7($8)                              \n\t"
+    "gssdrc1    $f26, 0x0(%[pPixY])                       \n\t"
+    "gssdrc1    $f2, 0x0($8)                              \n\t"
+    : [pPixY] "+&r"((unsigned char *)pPixY)
+    : [iStride] "r"((int)iStride), [pSrc] "r"((unsigned char *)pSrc)
+    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+      "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
+      "$f30"
+  );
+  RECOVER_REG;
+}
+
+void DeblockLumaEq4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
+                         int32_t iBeta) {
+  unsigned char tmp[720] __attribute__((aligned(32)));
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "dsll       $11, %[iStride], 0x2                      \n\t"
+    "xor        $f8, $f8, $f8                             \n\t"
+    "daddu      $14, %[iStride], %[pPix]                  \n\t"
+    "dsubu      $8, %[pPix], $11                          \n\t"
+    "gslqc1     $f14, $f12, 0x0($8)                       \n\t"
+    "gslqc1     $f22, $f20, 0x0(%[pPix])                  \n\t"
+    "daddu      $9, %[iStride], %[iStride]                \n\t"
+    "daddu      $10, $9, %[iStride]                       \n\t"
+    "move       $12, $9                                   \n\t"
+    "dsubu      $8, %[pPix], $9                           \n\t"
+    "gslqc1     $f6, $f4, 0x0($8)                         \n\t"
+    "dsubu      $9, %[pPix], %[iStride]                   \n\t"
+    "gslqc1     $f18, $f16, 0x0($9)                       \n\t"
+    "daddu      $13, %[iStride], %[pPix]                  \n\t"
+
+    "move       %[iStride], $12                           \n\t"
+    "daddu      $15, $12, %[pPix]                         \n\t"
+
+    "daddu      $12, %[pPix], $10                         \n\t"
+    "dsubu      $11, %[pPix], $10                         \n\t"
+
+    "gslqc1     $f26, $f24, 0x0($11)                      \n\t"
+    "daddu      %[iStride], %[iStride], %[pPix]           \n\t"
+    "dmtc1      %[iAlpha], $f0                            \n\t"
+
+    "punpcklhw  $f28, $f0, $f0                            \n\t"
+    "punpcklwd  $f0, $f28, $f28                           \n\t"
+    "mov.d      $f2, $f0                                  \n\t"
+    "gssqc1     $f2, $f0, 640-320(%[tmp])                 \n\t"
+    "dmtc1      %[iBeta], $f0                             \n\t"
+    "gsldxc1    $f10, 0x0($15, $0)                        \n\t"
+    "punpcklhw  $f28, $f0, $f0                            \n\t"
+    "punpcklwd  $f0, $f28, $f28                           \n\t"
+    "punpckhbh  $f30, $f10, $f8                           \n\t"
+    "mov.d      $f2, $f0                                  \n\t"
+
+    "punpcklbh  $f28, $f10, $f8                           \n\t"
+    "gssqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
+    "gssqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
+    "mov.d      $f0, $f4                                  \n\t"
+    "gssqc1     $f22, $f20, 704-272(%[tmp])               \n\t"
+    "gssqc1     $f6, $f4, 672-272(%[tmp])                 \n\t"
+    "mov.d      $f4, $f16                                 \n\t"
+    "punpckhbh  $f22, $f20, $f8                           \n\t"
+    "punpcklbh  $f20, $f20, $f8                           \n\t"
+    "punpckhbh  $f6, $f4, $f8                             \n\t"
+    "punpcklbh  $f4, $f4, $f8                             \n\t"
+
+    "psubh      $f28, $f20, $f4                           \n\t"
+    "psubh      $f30, $f22, $f6                           \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f2, $f10)
+    "gssqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
+    "punpckhbh  $f2, $f0, $f8                             \n\t"
+    "punpcklbh  $f0, $f0, $f8                             \n\t"
+    "gssqc1     $f18, $f16, 688-272(%[tmp])               \n\t"
+    "gslqc1     $f18, $f16, 0x0($14)                      \n\t"
+    "gssqc1     $f2, $f0, 640-480(%[tmp])                 \n\t"
+
+    "psubh      $f28, $f4, $f0                            \n\t"
+    "psubh      $f30, $f6, $f2                            \n\t"
+
+    "gslqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f18, $f10)
+    "punpckhbh  $f18, $f16, $f8                           \n\t"
+    "punpcklbh  $f16, $f16, $f8                           \n\t"
+    "pcmpgth    $f0, $f0, $f28                            \n\t"
+    "pcmpgth    $f2, $f2, $f30                            \n\t"
+    "gssqc1     $f18, $f16, 640-384(%[tmp])               \n\t"
+    "psubh      $f28, $f20, $f16                          \n\t"
+    "psubh      $f30, $f22, $f18                          \n\t"
+    "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
+    "gssqc1     $f26, $f24, 656-272(%[tmp])               \n\t"
+    "punpckhbh  $f26, $f24, $f8                           \n\t"
+    "punpcklbh  $f24, $f24, $f8                           \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+    "gssqc1     $f26, $f24, 640-368(%[tmp])               \n\t"
+    "gssqc1     $f6, $f4, 640-144(%[tmp])                 \n\t"
+    "gssqc1     $f22, $f20, 640-400(%[tmp])               \n\t"
+    "pcmpgth    $f16, $f16, $f28                          \n\t"
+    "pcmpgth    $f18, $f18, $f30                          \n\t"
+    "and        $f0, $f0, $f16                            \n\t"
+    "and        $f2, $f2, $f18                            \n\t"
+    "gslqc1     $f18, $f16, 640-320(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
+    "dli        %[iAlpha], 0x2                            \n\t"
+    "dli        %[iBeta], 0x2                             \n\t"
+    "pcmpgth    $f16, $f16, $f28                          \n\t"
+    "pcmpgth    $f18, $f18, $f30                          \n\t"
+    "and        $f0, $f0, $f16                            \n\t"
+    "and        $f2, $f2, $f18                            \n\t"
+    "dmtc1      %[iAlpha], $f16                           \n\t"
+    "dmtc1      %[iBeta], $f10                            \n\t"
+    "gssqc1     $f2, $f0, 736-272(%[tmp])                 \n\t"
+    "gslqc1     $f2, $f0, 640-320(%[tmp])                 \n\t"
+
+    "punpcklhw  $f28, $f16, $f16                          \n\t"
+    "psrah      $f16, $f0, $f10                           \n\t"
+    "psrah      $f18, $f2, $f10                           \n\t"
+    "punpcklwd  $f28, $f28, $f28                          \n\t"
+    "mov.d      $f30, $f28                                \n\t"
+    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
+    "paddh      $f16, $f16, $f28                          \n\t"
+    "paddh      $f18, $f18, $f30                          \n\t"
+    "gssqc1     $f18, $f16, 640-576(%[tmp])               \n\t"
+    "pcmpgth    $f16, $f16, $f8                           \n\t"
+    "pcmpgth    $f18, $f18, $f10                          \n\t"
+    "gssqc1     $f18, $f16, 640-560(%[tmp])               \n\t"
+
+    "gssqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
+    "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
+    "psubh      $f28, $f4, $f24                           \n\t"
+    "psubh      $f30, $f6, $f26                           \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
+    "pcmpgth    $f16, $f16, $f28                          \n\t"
+    "pcmpgth    $f18, $f18, $f30                          \n\t"
+
+    "gslqc1     $f2, $f0, 640-416(%[tmp])                 \n\t"
+    "and        $f16, $f16, $f8                           \n\t"
+    "and        $f18, $f18, $f10                          \n\t"
+    "gssqc1     $f18, $f16, 640-544(%[tmp])               \n\t"
+    "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
+    "psubh      $f28, $f20, $f0                           \n\t"
+    "psubh      $f30, $f22, $f2                           \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
+    "pcmpgth    $f16, $f16, $f28                          \n\t"
+    "pcmpgth    $f18, $f18, $f30                          \n\t"
+
+    "and        $f16, $f16, $f8                           \n\t"
+    "and        $f18, $f18, $f10                          \n\t"
+    "gssqc1     $f18, $f16, 640-560(%[tmp])               \n\t"
+
+    "gslqc1     $f18, $f16, 640-544(%[tmp])               \n\t"
+    "xor        $f8, $f8, $f8                             \n\t"
+    "pandn      $f16, $f16, $f24                          \n\t"
+    "dli        %[iAlpha], 0x4                            \n\t"
+    "pandn      $f18, $f18, $f26                          \n\t"
+    "gssqc1     $f18, $f16, 640-16(%[tmp])                \n\t"
+    "dmtc1      %[iAlpha], $f16                           \n\t"
+    "punpcklhw  $f28, $f16, $f16                          \n\t"
+    "dli        %[iAlpha], 0x1                            \n\t"
+    "punpckhbh  $f18, $f12, $f8                           \n\t"
+    "dmtc1      %[iAlpha], $f30                           \n\t"
+    "punpcklbh  $f16, $f12, $f8                           \n\t"
+    "psllh      $f16, $f16, $f30                          \n\t"
+    "psllh      $f18, $f18, $f30                          \n\t"
+    "paddh      $f16, $f16, $f24                          \n\t"
+    "paddh      $f18, $f18, $f26                          \n\t"
+    "gslqc1     $f2, $f0, 640-480(%[tmp])                 \n\t"
+    "paddh      $f16, $f16, $f24                          \n\t"
+    "paddh      $f18, $f18, $f26                          \n\t"
+    "paddh      $f16, $f16, $f24                          \n\t"
+    "paddh      $f18, $f18, $f26                          \n\t"
+    "paddh      $f16, $f16, $f0                           \n\t"
+    "paddh      $f18, $f18, $f2                           \n\t"
+
+    "gslqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
+    "punpcklwd  $f28, $f28, $f28                          \n\t"
+    "mov.d      $f30, $f28                                \n\t"
+    "paddh      $f16, $f16, $f4                           \n\t"
+    "paddh      $f18, $f18, $f6                           \n\t"
+    "gssqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
+    "paddh      $f16, $f16, $f20                          \n\t"
+    "paddh      $f18, $f18, $f22                          \n\t"
+    "paddh      $f16, $f16, $f28                          \n\t"
+    "paddh      $f18, $f18, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 640-384(%[tmp])                 \n\t"
+    "pandn      $f24, $f24, $f28                          \n\t"
+    "pandn      $f26, $f26, $f30                          \n\t"
+    "gssqc1     $f26, $f24, 640-80(%[tmp])                \n\t"
+    "gslqc1     $f26, $f24, 0x0($12)                      \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "punpckhbh  $f26, $f24, $f8                           \n\t"
+    "punpcklbh  $f24, $f24, $f8                           \n\t"
+    "psllh      $f24, $f24, $f10                          \n\t"
+    "psllh      $f26, $f26, $f10                          \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "paddh      $f24, $f24, $f0                           \n\t"
+    "paddh      $f26, $f26, $f2                           \n\t"
+
+    "dli        %[iAlpha], 0x3                            \n\t"
+    "gslqc1     $f30, $f28, 640-480(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 640-592(%[tmp])                 \n\t"
+    "paddh      $f24, $f24, $f20                          \n\t"
+    "paddh      $f26, $f26, $f22                          \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "paddh      $f24, $f24, $f0                           \n\t"
+    "paddh      $f26, $f26, $f2                           \n\t"
+    "gslqc1     $f2, $f0, 640-560(%[tmp])                 \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "psrah      $f24, $f24, $f10                          \n\t"
+    "psrah      $f26, $f26, $f10                          \n\t"
+    "and        $f24, $f24, $f0                           \n\t"
+    "and        $f26, $f26, $f2                           \n\t"
+    "gssqc1     $f26, $f24, 640-112(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
+    "pandn      $f24, $f24, $f28                          \n\t"
+    "pandn      $f26, $f26, $f30                          \n\t"
+    "gssqc1     $f26, $f24, 640-336(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
+    "gssqc1     $f26, $f24, 640-528(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 640-368(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 640-544(%[tmp])                 \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "psrah      $f16, $f16, $f10                          \n\t"
+    "psrah      $f18, $f18, $f10                          \n\t"
+    "and        $f16, $f16, $f0                           \n\t"
+    "and        $f18, $f18, $f2                           \n\t"
+    "gslqc1     $f2, $f0, 640-624(%[tmp])                 \n\t"
+    "paddh      $f28, $f4, $f20                           \n\t"
+    "paddh      $f30, $f6, $f22                           \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "paddh      $f24, $f24, $f0                           \n\t"
+    "paddh      $f26, $f26, $f2                           \n\t"
+    "gslqc1     $f30, $f28, 640-528(%[tmp])               \n\t"
+    "dli        %[iAlpha], 0x2                            \n\t"
+
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "paddh      $f20, $f20, $f4                           \n\t"
+    "paddh      $f22, $f22, $f6                           \n\t"
+    "psrah      $f24, $f24, $f10                          \n\t"
+    "psrah      $f26, $f26, $f10                          \n\t"
+    "and        $f28, $f28, $f24                          \n\t"
+    "and        $f30, $f30, $f26                          \n\t"
+
+    "gslqc1     $f26, $f24, 640-384(%[tmp])               \n\t"
+    "gssqc1     $f30, $f28, 640-64(%[tmp])                \n\t"
+    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
+    "pandn      $f28, $f28, $f24                          \n\t"
+    "pandn      $f30, $f30, $f26                          \n\t"
+    "gssqc1     $f30, $f28, 640-304(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
+    "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
+    "paddh      $f28, $f28, $f24                          \n\t"
+    "paddh      $f30, $f30, $f26                          \n\t"
+    "paddh      $f28, $f28, $f20                          \n\t"
+    "paddh      $f30, $f30, $f22                          \n\t"
+    "paddh      $f28, $f28, $f8                           \n\t"
+    "paddh      $f30, $f30, $f10                          \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "gslqc1     $f22, $f20, 640-560(%[tmp])               \n\t"
+    "psrah      $f28, $f28, $f10                          \n\t"
+    "psrah      $f30, $f30, $f10                          \n\t"
+    "and        $f20, $f20, $f28                          \n\t"
+    "and        $f22, $f22, $f30                          \n\t"
+    "gssqc1     $f22, $f20, 640-32(%[tmp])                \n\t"
+
+    "gslqc1     $f22, $f20, 640-480(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 640-592(%[tmp])                 \n\t"
+    "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
+    "paddh      $f28, $f20, $f20                          \n\t"
+    "paddh      $f30, $f22, $f22                          \n\t"
+    "paddh      $f20, $f4, $f24                           \n\t"
+    "paddh      $f22, $f6, $f26                           \n\t"
+    "paddh      $f24, $f24, $f0                           \n\t"
+    "paddh      $f26, $f26, $f2                           \n\t"
+    "paddh      $f28, $f28, $f20                          \n\t"
+    "paddh      $f30, $f30, $f22                          \n\t"
+    "paddh      $f28, $f28, $f8                           \n\t"
+    "paddh      $f30, $f30, $f10                          \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "gslqc1     $f22, $f20, 640-544(%[tmp])               \n\t"
+    "psrah      $f28, $f28, $f10                          \n\t"
+    "psrah      $f30, $f30, $f10                          \n\t"
+    "dli        %[iAlpha], 0x1                            \n\t"
+    "pandn      $f20, $f20, $f28                          \n\t"
+    "pandn      $f22, $f22, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 640-480(%[tmp])               \n\t"
+    "paddh      $f28, $f28, $f4                           \n\t"
+    "paddh      $f30, $f30, $f6                           \n\t"
+    "gslqc1     $f6, $f4, 640-400(%[tmp])                 \n\t"
+    "paddh      $f28, $f28, $f4                           \n\t"
+    "paddh      $f30, $f30, $f6                           \n\t"
+    "gslqc1     $f6, $f4, 640-544(%[tmp])                 \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "gssqc1     $f22, $f20, 640-352(%[tmp])               \n\t"
+    "gslqc1     $f22, $f20, 640-368(%[tmp])               \n\t"
+    "psllh      $f28, $f28, $f10                          \n\t"
+    "psllh      $f30, $f30, $f10                          \n\t"
+    "dli        %[iAlpha], 0x3                            \n\t"
+    "paddh      $f28, $f28, $f24                          \n\t"
+    "paddh      $f30, $f30, $f26                          \n\t"
+    "paddh      $f20, $f20, $f28                          \n\t"
+    "paddh      $f22, $f22, $f30                          \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+
+    "dli        %[iAlpha], 0x2                            \n\t"
+    "gslqc1     $f30, $f28, 640-400(%[tmp])               \n\t"
+    "psrah      $f20, $f20, $f10                          \n\t"
+    "psrah      $f22, $f22, $f10                          \n\t"
+    "and        $f4, $f4, $f20                            \n\t"
+    "and        $f6, $f6, $f22                            \n\t"
+    "gslqc1     $f22, $f20, 640-480(%[tmp])               \n\t"
+    "gssqc1     $f6, $f4, 640-96(%[tmp])                  \n\t"
+    "gslqc1     $f6, $f4, 640-384(%[tmp])                 \n\t"
+    "gslqc1     $f10, $f8, 640-400(%[tmp])                \n\t"
+    "paddh      $f24, $f4, $f4                            \n\t"
+    "paddh      $f26, $f6, $f6                            \n\t"
+    "paddh      $f4, $f4, $f8                             \n\t"
+    "paddh      $f6, $f6, $f10                            \n\t"
+    "gslqc1     $f10, $f8, 640-144(%[tmp])                \n\t"
+    "paddh      $f28, $f28, $f20                          \n\t"
+    "paddh      $f30, $f30, $f22                          \n\t"
+    "paddh      $f4, $f4, $f8                             \n\t"
+    "paddh      $f6, $f6, $f10                            \n\t"
+    "gslqc1     $f10, $f8, 640-592(%[tmp])                \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "paddh      $f20, $f20, $f8                           \n\t"
+    "paddh      $f22, $f22, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
+    "paddh      $f24, $f24, $f8                           \n\t"
+    "dmtc1      %[iAlpha], $f8                            \n\t"
+    "paddh      $f26, $f26, $f10                          \n\t"
+    "dli        %[iAlpha], 0x1                            \n\t"
+    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "psrah      $f24, $f24, $f8                           \n\t"
+    "psrah      $f26, $f26, $f8                           \n\t"
+    "psllh      $f4, $f4, $f10                            \n\t"
+    "psllh      $f6, $f6, $f10                            \n\t"
+    "paddh      $f4, $f4, $f20                            \n\t"
+    "paddh      $f6, $f6, $f22                            \n\t"
+    "dli        %[iAlpha], 0x3                            \n\t"
+
+    "gslqc1     $f22, $f20, 656-272(%[tmp])               \n\t"
+    "pandn      $f28, $f28, $f24                          \n\t"
+    "pandn      $f30, $f30, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 640-416(%[tmp])               \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "gslqc1     $f6, $f4, 640-560(%[tmp])                 \n\t"
+    "psrah      $f24, $f24, $f10                          \n\t"
+    "psrah      $f26, $f26, $f10                          \n\t"
+    "and        $f4, $f4, $f24                            \n\t"
+    "and        $f6, $f6, $f26                            \n\t"
+
+    "xor        $f8, $f8, $f8                             \n\t"
+    "gslqc1     $f26, $f24, 704-272(%[tmp])               \n\t"
+    "gssqc1     $f6, $f4, 640-128(%[tmp])                 \n\t"
+    "gslqc1     $f6, $f4, 672-272(%[tmp])                 \n\t"
+    "punpcklbh  $f4, $f6, $f8                             \n\t"
+    "punpckhbh  $f6, $f6, $f8                             \n\t"
+    "gssqc1     $f6, $f4, 640-448(%[tmp])                 \n\t"
+    "gslqc1     $f6, $f4, 688-272(%[tmp])                 \n\t"
+    "punpcklbh  $f4, $f6, $f8                             \n\t"
+    "punpckhbh  $f6, $f6, $f8                             \n\t"
+    "punpcklbh  $f24, $f26, $f8                           \n\t"
+    "punpckhbh  $f26, $f26, $f8                           \n\t"
+    "gssqc1     $f30, $f28, 640-288(%[tmp])               \n\t"
+    "punpcklbh  $f20, $f22, $f8                           \n\t"
+    "punpckhbh  $f22, $f22, $f8                           \n\t"
+    "gslqc1     $f30, $f28, 0x0($14)                      \n\t"
+    "gssqc1     $f6, $f4, 640-496(%[tmp])                 \n\t"
+    "gssqc1     $f26, $f24, 640-432(%[tmp])               \n\t"
+
+    "gsldxc1    $f0, 0x8($15, $0)                         \n\t"
+    "punpcklbh  $f28, $f30, $f8                           \n\t"
+    "punpckhbh  $f30, $f30, $f8                           \n\t"
+    "gssqc1     $f30, $f28, 640-464(%[tmp])               \n\t"
+
+    "punpcklbh  $f28, $f0, $f8                            \n\t"
+    "punpckhbh  $f30, $f0, $f8                            \n\t"
+    "gslqc1     $f10, $f8, 640-464(%[tmp])                \n\t"
+    "gssqc1     $f30, $f28, 640-528(%[tmp])               \n\t"
+
+    "psubh      $f28, $f24, $f4                           \n\t"
+    "psubh      $f30, $f26, $f6                           \n\t"
+    "psubh      $f24, $f24, $f8                           \n\t"
+    "psubh      $f26, $f26, $f10                          \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+    "gslqc1     $f10, $f8, 640-16(%[tmp])                 \n\t"
+    "gssqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
+    "or         $f16, $f16, $f8                           \n\t"
+    "or         $f18, $f18, $f10                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+    "gslqc1     $f30, $f28, 640-448(%[tmp])               \n\t"
+    "psubh      $f28, $f4, $f28                           \n\t"
+    "psubh      $f30, $f6, $f30                           \n\t"
+
+    "gslqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+    "pcmpgth    $f4, $f0, $f28                            \n\t"
+    "pcmpgth    $f6, $f2, $f30                            \n\t"
+    "pcmpgth    $f28, $f0, $f24                           \n\t"
+    "pcmpgth    $f30, $f2, $f26                           \n\t"
+    "gslqc1     $f26, $f24, 640-320(%[tmp])               \n\t"
+    "and        $f4, $f4, $f28                            \n\t"
+    "and        $f6, $f6, $f30                            \n\t"
+    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
+    "pcmpgth    $f24, $f24, $f28                          \n\t"
+    "pcmpgth    $f26, $f26, $f30                          \n\t"
+    "and        $f4, $f4, $f24                            \n\t"
+    "and        $f6, $f6, $f26                            \n\t"
+
+    "gslqc1     $f26, $f24, 640-576(%[tmp])               \n\t"
+    "pcmpgth    $f24, $f24, $f28                          \n\t"
+    "pcmpgth    $f26, $f26, $f30                          \n\t"
+    "xor        $f8, $f8, $f8                             \n\t"
+    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
+    "punpcklbh  $f12, $f14, $f8                           \n\t"
+    "punpckhbh  $f14, $f14, $f8                           \n\t"
+    "gssqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 640-512(%[tmp])               \n\t"
+    "psubh      $f28, $f28, $f20                          \n\t"
+    "psubh      $f30, $f30, $f22                          \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+    "pcmpgth    $f24, $f24, $f28                          \n\t"
+    "pcmpgth    $f26, $f26, $f30                          \n\t"
+
+    "dli        %[iAlpha], 0x1                            \n\t"
+    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
+    "and        $f24, $f24, $f8                           \n\t"
+    "and        $f26, $f26, $f10                          \n\t"
+    "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
+    "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
+    "psubh      $f28, $f28, $f8                           \n\t"
+    "psubh      $f30, $f30, $f10                          \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+
+    "psllh      $f12, $f12, $f10                          \n\t"
+    "psllh      $f14, $f14, $f10                          \n\t"
+    "gssqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 640-512(%[tmp])               \n\t"
+
+    "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
+    "paddh      $f12, $f12, $f20                          \n\t"
+    "paddh      $f14, $f14, $f22                          \n\t"
+    "paddh      $f12, $f12, $f20                          \n\t"
+    "paddh      $f14, $f14, $f22                          \n\t"
+    "paddh      $f12, $f12, $f20                          \n\t"
+    "paddh      $f14, $f14, $f22                          \n\t"
+    "paddh      $f12, $f12, $f8                           \n\t"
+    "paddh      $f14, $f14, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
+    "gslqc1     $f2, $f0, 640-560(%[tmp])                 \n\t"
+    "paddh      $f12, $f12, $f8                           \n\t"
+    "paddh      $f14, $f14, $f10                          \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+    "pcmpgth    $f24, $f24, $f28                          \n\t"
+    "pcmpgth    $f26, $f26, $f30                          \n\t"
+    "and        $f24, $f24, $f0                           \n\t"
+    "and        $f26, $f26, $f2                           \n\t"
+    "gssqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
+    "gslqc1     $f10, $f8, 640-544(%[tmp])                \n\t"
+
+    "gslqc1     $f2, $f0, 736-272(%[tmp])                 \n\t"
+    "dli        %[iAlpha], 0x3                            \n\t"
+    "gslqc1     $f30, $f28, 640-368(%[tmp])               \n\t"
+    "and        $f24, $f0, $f16                           \n\t"
+    "and        $f26, $f2, $f18                           \n\t"
+    "pandn      $f16, $f0, $f28                           \n\t"
+    "pandn      $f18, $f2, $f30                           \n\t"
+    "or         $f24, $f24, $f16                          \n\t"
+    "or         $f26, $f26, $f18                          \n\t"
+    "gslqc1     $f18, $f16, 640-432(%[tmp])               \n\t"
+    "paddh      $f12, $f12, $f16                          \n\t"
+    "paddh      $f14, $f14, $f18                          \n\t"
+    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
+    "paddh      $f12, $f12, $f28                          \n\t"
+    "paddh      $f14, $f14, $f30                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "psrah      $f12, $f12, $f28                          \n\t"
+    "psrah      $f14, $f14, $f28                          \n\t"
+    "and        $f12, $f12, $f8                           \n\t"
+    "and        $f14, $f14, $f10                          \n\t"
+    "pandn      $f8, $f8, $f20                            \n\t"
+    "pandn      $f10, $f10, $f22                          \n\t"
+    "or         $f12, $f12, $f8                           \n\t"
+    "or         $f14, $f14, $f10                          \n\t"
+    "and        $f28, $f4, $f12                           \n\t"
+    "and        $f30, $f6, $f14                           \n\t"
+    "gslqc1     $f14, $f12, 640-64(%[tmp])                \n\t"
+    "gslqc1     $f10, $f8, 640-336(%[tmp])                \n\t"
+    "or         $f12, $f12, $f8                           \n\t"
+    "or         $f14, $f14, $f10                          \n\t"
+    "pandn      $f8, $f4, $f20                            \n\t"
+    "pandn      $f10, $f6, $f22                           \n\t"
+    "or         $f28, $f28, $f8                           \n\t"
+    "or         $f30, $f30, $f10                          \n\t"
+
+    "dli        %[iAlpha], 0x2                            \n\t"
+    "and        $f8, $f0, $f12                            \n\t"
+    "and        $f10, $f2, $f14                           \n\t"
+    "gslqc1     $f14, $f12, 640-480(%[tmp])               \n\t"
+    "pandn      $f12, $f0, $f12                           \n\t"
+    "pandn      $f14, $f2, $f14                           \n\t"
+    "or         $f8, $f8, $f12                            \n\t"
+    "or         $f10, $f10, $f14                          \n\t"
+    "packushb   $f24, $f24, $f26                          \n\t"
+    "packushb   $f26, $f28, $f30                          \n\t"
+    "gssqc1     $f10, $f8, 640-336(%[tmp])                \n\t"
+    "gssqc1     $f26, $f24, 656-272(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
+    "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
+    "paddh      $f8, $f20, $f8                            \n\t"
+    "paddh      $f10, $f22, $f10                          \n\t"
+    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
+    "paddh      $f28, $f28, $f16                          \n\t"
+    "paddh      $f30, $f30, $f18                          \n\t"
+    "paddh      $f8, $f8, $f28                            \n\t"
+    "paddh      $f10, $f10, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
+    "paddh      $f8, $f8, $f28                            \n\t"
+    "paddh      $f10, $f10, $f30                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "psrah      $f8, $f8, $f28                            \n\t"
+    "psrah      $f10, $f10, $f28                          \n\t"
+    "dli        %[iAlpha], 0x1                            \n\t"
+    "gslqc1     $f30, $f28, 640-544(%[tmp])               \n\t"
+    "and        $f24, $f24, $f8                           \n\t"
+    "and        $f26, $f26, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
+    "pandn      $f28, $f28, $f8                           \n\t"
+    "pandn      $f30, $f30, $f10                          \n\t"
+    "or         $f24, $f24, $f28                          \n\t"
+    "or         $f26, $f26, $f30                          \n\t"
+    "and        $f12, $f4, $f24                           \n\t"
+    "and        $f14, $f6, $f26                           \n\t"
+    "pandn      $f24, $f4, $f8                            \n\t"
+    "pandn      $f26, $f6, $f10                           \n\t"
+    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
+    "paddh      $f8, $f8, $f28                            \n\t"
+    "paddh      $f10, $f10, $f30                          \n\t"
+    "paddh      $f8, $f8, $f16                            \n\t"
+    "paddh      $f10, $f10, $f18                          \n\t"
+    "or         $f12, $f12, $f24                          \n\t"
+    "or         $f14, $f14, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 640-336(%[tmp])               \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "packushb   $f24, $f24, $f26                          \n\t"
+    "packushb   $f26, $f12, $f14                          \n\t"
+    "psllh      $f8, $f8, $f28                            \n\t"
+    "psllh      $f10, $f10, $f28                          \n\t"
+    "gssqc1     $f26, $f24, 672-272(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 640-96(%[tmp])                \n\t"
+    "gslqc1     $f30, $f28, 640-352(%[tmp])               \n\t"
+    "or         $f24, $f24, $f28                          \n\t"
+    "or         $f26, $f26, $f30                          \n\t"
+    "dli        %[iAlpha], 0x3                            \n\t"
+
+    "and        $f12, $f0, $f24                           \n\t"
+    "and        $f14, $f2, $f26                           \n\t"
+    "gslqc1     $f26, $f24, 640-144(%[tmp])               \n\t"
+    "pandn      $f24, $f0, $f24                           \n\t"
+    "pandn      $f26, $f2, $f26                           \n\t"
+    "or         $f12, $f12, $f24                          \n\t"
+    "or         $f14, $f14, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
+    "gssqc1     $f14, $f12, 640-352(%[tmp])               \n\t"
+    "gslqc1     $f14, $f12, 640-464(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
+    "paddh      $f12, $f12, $f28                          \n\t"
+    "paddh      $f14, $f14, $f30                          \n\t"
+    "paddh      $f8, $f8, $f12                            \n\t"
+    "paddh      $f10, $f10, $f14                          \n\t"
+    "gslqc1     $f14, $f12, 640-448(%[tmp])               \n\t"
+    "paddh      $f20, $f20, $f8                           \n\t"
+    "paddh      $f22, $f22, $f10                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
+    "psrah      $f20, $f20, $f28                          \n\t"
+    "psrah      $f22, $f22, $f28                          \n\t"
+    "and        $f24, $f24, $f20                          \n\t"
+    "and        $f26, $f26, $f22                          \n\t"
+    "gslqc1     $f22, $f20, 640-464(%[tmp])               \n\t"
+    "paddh      $f8, $f8, $f20                            \n\t"
+    "paddh      $f10, $f10, $f22                          \n\t"
+    "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
+    "dli        %[iAlpha], 0x2                            \n\t"
+    "paddh      $f20, $f20, $f28                          \n\t"
+    "paddh      $f22, $f22, $f30                          \n\t"
+    "paddh      $f16, $f12, $f12                          \n\t"
+    "paddh      $f18, $f14, $f14                          \n\t"
+    "paddh      $f16, $f16, $f8                           \n\t"
+    "paddh      $f18, $f18, $f10                          \n\t"
+    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
+    "paddh      $f16, $f16, $f28                          \n\t"
+    "paddh      $f18, $f18, $f30                          \n\t"
+    "gslqc1     $f10, $f8, 640-544(%[tmp])                \n\t"
+    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
+    "paddh      $f12, $f12, $f28                          \n\t"
+    "paddh      $f14, $f14, $f30                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "psrah      $f16, $f16, $f28                          \n\t"
+    "psrah      $f18, $f18, $f28                          \n\t"
+    "pandn      $f8, $f8, $f16                            \n\t"
+    "pandn      $f10, $f10, $f18                          \n\t"
+    "or         $f24, $f24, $f8                           \n\t"
+    "or         $f26, $f26, $f10                          \n\t"
+    "and        $f28, $f4, $f24                           \n\t"
+    "and        $f30, $f6, $f26                           \n\t"
+    "gslqc1     $f26, $f24, 640-496(%[tmp])               \n\t"
+    "pandn      $f8, $f4, $f24                            \n\t"
+    "pandn      $f10, $f6, $f26                           \n\t"
+    "or         $f28, $f28, $f8                           \n\t"
+    "or         $f30, $f30, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-352(%[tmp])                \n\t"
+    "packushb   $f8, $f8, $f10                            \n\t"
+    "packushb   $f10, $f28, $f30                          \n\t"
+    "gssqc1     $f10, $f8, 688-272(%[tmp])                \n\t"
+    "gslqc1     $f10, $f8, 640-128(%[tmp])                \n\t"
+    "gslqc1     $f30, $f28, 640-288(%[tmp])               \n\t"
+    "or         $f8, $f8, $f28                            \n\t"
+    "or         $f10, $f10, $f30                          \n\t"
+    "dli        %[iAlpha], 0x1                            \n\t"
+
+    "and        $f16, $f0, $f8                            \n\t"
+    "and        $f18, $f2, $f10                           \n\t"
+    "paddh      $f20, $f20, $f24                          \n\t"
+    "paddh      $f22, $f22, $f26                          \n\t"
+    "gslqc1     $f30, $f28, 640-400(%[tmp])               \n\t"
+    "pandn      $f8, $f0, $f28                            \n\t"
+    "pandn      $f10, $f2, $f30                           \n\t"
+    "or         $f16, $f16, $f8                           \n\t"
+    "or         $f18, $f18, $f10                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
+    "dli        %[iAlpha], 0x3                            \n\t"
+    "psllh      $f20, $f20, $f28                          \n\t"
+    "psllh      $f22, $f22, $f28                          \n\t"
+    "paddh      $f20, $f20, $f12                          \n\t"
+    "paddh      $f22, $f22, $f14                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "gslqc1     $f14, $f12, 640-560(%[tmp])               \n\t"
+    "paddh      $f8, $f8, $f20                            \n\t"
+    "paddh      $f10, $f10, $f22                          \n\t"
+    "psrah      $f8, $f8, $f28                            \n\t"
+    "psrah      $f10, $f10, $f28                          \n\t"
+    "gssqc1     $f18, $f16, 640-288(%[tmp])               \n\t"
+    "gslqc1     $f18, $f16, 640-560(%[tmp])               \n\t"
+    "and        $f16, $f16, $f8                           \n\t"
+    "and        $f18, $f18, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-464(%[tmp])                \n\t"
+    "paddh      $f20, $f8, $f8                            \n\t"
+    "paddh      $f22, $f10, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-432(%[tmp])                \n\t"
+    "gslqc1     $f30, $f28, 640-448(%[tmp])               \n\t"
+    "paddh      $f8, $f8, $f28                            \n\t"
+    "paddh      $f10, $f10, $f30                          \n\t"
+    "dli        %[iAlpha], 0x2                            \n\t"
+    "paddh      $f20, $f20, $f8                           \n\t"
+    "paddh      $f22, $f22, $f10                          \n\t"
+    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
+    "paddh      $f20, $f20, $f28                          \n\t"
+    "paddh      $f22, $f22, $f30                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "gslqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
+    "psrah      $f20, $f20, $f28                          \n\t"
+    "psrah      $f22, $f22, $f28                          \n\t"
+    "pandn      $f12, $f12, $f20                          \n\t"
+    "pandn      $f14, $f14, $f22                          \n\t"
+    "or         $f16, $f16, $f12                          \n\t"
+    "or         $f18, $f18, $f14                          \n\t"
+    "gslqc1     $f14, $f12, 640-32(%[tmp])                \n\t"
+    "gslqc1     $f30, $f28, 640-304(%[tmp])               \n\t"
+    "or         $f12, $f12, $f28                          \n\t"
+    "or         $f14, $f14, $f30                          \n\t"
+    "and        $f28, $f4, $f16                           \n\t"
+    "and        $f30, $f6, $f18                           \n\t"
+    "gslqc1     $f18, $f16, 640-432(%[tmp])               \n\t"
+    "gslqc1     $f22, $f20, 640-464(%[tmp])               \n\t"
+    "pandn      $f8, $f4, $f16                            \n\t"
+    "pandn      $f10, $f6, $f18                           \n\t"
+    "or         $f28, $f28, $f8                           \n\t"
+    "or         $f30, $f30, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
+    "paddh      $f16, $f16, $f8                           \n\t"
+    "paddh      $f18, $f18, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-288(%[tmp])                \n\t"
+    "packushb   $f8, $f8, $f10                            \n\t"
+    "packushb   $f10, $f28, $f30                          \n\t"
+    "dli        %[iAlpha], 0x2                            \n\t"
+    "gssqc1     $f10, $f8, 704-272(%[tmp])                \n\t"
+
+    "and        $f8, $f0, $f12                            \n\t"
+    "and        $f10, $f2, $f14                           \n\t"
+    "gslqc1     $f30, $f28, 640-384(%[tmp])               \n\t"
+    "pandn      $f12, $f0, $f28                           \n\t"
+    "pandn      $f14, $f2, $f30                           \n\t"
+    "or         $f8, $f8, $f12                            \n\t"
+    "or         $f10, $f10, $f14                          \n\t"
+    "gssqc1     $f10, $f8, 640-304(%[tmp])                \n\t"
+    "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
+    "gslqc1     $f30, $f28, 640-464(%[tmp])               \n\t"
+    "paddh      $f12, $f8, $f28                           \n\t"
+    "paddh      $f14, $f10, $f30                          \n\t"
+    "paddh      $f12, $f12, $f16                          \n\t"
+    "paddh      $f14, $f14, $f18                          \n\t"
+    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
+    "paddh      $f12, $f12, $f28                          \n\t"
+    "paddh      $f14, $f14, $f30                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "psrah      $f12, $f12, $f28                          \n\t"
+    "psrah      $f14, $f14, $f28                          \n\t"
+    "and        $f24, $f24, $f12                          \n\t"
+    "and        $f26, $f26, $f14                          \n\t"
+    "gslqc1     $f14, $f12, 640-560(%[tmp])               \n\t"
+    "pandn      $f16, $f12, $f20                          \n\t"
+    "pandn      $f18, $f14, $f22                          \n\t"
+    "or         $f24, $f24, $f16                          \n\t"
+    "or         $f26, $f26, $f18                          \n\t"
+    "and        $f28, $f4, $f24                           \n\t"
+    "and        $f30, $f6, $f26                           \n\t"
+    "gslqc1     $f26, $f24, 640-304(%[tmp])               \n\t"
+    "pandn      $f16, $f4, $f20                           \n\t"
+    "pandn      $f18, $f6, $f22                           \n\t"
+    "or         $f28, $f28, $f16                          \n\t"
+    "or         $f30, $f30, $f18                          \n\t"
+    "dli        %[iAlpha], 0x1                            \n\t"
+
+    "packushb   $f24, $f24, $f26                          \n\t"
+    "packushb   $f26, $f28, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 640-112(%[tmp])               \n\t"
+    "gslqc1     $f18, $f16, 640-80(%[tmp])                \n\t"
+    "or         $f28, $f28, $f16                          \n\t"
+    "or         $f30, $f30, $f18                          \n\t"
+    "and        $f16, $f0, $f28                           \n\t"
+    "and        $f18, $f2, $f30                           \n\t"
+    "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
+    "pandn      $f0, $f0, $f28                            \n\t"
+    "pandn      $f2, $f2, $f30                            \n\t"
+    "or         $f16, $f16, $f0                           \n\t"
+    "or         $f18, $f18, $f2                           \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "xor        $f30, $f30, $f30                          \n\t"
+    "gslqc1     $f2, $f0, 0x0($12)                        \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "punpcklbh  $f0, $f2, $f30                            \n\t"
+    "punpckhbh  $f2, $f2, $f30                            \n\t"
+    "psllh      $f0, $f0, $f28                            \n\t"
+    "psllh      $f2, $f2, $f28                            \n\t"
+    "paddh      $f0, $f0, $f8                             \n\t"
+    "paddh      $f2, $f2, $f10                            \n\t"
+    "paddh      $f0, $f0, $f8                             \n\t"
+    "paddh      $f2, $f2, $f10                            \n\t"
+    "paddh      $f0, $f0, $f8                             \n\t"
+    "paddh      $f2, $f2, $f10                            \n\t"
+    "paddh      $f0, $f0, $f20                            \n\t"
+    "paddh      $f2, $f2, $f22                            \n\t"
+    "dli        %[iAlpha], 0x3                            \n\t"
+    "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
+    "paddh      $f0, $f0, $f28                            \n\t"
+    "paddh      $f2, $f2, $f30                            \n\t"
+    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
+    "paddh      $f0, $f0, $f28                            \n\t"
+    "paddh      $f2, $f2, $f30                            \n\t"
+    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
+    "paddh      $f0, $f0, $f28                            \n\t"
+    "paddh      $f2, $f2, $f30                            \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "psrah      $f0, $f0, $f28                            \n\t"
+    "psrah      $f2, $f2, $f28                            \n\t"
+    "and        $f0, $f0, $f12                            \n\t"
+    "and        $f2, $f2, $f14                            \n\t"
+    "pandn      $f12, $f12, $f8                           \n\t"
+    "pandn      $f14, $f14, $f10                          \n\t"
+    "or         $f0, $f0, $f12                            \n\t"
+    "or         $f2, $f2, $f14                            \n\t"
+    "and        $f28, $f4, $f0                            \n\t"
+    "and        $f30, $f6, $f2                            \n\t"
+
+    "gslqc1     $f2, $f0, 656-272(%[tmp])                 \n\t"
+    "gssqc1     $f2, $f0, 0x0($11)                        \n\t"
+
+    "gslqc1     $f2, $f0, 672-272(%[tmp])                 \n\t"
+
+    "gssqc1     $f2, $f0, 0x0($8)                         \n\t"
+    "gslqc1     $f2, $f0, 688-272(%[tmp])                 \n\t"
+    "gssqc1     $f2, $f0, 0x0($9)                         \n\t"
+    "gslqc1     $f2, $f0, 704-272(%[tmp])                 \n\t"
+
+    "pandn      $f4, $f4, $f8                             \n\t"
+    "pandn      $f6, $f6, $f10                            \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPix])                    \n\t"
+    "or         $f28, $f28, $f4                           \n\t"
+    "or         $f30, $f30, $f6                           \n\t"
+    "packushb   $f16, $f16, $f18                          \n\t"
+    "packushb   $f18, $f28, $f30                          \n\t"
+    "gssqc1     $f26, $f24, 0x0($13)                      \n\t"
+    "gssqc1     $f18, $f16, 0x0(%[iStride])               \n\t"
+    : [pPix]"+&r"((unsigned char *)pPix)
+    : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
+      [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
+      "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+      "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+/*!
+ * \brief  Loongson-3A MMI deblocking filter for a chroma edge, bS < 4 case.
+ *         Filters the Cb and Cr planes together in one pass: for each plane
+ *         the rows at pPix - 2*iStride .. pPix + iStride (p1, p0, q0, q1)
+ *         are read, and the filtered p0/q0 rows are written back.
+ *
+ * \param  pPixCb   pointer to the q0 row of the Cb edge
+ * \param  pPixCr   pointer to the q0 row of the Cr edge
+ * \param  iStride  chroma plane stride in bytes (same for Cb and Cr)
+ * \param  iAlpha   alpha threshold for the |p0-q0| gradient test
+ * \param  iBeta    beta threshold for the |p1-p0| / |q1-q0| gradient tests
+ * \param  pTC      four per-partition tc clipping values, pTC[0..3]
+ *
+ * NOTE(review): exact correspondence with the H.264 bS<4 chroma filter is
+ * assumed from the function name — confirm against the C reference version.
+ */
+void DeblockChromaLt4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+                           int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
+  /* Aligned scratch area used to spill intermediate halfword vectors. */
+  unsigned char tmp[256] __attribute__((aligned(32)));
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    /* Load the four tc bytes (pTC[0..3]) and interleave them into halfword
+     * lanes; each value is duplicated so Cb and Cr share the same clip. */
+    "lb         $8, 0x2(%[pTC])                           \n\t"
+    "lb         $9, 0x3(%[pTC])                           \n\t"
+    "move       $11, $8                                   \n\t"
+    "lb         $8, 0x1(%[pTC])                           \n\t"
+    "lb         %[pTC], 0x0(%[pTC])                       \n\t"
+    "move       $12, %[pTC]                               \n\t"
+    "and        %[pTC], $9, 0xFFFF                        \n\t"
+    "dmtc1      %[pTC], $f4                               \n\t"
+    "and        %[pTC], $9, 0xFFFF                        \n\t"
+    "dmtc1      %[pTC], $f8                               \n\t"
+    "move       %[pTC], $11                               \n\t"
+    "and        $9, %[pTC], 0xFFFF                        \n\t"
+    "and        %[pTC], %[pTC], 0xFFFF                    \n\t"
+    "dmtc1      %[pTC], $f16                              \n\t"
+    "and        %[pTC], $8, 0xFFFF                        \n\t"
+    "dmtc1      %[pTC], $f20                              \n\t"
+    "dmtc1      $9, $f12                                  \n\t"
+    "and        %[pTC], $8, 0xFFFF                        \n\t"
+    "dmtc1      %[pTC], $f24                              \n\t"
+    "move       %[pTC], $12                               \n\t"
+    "and        $9, %[pTC], 0xFFFF                        \n\t"
+    "and        %[pTC], %[pTC], 0xFFFF                    \n\t"
+    "punpcklhw  $f24, $f24, $f8                           \n\t"
+    "xor        $f0, $f0, $f0                             \n\t"
+    "xor        $f2, $f2, $f2                             \n\t"
+    "gssqc1     $f2, $f0, 0x40(%[tmp])                    \n\t"
+    "dmtc1      $9, $f28                                  \n\t"
+    "dmtc1      %[pTC], $f0                               \n\t"
+    /* Fetch the p1/p0/q0/q1 rows of both planes; %[pTC] and $9 are reused
+     * as row-address temporaries from here on. */
+    "daddu      %[pTC], %[iStride], %[iStride]            \n\t"
+    "dsubu      $9, %[pPixCb], %[pTC]                     \n\t"
+    "punpcklhw  $f20, $f20, $f4                           \n\t"
+    "gslqc1     $f6, $f4, 0x40(%[tmp])                    \n\t"
+    "punpcklhw  $f0, $f0, $f16                            \n\t"
+    "gsldxc1    $f16, 0x0(%[iStride], %[pPixCr])          \n\t"
+    "punpcklhw  $f28, $f28, $f12                          \n\t"
+    "gsldxc1    $f12, 0x0(%[pPixCb], $0)                  \n\t"
+    "punpcklhw  $f0, $f0, $f24                            \n\t"
+    "gsldxc1    $f24, 0x0($9, $0)                         \n\t"
+    "punpcklhw  $f28, $f28, $f20                          \n\t"
+    "punpckhhw  $f2, $f0, $f28                            \n\t"
+    "punpcklhw  $f0, $f0, $f28                            \n\t"
+    "dsubu      $9, %[pPixCr], %[pTC]                     \n\t"
+    "psubh      $f8, $f4, $f0                             \n\t"
+    "psubh      $f10, $f6, $f2                            \n\t"
+    "gssqc1     $f10, $f8, 0x60(%[tmp])                   \n\t"
+    "gsldxc1    $f8, 0x0($9, $0)                          \n\t"
+    "mov.d      $f26, $f8                                 \n\t"
+    "dsubu      %[pTC], %[pPixCb], %[iStride]             \n\t"
+    "gsldxc1    $f28, 0x0(%[pTC], $0)                     \n\t"
+    "dsubu      $9, %[pPixCr], %[iStride]                 \n\t"
+    "gsldxc1    $f8, 0x0($9, $0)                          \n\t"
+    "mov.d      $f30, $f8                                 \n\t"
+    "gsldxc1    $f8, 0x0(%[pPixCr], $0)                   \n\t"
+    "mov.d      $f14, $f8                                 \n\t"
+    "gsldxc1    $f8, 0x0(%[iStride], %[pPixCb])           \n\t"
+    "mov.d      $f10, $f16                                \n\t"
+    "gssqc1     $f10, $f8, 0xE0(%[tmp])                   \n\t"
+    /* Broadcast iAlpha / iBeta to every halfword lane for the compares. */
+    "dmtc1      %[iAlpha], $f8                            \n\t"
+    "punpcklhw  $f16, $f8, $f8                            \n\t"
+    "dmtc1      %[iBeta], $f8                             \n\t"
+    "punpcklhw  $f20, $f8, $f8                            \n\t"
+    "punpcklwd  $f8, $f20, $f20                           \n\t"
+    "mov.d      $f10, $f8                                 \n\t"
+    "gssqc1     $f10, $f8, 0x50(%[tmp])                   \n\t"
+    /* Widen the loaded byte rows to signed halfwords (zero-extend). */
+    "punpckhbh  $f10, $f24, $f4                           \n\t"
+    "punpcklbh  $f8, $f24, $f4                            \n\t"
+    "gssqc1     $f14, $f12, 0xd0(%[tmp])                  \n\t"
+    "punpcklwd  $f16, $f16, $f16                          \n\t"
+    "mov.d      $f18, $f16                                \n\t"
+    "gssqc1     $f10, $f8, 0x30(%[tmp])                   \n\t"
+    "punpcklbh  $f24, $f26, $f6                           \n\t"
+    "punpckhbh  $f26, $f26, $f6                           \n\t"
+    "gssqc1     $f26, $f24, 0x80(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0xd0(%[tmp])                  \n\t"
+    "punpcklbh  $f24, $f26, $f6                           \n\t"
+    "punpckhbh  $f26, $f26, $f6                           \n\t"
+    "gssqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0xe0(%[tmp])                  \n\t"
+    "punpcklbh  $f24, $f26, $f6                           \n\t"
+    "punpckhbh  $f26, $f26, $f6                           \n\t"
+    "gssqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
+    "gslqc1     $f22, $f20, 0xe0(%[tmp])                  \n\t"
+    "mov.d      $f8, $f28                                 \n\t"
+    "mov.d      $f10, $f30                                \n\t"
+    "punpcklbh  $f28, $f30, $f6                           \n\t"
+    "punpckhbh  $f30, $f30, $f6                           \n\t"
+    "punpckhbh  $f22, $f20, $f4                           \n\t"
+    "punpcklbh  $f20, $f20, $f4                           \n\t"
+    "gssqc1     $f30, $f28, 0xa0(%[tmp])                  \n\t"
+    "punpckhbh  $f14, $f12, $f4                           \n\t"
+    "punpcklbh  $f12, $f12, $f4                           \n\t"
+    "dli        %[iBeta], 0x4                             \n\t"
+    "punpckhbh  $f10, $f8, $f4                            \n\t"
+    "punpcklbh  $f8, $f8, $f4                             \n\t"
+    /* Rounding constant 4 broadcast for the (delta + 4) >> 3 computation. */
+    "dmtc1      %[iBeta], $f24                            \n\t"
+    "punpcklhw  $f28, $f24, $f24                          \n\t"
+    "punpcklwd  $f24, $f28, $f28                          \n\t"
+    "mov.d      $f26, $f24                                \n\t"
+    "gslqc1     $f30, $f28, 0x30(%[tmp])                  \n\t"
+    "gssqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
+    "psubh      $f28, $f28, $f20                          \n\t"
+    "psubh      $f30, $f30, $f22                          \n\t"
+    "pcmpgth    $f24, $f0, $f4                            \n\t"
+    "pcmpgth    $f26, $f2, $f6                            \n\t"
+    "gslqc1     $f6, $f4, 0x60(%[tmp])                    \n\t"
+    "gssqc1     $f26, $f24, 0x40(%[tmp])                  \n\t"
+    /* delta = ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3, then clamp to ±tc. */
+    "psubh      $f24, $f12, $f8                           \n\t"
+    "psubh      $f26, $f14, $f10                          \n\t"
+    "dmfc1      %[iAlpha], $f12                           \n\t"
+    "dmfc1      %[iBeta], $f14                            \n\t"
+    "dli        $10, 0x2                                  \n\t"
+    "dmtc1      $10, $f12                                 \n\t"
+    "dli        $10, 0x3                                  \n\t"
+    "dmtc1      $10, $f14                                 \n\t"
+    "psllh      $f24, $f24, $f12                          \n\t"
+    "psllh      $f26, $f26, $f12                          \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 0x20(%[tmp])                  \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
+    "psrah      $f24, $f24, $f14                          \n\t"
+    "psrah      $f26, $f26, $f14                          \n\t"
+    "dmtc1      %[iAlpha], $f12                           \n\t"
+    "dmtc1      %[iBeta], $f14                            \n\t"
+    "pmaxsh     $f4, $f4, $f24                            \n\t"
+    "pmaxsh     $f6, $f6, $f26                            \n\t"
+    "gssqc1     $f2, $f0, 0x10(%[tmp])                    \n\t"
+    "gslqc1     $f26, $f24, 0x10(%[tmp])                  \n\t"
+    "pminsh     $f24, $f24, $f4                           \n\t"
+    "pminsh     $f26, $f26, $f6                           \n\t"
+    "gssqc1     $f26, $f24, 0x10(%[tmp])                  \n\t"
+    /* Edge-activity masks: |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta,
+     * plus tc > 0; the masks are ANDed together below. */
+    "psubh      $f4, $f8, $f12                            \n\t"
+    "psubh      $f6, $f10, $f14                           \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f24, $f26)
+    "pcmpgth    $f24, $f16, $f4                           \n\t"
+    "pcmpgth    $f26, $f18, $f6                           \n\t"
+    "gslqc1     $f6, $f4, 0x30(%[tmp])                    \n\t"
+    "psubh      $f4, $f4, $f8                             \n\t"
+    "psubh      $f6, $f6, $f10                            \n\t"
+    "dmfc1      %[iAlpha], $f8                            \n\t"
+    "dmfc1      %[iBeta], $f10                            \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f8, $f10)
+    "pcmpgth    $f28, $f28, $f4                           \n\t"
+    "pcmpgth    $f30, $f30, $f6                           \n\t"
+    "gslqc1     $f6, $f4, 0x50(%[tmp])                    \n\t"
+    "and        $f24, $f24, $f28                          \n\t"
+    "and        $f26, $f26, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
+    "psubh      $f20, $f20, $f12                          \n\t"
+    "psubh      $f22, $f22, $f14                          \n\t"
+    WELS_AbsH($f20, $f22, $f20, $f22, $f8, $f10)
+    "pcmpgth    $f4, $f4, $f20                            \n\t"
+    "pcmpgth    $f6, $f6, $f22                            \n\t"
+    "gslqc1     $f22, $f20, 0x80(%[tmp])                  \n\t"
+    "gslqc1     $f10, $f8, 0x90(%[tmp])                   \n\t"
+    "psubh      $f20, $f20, $f8                           \n\t"
+    "psubh      $f22, $f22, $f10                          \n\t"
+    "and        $f24, $f24, $f4                           \n\t"
+    "and        $f26, $f26, $f6                           \n\t"
+    "gslqc1     $f10, $f8, 0x40(%[tmp])                   \n\t"
+    "and        $f24, $f24, $f8                           \n\t"
+    "and        $f26, $f26, $f10                          \n\t"
+    "gslqc1     $f6, $f4, 0x10(%[tmp])                    \n\t"
+    "and        $f4, $f4, $f24                            \n\t"
+    "and        $f6, $f6, $f26                            \n\t"
+    /* Second plane: same delta computation and masking for the Cr rows. */
+    "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
+    "gssqc1     $f6, $f4, 0x30(%[tmp])                    \n\t"
+    "gslqc1     $f6, $f4, 0xa0(%[tmp])                    \n\t"
+    "psubh      $f24, $f24, $f4                           \n\t"
+    "psubh      $f26, $f26, $f6                           \n\t"
+    "dli        $10, 0x2                                  \n\t"
+    "dmtc1      $10, $f8                                  \n\t"
+    "psllh      $f24, $f24, $f8                           \n\t"
+    "psllh      $f26, $f26, $f8                           \n\t"
+    "paddh      $f24, $f24, $f20                          \n\t"
+    "paddh      $f26, $f26, $f22                          \n\t"
+    "dli        $10, 0x3                                  \n\t"
+    "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
+    "paddh      $f24, $f24, $f8                           \n\t"
+    "paddh      $f26, $f26, $f10                          \n\t"
+    "dmtc1      $10, $f8                                  \n\t"
+    "gslqc1     $f22, $f20, 0x60(%[tmp])                  \n\t"
+    "psrah      $f24, $f24, $f8                           \n\t"
+    "psrah      $f26, $f26, $f8                          \n\t"
+    "pmaxsh     $f20, $f20, $f24                          \n\t"
+    "pmaxsh     $f22, $f22, $f26                          \n\t"
+    "pminsh     $f0, $f0, $f20                            \n\t"
+    "pminsh     $f2, $f2, $f22                            \n\t"
+    "gslqc1     $f22, $f20, 0x70(%[tmp])                  \n\t"
+    "psubh      $f24, $f4, $f20                           \n\t"
+    "psubh      $f26, $f6, $f22                           \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+    "pcmpgth    $f16, $f16, $f24                          \n\t"
+    "pcmpgth    $f18, $f18, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 0x80(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f4                           \n\t"
+    "psubh      $f26, $f26, $f6                           \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+    "pcmpgth    $f28, $f28, $f24                          \n\t"
+    "pcmpgth    $f30, $f30, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
+    "and        $f16, $f16, $f28                          \n\t"
+    "and        $f18, $f18, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f20                          \n\t"
+    "psubh      $f26, $f26, $f22                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+    "dmtc1      %[iAlpha], $f8                            \n\t"
+    "dmtc1      %[iBeta], $f10                            \n\t"
+    "pcmpgth    $f28, $f28, $f24                          \n\t"
+    "pcmpgth    $f30, $f30, $f26                          \n\t"
+    "and        $f16, $f16, $f28                          \n\t"
+    "and        $f18, $f18, $f30                          \n\t"
+    "gslqc1     $f26, $f24, 0x40(%[tmp])                  \n\t"
+    "and        $f16, $f16, $f24                          \n\t"
+    "and        $f18, $f18, $f26                          \n\t"
+    "and        $f0, $f0, $f16                            \n\t"
+    "and        $f2, $f2, $f18                            \n\t"
+    /* Apply the clipped deltas (p0 += d, q0 -= d), repack to bytes and
+     * store the four filtered rows; %[pTC]/$9 hold the p0 row addresses. */
+    "gslqc1     $f18, $f16, 0x30(%[tmp])                  \n\t"
+    "paddh      $f8, $f8, $f16                            \n\t"
+    "paddh      $f10, $f10, $f18                          \n\t"
+    "paddh      $f4, $f4, $f0                             \n\t"
+    "paddh      $f6, $f6, $f2                             \n\t"
+    "packushb   $f8, $f8, $f10                            \n\t"
+    "packushb   $f10, $f4, $f6                            \n\t"
+    "gssdxc1    $f8, 0x0(%[pTC], $0)                      \n\t"
+    "psubh      $f12, $f12, $f16                          \n\t"
+    "psubh      $f14, $f14, $f18                          \n\t"
+    "psubh      $f20, $f20, $f0                           \n\t"
+    "psubh      $f22, $f22, $f2                           \n\t"
+    "packushb   $f12, $f12, $f14                          \n\t"
+    "packushb   $f14, $f20, $f22                          \n\t"
+    "gssdxc1    $f12, 0x0(%[pPixCb], $0)                  \n\t"
+    "gssdxc1    $f10, 0x0($9, $0)                         \n\t"
+    "gssdxc1    $f14, 0x0(%[pPixCr], $0)                  \n\t"
+    /* iAlpha, iBeta and pTC are overwritten inside the asm block (dli/dmfc1
+     * and address arithmetic above), so they must be declared as read-write
+     * outputs ("+&r"); declaring a modified operand as a plain input is
+     * undefined behavior in GCC extended asm. */
+    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr),
+      [iAlpha]"+&r"(iAlpha), [iBeta]"+&r"(iBeta), [pTC]"+&r"((unsigned char *)pTC)
+    : [iStride]"r"((int)iStride), [tmp]"r"((unsigned char *)tmp)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
+      "$f10", "$f12",  "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+      "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+/*!
+ * \brief  Chroma deblocking across a horizontal edge for the bS==4 ("Eq4")
+ *         case, Loongson-3A MMI implementation.  Cb and Cr are filtered in
+ *         one pass: the 8-byte rows of the two planes are packed side by
+ *         side into 128-bit ($fN/$fN+2 pair) vectors.
+ *
+ * \param  pPixCb   pointer to the first Cb row below the edge (q0 row)
+ * \param  pPixCr   pointer to the first Cr row below the edge (q0 row)
+ * \param  iStride  stride of both chroma planes, in bytes
+ * \param  iAlpha   alpha threshold used for the |p0-q0| edge test
+ * \param  iBeta    beta threshold used for the |p1-p0| / |q1-q0| tests
+ *
+ * NOTE(review): the asm loads rows pPix-2*iStride .. pPix+iStride, i.e. two
+ * rows on each side of the edge, and rewrites only the p0/q0 rows -- this
+ * matches the H.264 bS==4 chroma filter; confirm against DeblockChromaEq4V_c.
+ *
+ * Fix vs. the original submission: the asm body clobbers the GPRs holding
+ * %[iAlpha] and %[iBeta] (they are reused as spill space via dmfc1/dmtc1 and
+ * overwritten by "dli %[iBeta], 0x2"), so they must be read-write outputs
+ * ("+&r"), not input-only operands -- writing an input operand is undefined
+ * behavior per the GCC Extended Asm rules and can miscompile callers that
+ * use iAlpha/iBeta after this statement is inlined.
+ */
+void DeblockChromaEq4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+                           int32_t iAlpha, int32_t iBeta) {
+  /* scratch spill area for unpacked halfword vectors */
+  unsigned char tmp[128] __attribute__((aligned(32)));
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                          \n\t"
+    /* $8 = 2*iStride; $9 = pPix - 2*iStride (p1 row); load the four
+     * rows p1,p0,q0,q1 of both planes */
+    "daddu      $8, %[iStride], %[iStride]               \n\t"
+    "dsubu      $9, %[pPixCb], $8                        \n\t"
+    "gsldxc1    $f16, 0x0(%[pPixCr], $0)                 \n\t"
+    "gsldxc1    $f20, 0x0(%[iStride], %[pPixCr])         \n\t"
+    "gsldxc1    $f4, 0x0($9, $0)                         \n\t"
+    "dsubu      $9, %[pPixCr], $8                        \n\t"
+    "gsldxc1    $f8, 0x0($9, $0)                         \n\t"
+    "mov.d      $f6, $f8                                 \n\t"
+    "dsubu      $8, %[pPixCb], %[iStride]                \n\t"
+    "gsldxc1    $f8, 0x0($8, $0)                         \n\t"
+    "dsubu      $9, %[pPixCr], %[iStride]                \n\t"
+    "gsldxc1    $f12, 0x0($9, $0)                        \n\t"
+    "mov.d      $f10, $f12                               \n\t"
+    "gsldxc1    $f12, 0x0(%[pPixCb], $0)                 \n\t"
+    "mov.d      $f14, $f16                               \n\t"
+    "gsldxc1    $f16, 0x0(%[iStride], %[pPixCb])         \n\t"
+    "mov.d      $f18, $f20                               \n\t"
+    /* broadcast iAlpha/iBeta into every halfword lane of $f20/$f22 and
+     * $f24/$f26 */
+    "dmtc1      %[iAlpha], $f20                          \n\t"
+    "xor        $f0, $f0, $f0                            \n\t"
+    "xor        $f2, $f2, $f2                            \n\t"
+    "punpcklhw  $f24, $f20, $f20                         \n\t"
+    "punpcklwd  $f20, $f24, $f24                         \n\t"
+    "mov.d      $f22, $f20                               \n\t"
+    "dmtc1      %[iBeta], $f24                           \n\t"
+    "punpcklhw  $f28, $f24, $f24                         \n\t"
+    "punpcklwd  $f24, $f28, $f28                         \n\t"
+    "mov.d      $f26, $f24                               \n\t"
+    /* widen the packed bytes to halfwords (zero-extend) and spill the
+     * p1/p0/q0/q1 vectors to tmp[0x10..0x70] */
+    "mov.d      $f28, $f4                                \n\t"
+    "punpcklbh  $f4, $f6, $f2                            \n\t"
+    "punpckhbh  $f6, $f6, $f2                            \n\t"
+    "punpckhbh  $f30, $f28, $f0                          \n\t"
+    "punpcklbh  $f28, $f28, $f0                          \n\t"
+    "gssqc1     $f6, $f4, 0x40(%[tmp])                   \n\t"
+    "gssqc1     $f30, $f28, 0x60(%[tmp])                 \n\t"
+    "punpckhbh  $f30, $f8, $f0                           \n\t"
+    "punpcklbh  $f28, $f8, $f0                           \n\t"
+    "gssqc1     $f30, $f28, 0x10(%[tmp])                 \n\t"
+    "punpckhbh  $f30, $f12, $f0                          \n\t"
+    "punpcklbh  $f28, $f12, $f0                          \n\t"
+    "punpcklbh  $f12, $f14, $f2                          \n\t"
+    "punpckhbh  $f14, $f14, $f2                          \n\t"
+    "gssqc1     $f30, $f28, 0x50(%[tmp])                 \n\t"
+    "mov.d      $f28, $f16                               \n\t"
+    "punpcklbh  $f16, $f18, $f2                          \n\t"
+    "punpckhbh  $f18, $f18, $f2                          \n\t"
+    "punpcklbh  $f8, $f10, $f2                           \n\t"
+    "punpckhbh  $f10, $f10, $f2                          \n\t"
+    "punpckhbh  $f30, $f28, $f0                          \n\t"
+    "punpcklbh  $f28, $f28, $f0                          \n\t"
+    "gssqc1     $f14, $f12, 0x30(%[tmp])                 \n\t"
+    /* build the per-lane filter-enable masks:
+     * |p0-q0| < alpha  &&  |p1-p0| < beta  &&  |q1-q0| < beta */
+    "gslqc1     $f14, $f12, 0x10(%[tmp])                 \n\t"
+    "gslqc1     $f2, $f0, 0x50(%[tmp])                   \n\t"
+    "psubh      $f4, $f12, $f0                           \n\t"
+    "psubh      $f6, $f14, $f2                           \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+    "gssqc1     $f18, $f16, 0x20(%[tmp])                 \n\t"
+    "pcmpgth    $f0, $f20, $f4                           \n\t"
+    "pcmpgth    $f2, $f22, $f6                           \n\t"
+    "gslqc1     $f6, $f4, 0x60(%[tmp])                   \n\t"
+    "psubh      $f4, $f4, $f12                           \n\t"
+    "psubh      $f6, $f6, $f14                           \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+    "pcmpgth    $f16, $f24, $f4                          \n\t"
+    "pcmpgth    $f18, $f26, $f6                          \n\t"
+    "and        $f0, $f0, $f16                           \n\t"
+    "and        $f2, $f2, $f18                           \n\t"
+    "gslqc1     $f18, $f16, 0x50(%[tmp])                 \n\t"
+    "psubh      $f4, $f28, $f16                          \n\t"
+    "psubh      $f6, $f30, $f18                          \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+    "pcmpgth    $f16, $f24, $f4                          \n\t"
+    "pcmpgth    $f18, $f26, $f6                          \n\t"
+    "gslqc1     $f6, $f4, 0x30(%[tmp])                   \n\t"
+    "psubh      $f4, $f8, $f4                            \n\t"
+    "psubh      $f6, $f10, $f6                           \n\t"
+    /* no FP registers are free here: park $f28/$f30 in the iAlpha/iBeta
+     * GPRs (this is why those operands must be read-write, see header) */
+    "dmfc1      %[iAlpha], $f28                          \n\t"
+    "dmfc1      %[iBeta], $f30                           \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
+    "pcmpgth    $f20, $f20, $f4                          \n\t"
+    "pcmpgth    $f22, $f22, $f6                          \n\t"
+    "gslqc1     $f6, $f4, 0x40(%[tmp])                   \n\t"
+    "and        $f0, $f0, $f16                           \n\t"
+    "and        $f2, $f2, $f18                           \n\t"
+    "psubh      $f4, $f4, $f8                            \n\t"
+    "psubh      $f6, $f6, $f10                           \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+    "pcmpgth    $f16, $f24, $f4                          \n\t"
+    "pcmpgth    $f18, $f26, $f6                          \n\t"
+    "gslqc1     $f6, $f4, 0x20(%[tmp])                   \n\t"
+    "gslqc1     $f30, $f28, 0x30(%[tmp])                 \n\t"
+    "psubh      $f4, $f4, $f28                           \n\t"
+    "psubh      $f6, $f6, $f30                           \n\t"
+    "and        $f20, $f20, $f16                         \n\t"
+    "and        $f22, $f22, $f18                         \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
+    /* restore the parked $f28/$f30 */
+    "dmtc1      %[iAlpha], $f28                          \n\t"
+    "dmtc1      %[iBeta], $f30                           \n\t"
+    "pcmpgth    $f24, $f24, $f4                          \n\t"
+    "pcmpgth    $f26, $f26, $f6                          \n\t"
+    "and        $f20, $f20, $f24                         \n\t"
+    "and        $f22, $f22, $f26                         \n\t"
+    /* iBeta GPR is dead from here on; reuse it for the constant 2
+     * (rounding term and arithmetic-shift count of the averaging) */
+    "dli        %[iBeta], 0x2                            \n\t"
+    "dmtc1      %[iBeta], $f4                            \n\t"
+    "punpcklhw  $f16, $f4, $f4                           \n\t"
+    "punpcklwd  $f4, $f16, $f16                          \n\t"
+    "mov.d      $f6, $f4                                 \n\t"
+    /* new p0 = (2*p1 + p0 + q1 + 2) >> 2, kept only where the mask is set */
+    "gslqc1     $f18, $f16, 0x60(%[tmp])                 \n\t"
+    "paddh      $f24, $f16, $f16                         \n\t"
+    "paddh      $f26, $f18, $f18                         \n\t"
+    "paddh      $f24, $f24, $f12                         \n\t"
+    "paddh      $f26, $f26, $f14                         \n\t"
+    "paddh      $f24, $f24, $f28                         \n\t"
+    "paddh      $f26, $f26, $f30                         \n\t"
+    "gssqc1     $f6, $f4, 0x10(%[tmp])                   \n\t"
+    "gslqc1     $f18, $f16, 0x10(%[tmp])                 \n\t"
+    "paddh      $f24, $f24, $f16                         \n\t"
+    "paddh      $f26, $f26, $f18                         \n\t"
+    "dmtc1      %[iBeta], $f16                           \n\t"
+    "psrah      $f24, $f24, $f16                         \n\t"
+    "psrah      $f26, $f26, $f16                         \n\t"
+    "pandn      $f16, $f0, $f12                          \n\t"
+    "pandn      $f18, $f2, $f14                          \n\t"
+    "gslqc1     $f14, $f12, 0x40(%[tmp])                 \n\t"
+    "and        $f4, $f0, $f24                           \n\t"
+    "and        $f6, $f2, $f26                           \n\t"
+    "or         $f4, $f4, $f16                           \n\t"
+    "or         $f6, $f6, $f18                           \n\t"
+    /* new q0 = (2*q1 + q0 + p1 + 2) >> 2, masked likewise */
+    "paddh      $f24, $f12, $f12                         \n\t"
+    "paddh      $f26, $f14, $f14                         \n\t"
+    "gslqc1     $f14, $f12, 0x10(%[tmp])                 \n\t"
+    "paddh      $f24, $f24, $f8                          \n\t"
+    "paddh      $f26, $f26, $f10                         \n\t"
+    "gslqc1     $f18, $f16, 0x20(%[tmp])                 \n\t"
+    "paddh      $f24, $f24, $f16                         \n\t"
+    "paddh      $f26, $f26, $f18                         \n\t"
+    "dmtc1      %[iBeta], $f16                           \n\t"
+    "paddh      $f24, $f24, $f12                         \n\t"
+    "paddh      $f26, $f26, $f14                         \n\t"
+    "psrah      $f24, $f24, $f16                         \n\t"
+    "psrah      $f26, $f26, $f16                         \n\t"
+    "and        $f16, $f20, $f24                         \n\t"
+    "and        $f18, $f22, $f26                         \n\t"
+    "pandn      $f24, $f20, $f8                          \n\t"
+    "pandn      $f26, $f22, $f10                         \n\t"
+    "or         $f16, $f16, $f24                         \n\t"
+    "or         $f18, $f18, $f26                         \n\t"
+    "packushb   $f4, $f4, $f6                            \n\t"
+    "packushb   $f6, $f16, $f18                          \n\t"
+    /* second half of the lanes (the Cr-dominated halves) */
+    "gslqc1     $f18, $f16, 0x50(%[tmp])                 \n\t"
+    "paddh      $f24, $f28, $f28                         \n\t"
+    "paddh      $f26, $f30, $f30                         \n\t"
+    "paddh      $f24, $f24, $f16                         \n\t"
+    "paddh      $f26, $f26, $f18                         \n\t"
+    "gslqc1     $f10, $f8, 0x60(%[tmp])                  \n\t"
+    "paddh      $f24, $f24, $f8                          \n\t"
+    "paddh      $f26, $f26, $f10                         \n\t"
+    "dmtc1      %[iBeta], $f28                           \n\t"
+    "paddh      $f24, $f24, $f12                         \n\t"
+    "paddh      $f26, $f26, $f14                         \n\t"
+    "psrah      $f24, $f24, $f28                         \n\t"
+    "psrah      $f26, $f26, $f28                         \n\t"
+    "and        $f8, $f0, $f24                           \n\t"
+    "and        $f10, $f2, $f26                          \n\t"
+    "pandn      $f0, $f0, $f16                           \n\t"
+    "pandn      $f2, $f2, $f18                           \n\t"
+    "or         $f8, $f8, $f0                            \n\t"
+    "or         $f10, $f10, $f2                          \n\t"
+    "gslqc1     $f2, $f0, 0x20(%[tmp])                   \n\t"
+    "paddh      $f24, $f0, $f0                           \n\t"
+    "paddh      $f26, $f2, $f2                           \n\t"
+    "gslqc1     $f2, $f0, 0x30(%[tmp])                   \n\t"
+    "paddh      $f24, $f24, $f0                          \n\t"
+    "paddh      $f26, $f26, $f2                          \n\t"
+    "gslqc1     $f18, $f16, 0x40(%[tmp])                 \n\t"
+    "paddh      $f24, $f24, $f16                         \n\t"
+    "paddh      $f26, $f26, $f18                         \n\t"
+    "paddh      $f24, $f24, $f12                         \n\t"
+    "paddh      $f26, $f26, $f14                         \n\t"
+    /* store filtered p0 rows ($8/$9 still hold pPix - iStride) and the
+     * filtered q0 rows at pPixCb/pPixCr */
+    "gssdxc1    $f4, 0x0($8, $0)                         \n\t"
+    "psrah      $f24, $f24, $f28                         \n\t"
+    "psrah      $f26, $f26, $f28                         \n\t"
+    "and        $f16, $f20, $f24                         \n\t"
+    "and        $f18, $f22, $f26                         \n\t"
+    "pandn      $f20, $f20, $f0                          \n\t"
+    "pandn      $f22, $f22, $f2                          \n\t"
+    "or         $f16, $f16, $f20                         \n\t"
+    "or         $f18, $f18, $f22                         \n\t"
+    "packushb   $f8, $f8, $f10                           \n\t"
+    "packushb   $f10, $f16, $f18                         \n\t"
+    "gssdxc1    $f8, 0x0(%[pPixCb], $0)                  \n\t"
+    "gssdxc1    $f6, 0x0($9, $0)                         \n\t"
+    "gssdxc1    $f10, 0x0(%[pPixCr], $0)                 \n\t"
+    /* iAlpha/iBeta are written inside the asm (dmfc1/dli above), so they
+     * are read-write outputs, not inputs */
+    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr),
+      [iAlpha]"+&r"(iAlpha), [iBeta]"+&r"(iBeta)
+    : [iStride]"r"((int)iStride), [tmp]"r"((unsigned char *)tmp)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
+      "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+      "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+void DeblockChromaEq4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+                           int32_t iAlpha, int32_t iBeta) {
+  unsigned char tmp[256] __attribute__((aligned(32)));
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "daddiu     %[pPixCb], %[pPixCb], -0x2                \n\t"
+    "daddiu     %[pPixCr], %[pPixCr], -0x2                \n\t"
+    "move       $9, %[pPixCb]                             \n\t"
+    "move       $10, %[pPixCr]                            \n\t"
+    "dsll       $11, %[iStride], 0x2                      \n\t"
+    "daddu      %[pPixCb], %[pPixCb], $11                 \n\t"
+    "daddu      %[pPixCr], %[pPixCr], $11                 \n\t"
+    "daddiu     $11, %[tmp], 0x80                         \n\t"
+    "gsldlc1    $f0, 0x7($9)                              \n\t"
+    "gsldrc1    $f0, 0x0($9)                              \n\t"
+    "daddu      $12, $9, %[iStride]                       \n\t"
+    "gsldlc1    $f4, 0x7($12)                             \n\t"
+    "gsldrc1    $f4, 0x0($12)                             \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsldlc1    $f8, 0x7($12)                             \n\t"
+    "gsldrc1    $f8, 0x0($12)                             \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsldlc1    $f12, 0x7($12)                            \n\t"
+    "gsldlc1    $f16, 0x7($10)                            \n\t"
+    "gsldrc1    $f12, 0x0($12)                            \n\t"
+    "gsldrc1    $f16, 0x0($10)                            \n\t"
+    "daddu      $12, $10, %[iStride]                      \n\t"
+    "gsldlc1    $f20, 0x7($12)                            \n\t"
+    "gsldrc1    $f20, 0x0($12)                            \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsldlc1    $f24, 0x7($12)                            \n\t"
+    "gsldrc1    $f24, 0x0($12)                            \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsldlc1    $f28, 0x7($12)                            \n\t"
+    "gsldrc1    $f28, 0x0($12)                            \n\t"
+    "punpcklwd  $f0, $f0, $f16                            \n\t"
+    "punpcklwd  $f4, $f4, $f20                            \n\t"
+    "punpcklwd  $f8, $f8, $f24                            \n\t"
+    "punpcklwd  $f12, $f12, $f28                          \n\t"
+    "gsldlc1    $f16, 0x7(%[pPixCb])                      \n\t"
+    "gsldlc1    $f20, 0x7(%[pPixCr])                      \n\t"
+    "gsldrc1    $f16, 0x0(%[pPixCb])                      \n\t"
+    "gsldrc1    $f20, 0x0(%[pPixCr])                      \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f2, $f16                                 \n\t"
+    "daddu      $12, %[pPixCb], %[iStride]                \n\t"
+    "daddu      $13, %[pPixCr], %[iStride]                \n\t"
+    "gsldlc1    $f16, 0x7($12)                            \n\t"
+    "gsldlc1    $f20, 0x7($13)                            \n\t"
+    "gsldrc1    $f16, 0x0($12)                            \n\t"
+    "gsldrc1    $f20, 0x0($13)                            \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f6, $f16                                 \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "daddu      $13, $13, %[iStride]                      \n\t"
+    "gsldlc1    $f16, 0x7($12)                            \n\t"
+    "gsldlc1    $f20, 0x7($13)                            \n\t"
+    "gsldrc1    $f16, 0x0($12)                            \n\t"
+    "gsldrc1    $f20, 0x0($13)                            \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f10, $f16                                \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "daddu      $13, $13, %[iStride]                      \n\t"
+    "gsldlc1    $f16, 0x7($12)                            \n\t"
+    "gsldlc1    $f20, 0x7($13)                            \n\t"
+    "gsldrc1    $f16, 0x0($12)                            \n\t"
+    "gsldrc1    $f20, 0x0($13)                            \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f14, $f16                                \n\t"
+    "punpcklbh  $f24, $f2, $f6                            \n\t"
+    "punpckhbh  $f26, $f2, $f6                            \n\t"
+    "punpckhbh  $f2, $f0, $f4                             \n\t"
+    "punpcklbh  $f0, $f0, $f4                             \n\t"
+    "punpcklbh  $f28, $f10, $f14                          \n\t"
+    "punpckhbh  $f30, $f10, $f14                          \n\t"
+    "punpckhbh  $f10, $f8, $f12                           \n\t"
+    "punpcklbh  $f8, $f8, $f12                            \n\t"
+    "punpcklhw  $f16, $f2, $f10                           \n\t"
+    "punpckhhw  $f18, $f2, $f10                           \n\t"
+    "punpckhhw  $f2, $f0, $f8                             \n\t"
+    "punpcklhw  $f0, $f0, $f8                             \n\t"
+    "punpcklhw  $f20, $f26, $f30                          \n\t"
+    "punpckhhw  $f22, $f26, $f30                          \n\t"
+    "punpckhhw  $f26, $f24, $f28                          \n\t"
+    "punpcklhw  $f24, $f24, $f28                          \n\t"
+    "punpcklwd  $f4, $f2, $f26                            \n\t"
+    "punpckhwd  $f6, $f2, $f26                            \n\t"
+    "punpckhwd  $f2, $f0, $f24                            \n\t"
+    "punpcklwd  $f0, $f0, $f24                            \n\t"
+    "punpcklwd  $f8, $f18, $f22                           \n\t"
+    "punpckhwd  $f10, $f18, $f22                          \n\t"
+    "punpckhwd  $f18, $f16, $f20                          \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f20, $f2                                 \n\t"
+    "mov.d      $f22, $f18                                \n\t"
+    "mov.d      $f2, $f16                                 \n\t"
+    "mov.d      $f24, $f6                                 \n\t"
+    "mov.d      $f26, $f10                                \n\t"
+    "mov.d      $f6, $f8                                  \n\t"
+    "gssqc1     $f2, $f0, 0x0($11)                        \n\t"
+    "gssqc1     $f22, $f20, 0x10($11)                     \n\t"
+    "gssqc1     $f6, $f4, 0x20($11)                       \n\t"
+    "gssqc1     $f26, $f24, 0x30($11)                     \n\t"
+    "gslqc1     $f26, $f24, 0x80(%[tmp])                  \n\t"
+    "gslqc1     $f18, $f16, 0x90(%[tmp])                  \n\t"
+    "gslqc1     $f22, $f20, 0xa0(%[tmp])                  \n\t"
+    "gslqc1     $f30, $f28, 0xb0(%[tmp])                  \n\t"
+    "xor        $f0, $f0, $f0                             \n\t"
+    "dmtc1      %[iAlpha], $f4                            \n\t"
+    "punpcklhw  $f8, $f4, $f4                             \n\t"
+    "punpcklwd  $f4, $f8, $f8                             \n\t"
+    "mov.d      $f6, $f4                                  \n\t"
+    "dmtc1      %[iBeta], $f8                             \n\t"
+    "punpcklhw  $f12, $f8, $f8                            \n\t"
+    "punpcklwd  $f8, $f12, $f12                           \n\t"
+    "mov.d      $f10, $f8                                 \n\t"
+    "mov.d      $f12, $f24                                \n\t"
+    "punpcklbh  $f24, $f26, $f0                           \n\t"
+    "punpckhbh  $f26, $f26, $f0                           \n\t"
+    "gssqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
+    "punpcklbh  $f24, $f26, $f0                           \n\t"
+    "punpckhbh  $f26, $f26, $f0                           \n\t"
+    "gssqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0xa0(%[tmp])                  \n\t"
+    "punpcklbh  $f24, $f26, $f0                           \n\t"
+    "punpckhbh  $f26, $f26, $f0                           \n\t"
+    "gssqc1     $f26, $f24, 0x40(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0xb0(%[tmp])                  \n\t"
+    "punpcklbh  $f24, $f26, $f0                           \n\t"
+    "punpckhbh  $f26, $f26, $f0                           \n\t"
+    "gssqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
+    "punpckhbh  $f30, $f28, $f0                           \n\t"
+    "punpcklbh  $f28, $f28, $f0                           \n\t"
+    "punpckhbh  $f18, $f16, $f0                           \n\t"
+    "punpcklbh  $f16, $f16, $f0                           \n\t"
+    "punpckhbh  $f22, $f20, $f0                           \n\t"
+    "punpcklbh  $f20, $f20, $f0                           \n\t"
+    "punpckhbh  $f14, $f12, $f0                           \n\t"
+    "punpcklbh  $f12, $f12, $f0                           \n\t"
+    "gssqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
+    "psubh      $f24, $f16, $f20                          \n\t"
+    "psubh      $f26, $f18, $f22                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+    "pcmpgth    $f0, $f4, $f24                            \n\t"
+    "pcmpgth    $f2, $f6, $f26                            \n\t"
+    "psubh      $f24, $f12, $f16                          \n\t"
+    "psubh      $f26, $f14, $f18                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+    "pcmpgth    $f28, $f8, $f24                           \n\t"
+    "pcmpgth    $f30, $f10, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 0x50(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f20                          \n\t"
+    "psubh      $f26, $f26, $f22                          \n\t"
+    "and        $f0, $f0, $f28                            \n\t"
+    "and        $f2, $f2, $f30                            \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+    "dmfc1      %[iAlpha], $f20                           \n\t"
+    "dmfc1      %[iBeta], $f22                            \n\t"
+    "pcmpgth    $f28, $f8, $f24                           \n\t"
+    "pcmpgth    $f30, $f10, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
+    "gslqc1     $f22, $f20, 0x40(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f20                          \n\t"
+    "psubh      $f26, $f26, $f22                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+    "pcmpgth    $f4, $f4, $f24                            \n\t"
+    "pcmpgth    $f6, $f6, $f26                            \n\t"
+    "gslqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
+    "gslqc1     $f22, $f20, 0x30(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f20                          \n\t"
+    "psubh      $f26, $f26, $f22                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+    "and        $f0, $f0, $f28                            \n\t"
+    "and        $f2, $f2, $f30                            \n\t"
+    "pcmpgth    $f28, $f8, $f24                           \n\t"
+    "pcmpgth    $f30, $f10, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
+    "gslqc1     $f22, $f20, 0x40(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f20                          \n\t"
+    "psubh      $f26, $f26, $f22                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+    "dli        $8, 0x2                                   \n\t"
+    "and        $f4, $f4, $f28                            \n\t"
+    "and        $f6, $f6, $f30                            \n\t"
+    "pcmpgth    $f8, $f8, $f24                            \n\t"
+    "pcmpgth    $f10, $f10, $f26                          \n\t"
+    "and        $f4, $f4, $f8                             \n\t"
+    "and        $f6, $f6, $f10                            \n\t"
+    "dmtc1      $8, $f8                                   \n\t"
+    "punpcklhw  $f24, $f8, $f8                            \n\t"
+    "punpcklwd  $f8, $f24, $f24                           \n\t"
+    "mov.d      $f10, $f8                                 \n\t"
+    "gssqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
+    "paddh      $f8, $f12, $f12                           \n\t"
+    "paddh      $f10, $f14, $f14                          \n\t"
+    "paddh      $f8, $f8, $f16                            \n\t"
+    "paddh      $f10, $f10, $f18                          \n\t"
+    "gslqc1     $f22, $f20, 0x50(%[tmp])                  \n\t"
+    "paddh      $f8, $f8, $f20                            \n\t"
+    "paddh      $f10, $f10, $f22                          \n\t"
+    "gslqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
+    "paddh      $f8, $f8, $f24                            \n\t"
+    "paddh      $f10, $f10, $f26                          \n\t"
+    "dmtc1      $8, $f20                                  \n\t"
+    "psrah      $f8, $f8, $f20                            \n\t"
+    "psrah      $f10, $f10, $f20                          \n\t"
+    "and        $f24, $f0, $f8                            \n\t"
+    "and        $f26, $f2, $f10                           \n\t"
+    "pandn      $f8, $f0, $f16                            \n\t"
+    "pandn      $f10, $f2, $f18                           \n\t"
+    "or         $f24, $f24, $f8                           \n\t"
+    "or         $f26, $f26, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 0x60(%[tmp])                   \n\t"
+    "paddh      $f28, $f8, $f8                            \n\t"
+    "paddh      $f30, $f10, $f10                          \n\t"
+    "gslqc1     $f22, $f20, 0x30(%[tmp])                  \n\t"
+    "paddh      $f28, $f28, $f20                          \n\t"
+    "paddh      $f30, $f30, $f22                          \n\t"
+    "gslqc1     $f18, $f16, 0x70(%[tmp])                  \n\t"
+    "paddh      $f28, $f28, $f16                          \n\t"
+    "paddh      $f30, $f30, $f18                          \n\t"
+    "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
+    "paddh      $f28, $f28, $f8                           \n\t"
+    "paddh      $f30, $f30, $f10                          \n\t"
+    "pandn      $f8, $f4, $f20                            \n\t"
+    "pandn      $f10, $f6, $f22                           \n\t"
+    "dmtc1      $8, $f20                                  \n\t"
+    "psrah      $f28, $f28, $f20                          \n\t"
+    "psrah      $f30, $f30, $f20                          \n\t"
+    "and        $f16, $f4, $f28                           \n\t"
+    "and        $f18, $f6, $f30                           \n\t"
+    "or         $f16, $f16, $f8                           \n\t"
+    "or         $f18, $f18, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 0x50(%[tmp])                   \n\t"
+    "packushb   $f24, $f24, $f26                          \n\t"
+    "packushb   $f26, $f16, $f18                          \n\t"
+    "gssqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
+    "paddh      $f24, $f8, $f8                            \n\t"
+    "paddh      $f26, $f10, $f10                          \n\t"
+    "dmtc1      %[iAlpha], $f20                           \n\t"
+    "dmtc1      %[iBeta], $f22                            \n\t"
+    "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
+    "paddh      $f24, $f24, $f20                          \n\t"
+    "paddh      $f26, $f26, $f22                          \n\t"
+    "paddh      $f24, $f24, $f12                          \n\t"
+    "paddh      $f26, $f26, $f14                          \n\t"
+    "mov.d      $f16, $f0                                 \n\t"
+    "mov.d      $f18, $f2                                 \n\t"
+    "pandn      $f0, $f0, $f20                            \n\t"
+    "pandn      $f2, $f2, $f22                            \n\t"
+    "dmtc1      $8, $f20                                  \n\t"
+    "paddh      $f24, $f24, $f8                           \n\t"
+    "paddh      $f26, $f26, $f10                          \n\t"
+    "psrah      $f24, $f24, $f20                          \n\t"
+    "psrah      $f26, $f26, $f20                          \n\t"
+    "and        $f16, $f16, $f24                          \n\t"
+    "and        $f18, $f18, $f26                          \n\t"
+    "or         $f16, $f16, $f0                           \n\t"
+    "or         $f18, $f18, $f2                           \n\t"
+    "gslqc1     $f2, $f0, 0x70(%[tmp])                    \n\t"
+    "paddh      $f20, $f0, $f0                            \n\t"
+    "paddh      $f22, $f2, $f2                            \n\t"
+    "gslqc1     $f2, $f0, 0x40(%[tmp])                    \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+    "gslqc1     $f14, $f12, 0x60(%[tmp])                  \n\t"
+    "paddh      $f20, $f20, $f12                          \n\t"
+    "paddh      $f22, $f22, $f14                          \n\t"
+    "paddh      $f20, $f20, $f8                           \n\t"
+    "paddh      $f22, $f22, $f10                          \n\t"
+    "dmtc1      $8, $f8                                   \n\t"
+    "psrah      $f20, $f20, $f8                           \n\t"
+    "psrah      $f22, $f22, $f8                           \n\t"
+    "and        $f12, $f4, $f20                           \n\t"
+    "and        $f14, $f6, $f22                           \n\t"
+    "pandn      $f4, $f4, $f0                             \n\t"
+    "pandn      $f6, $f6, $f2                             \n\t"
+    "or         $f12, $f12, $f4                           \n\t"
+    "or         $f14, $f14, $f6                           \n\t"
+    "packushb   $f16, $f16, $f18                          \n\t"
+    "packushb   $f18, $f12, $f14                          \n\t"
+    "gssqc1     $f18, $f16, 0xa0(%[tmp])                  \n\t"
+    "gslqc1     $f2, $f0, 0x0($11)                        \n\t"
+    "gslqc1     $f6, $f4, 0x10($11)                       \n\t"
+    "gslqc1     $f10, $f8, 0x20($11)                      \n\t"
+    "gslqc1     $f14, $f12, 0x30($11)                     \n\t"
+    "mov.d      $f26, $f2                                 \n\t"
+    "punpckhbh  $f2, $f0, $f4                             \n\t"
+    "punpcklbh  $f0, $f0, $f4                             \n\t"
+    "punpcklbh  $f24, $f26, $f6                           \n\t"
+    "punpckhbh  $f26, $f26, $f6                           \n\t"
+    "mov.d      $f30, $f10                                \n\t"
+    "punpckhbh  $f10, $f8, $f12                           \n\t"
+    "punpcklbh  $f8, $f8, $f12                            \n\t"
+    "punpcklbh  $f28, $f30, $f14                          \n\t"
+    "punpckhbh  $f30, $f30, $f14                          \n\t"
+    "punpcklhw  $f16, $f2, $f10                           \n\t"
+    "punpckhhw  $f18, $f2, $f10                           \n\t"
+    "punpcklhw  $f20, $f26, $f30                          \n\t"
+    "punpckhhw  $f22, $f26, $f30                          \n\t"
+    "punpckhhw  $f2, $f0, $f8                             \n\t"
+    "punpcklhw  $f0, $f0, $f8                             \n\t"
+    "punpckhhw  $f26, $f24, $f28                          \n\t"
+    "punpcklhw  $f24, $f24, $f28                          \n\t"
+    "punpcklwd  $f4, $f2, $f26                            \n\t"
+    "punpckhwd  $f6, $f2, $f26                            \n\t"
+    "punpcklwd  $f8, $f18, $f22                           \n\t"
+    "punpckhwd  $f10, $f18, $f22                          \n\t"
+    "punpckhwd  $f2, $f0, $f24                            \n\t"
+    "punpcklwd  $f0, $f0, $f24                            \n\t"
+    "punpckhwd  $f18, $f16, $f20                          \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f20, $f2                                 \n\t"
+    "mov.d      $f24, $f6                                 \n\t"
+    "mov.d      $f2, $f16                                 \n\t"
+    "mov.d      $f22, $f18                                \n\t"
+    "mov.d      $f6, $f8                                  \n\t"
+    "mov.d      $f26, $f10                                \n\t"
+    "dli        %[iAlpha], 0x20                           \n\t"
+    "dmtc1      %[iAlpha], $f8                            \n\t"
+    "gsswlc1    $f0, 0x3($9)                              \n\t"
+    "gsswrc1    $f0, 0x0($9)                              \n\t"
+    "daddu      $12, $9, %[iStride]                       \n\t"
+    "gsswlc1    $f20, 0x3($12)                            \n\t"
+    "gsswrc1    $f20, 0x0($12)                            \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsswlc1    $f4, 0x3($12)                             \n\t"
+    "gsswrc1    $f4, 0x0($12)                             \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsswlc1    $f24, 0x3($12)                            \n\t"
+    "gsswrc1    $f24, 0x0($12)                            \n\t"
+    "dsrl       $f0, $f0, $f8                             \n\t"
+    "dsrl       $f20, $f20, $f8                           \n\t"
+    "dsrl       $f4, $f4, $f8                             \n\t"
+    "dsrl       $f24, $f24, $f8                           \n\t"
+    "gsswlc1    $f0, 0x3($10)                             \n\t"
+    "gsswrc1    $f0, 0x0($10)                             \n\t"
+    "daddu      $13, $10, %[iStride]                      \n\t"
+    "daddu      $8, $13, %[iStride]                       \n\t"
+    "gsswlc1    $f20, 0x3($13)                            \n\t"
+    "gsswrc1    $f20, 0x0($13)                            \n\t"
+    "daddu      $13, $8, %[iStride]                       \n\t"
+    "gsswlc1    $f4, 0x3($8)                              \n\t"
+    "gsswrc1    $f4, 0x0($8)                              \n\t"
+    "gsswlc1    $f24, 0x3($13)                            \n\t"
+    "gsswrc1    $f24, 0x0($13)                            \n\t"
+    "gsswlc1    $f2, 0x3(%[pPixCb])                       \n\t"
+    "gsswrc1    $f2, 0x0(%[pPixCb])                       \n\t"
+    "daddu      $12, %[pPixCb], %[iStride]                \n\t"
+    "gsswlc1    $f22, 0x3($12)                            \n\t"
+    "gsswrc1    $f22, 0x0($12)                            \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsswlc1    $f6, 0x3($12)                             \n\t"
+    "gsswrc1    $f6, 0x0($12)                             \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsswlc1    $f26, 0x3($12)                            \n\t"
+    "gsswrc1    $f26, 0x0($12)                            \n\t"
+    "dsrl       $f2, $f2, $f8                             \n\t"
+    "dsrl       $f22, $f22, $f8                           \n\t"
+    "dsrl       $f6, $f6, $f8                             \n\t"
+    "dsrl       $f26, $f26, $f8                           \n\t"
+    "gsswlc1    $f2, 0x3(%[pPixCr])                       \n\t"
+    "gsswrc1    $f2, 0x0(%[pPixCr])                       \n\t"
+    "daddu      $13, %[pPixCr], %[iStride]                \n\t"
+    "daddu      $8, $13, %[iStride]                       \n\t"
+    "gsswlc1    $f22, 0x3($13)                            \n\t"
+    "gsswrc1    $f22, 0x0($13)                            \n\t"
+    "daddu      $13, $8, %[iStride]                       \n\t"
+    "gsswlc1    $f6, 0x3($8)                              \n\t"
+    "gsswrc1    $f6, 0x0($8)                              \n\t"
+    "gsswlc1    $f26, 0x3($13)                            \n\t"
+    "gsswrc1    $f26, 0x0($13)                            \n\t"
+    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
+    : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
+      [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
+      "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+      "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+/*!
+ * \brief Chroma deblocking across a vertical edge, bS < 4 case, Loongson-3A
+ *        MMI implementation.  Cb and Cr are filtered together: each Cb row is
+ *        interleaved with the matching Cr row (low word = Cb, high word = Cr),
+ *        the tile is transposed so the vertical-edge filter can run on rows,
+ *        then transposed back and written out.
+ *
+ * \param pPixCb   pointer to the Cb pixel on the edge (code rewinds 2 bytes so
+ *                 p1 p0 | q0 q1 around the edge are covered by each row load)
+ * \param pPixCr   pointer to the Cr pixel on the edge, same layout as Cb
+ * \param iStride  chroma plane stride in bytes
+ * \param iAlpha   H.264 deblocking alpha threshold
+ * \param iBeta    H.264 deblocking beta threshold
+ * \param pTC      four per-edge tc clipping values (int8_t[4])
+ *
+ * NOTE(review): iAlpha and iBeta are clobbered inside the asm (dli/dmfc1 into
+ * them), so they are declared as read-write outputs below; writing to an
+ * input-only operand is undefined behavior in GCC extended asm.
+ */
+void DeblockChromaLt4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+                           int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
+  unsigned char tmp[320] __attribute__((aligned(32)));
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    // Step back 2 pixels so each 8-byte row load covers p1 p0 q0 q1 (+pad).
+    "daddiu     %[pPixCb], %[pPixCb], -0x2                \n\t"
+    "daddiu     %[pPixCr], %[pPixCr], -0x2                \n\t"
+    "daddu      $8, %[pPixCb], %[iStride]                 \n\t"
+    "gsldlc1    $f0, 0x7(%[pPixCb])                       \n\t"
+    "gsldlc1    $f4, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pPixCb])                       \n\t"
+    "gsldrc1    $f4, 0x0($8)                              \n\t"
+    "daddu      $9, $8, %[iStride]                        \n\t"
+    "daddu      $8, $9, %[iStride]                        \n\t"
+    "gsldlc1    $f8, 0x7($9)                              \n\t"
+    "gsldlc1    $f12, 0x7($8)                             \n\t"
+    "gsldrc1    $f8, 0x0($9)                              \n\t"
+    "gsldrc1    $f12, 0x0($8)                             \n\t"
+    "daddu      $9, $8, %[iStride]                        \n\t"
+
+    "daddu      $10, %[pPixCr], %[iStride]                \n\t"
+    "gsldlc1    $f16, 0x7(%[pPixCr])                      \n\t"
+    "gsldlc1    $f20, 0x7($10)                            \n\t"
+    "gsldrc1    $f16, 0x0(%[pPixCr])                      \n\t"
+    "gsldrc1    $f20, 0x0($10)                            \n\t"
+    "daddu      $11, $10, %[iStride]                      \n\t"
+    "daddu      $10, $11, %[iStride]                      \n\t"
+    "gsldlc1    $f24, 0x7($11)                            \n\t"
+    "gsldlc1    $f28, 0x7($10)                            \n\t"
+    "gsldrc1    $f24, 0x0($11)                            \n\t"
+    "gsldrc1    $f28, 0x0($10)                            \n\t"
+    "daddu      $11, $10, %[iStride]                      \n\t"
+
+    // Interleave matching Cb and Cr rows: low word = Cb, high word = Cr.
+    "punpcklwd  $f0, $f0, $f16                            \n\t"
+    "punpcklwd  $f4, $f4, $f20                            \n\t"
+    "punpcklwd  $f8, $f8, $f24                            \n\t"
+    "punpcklwd  $f12, $f12, $f28                          \n\t"
+    "gsldlc1    $f16, 0x7($9)                             \n\t"
+    "gsldlc1    $f20, 0x7($11)                            \n\t"
+    "gsldrc1    $f16, 0x0($9)                             \n\t"
+    "gsldrc1    $f20, 0x0($11)                            \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f2, $f16                                 \n\t"
+    "daddu      $8, $9, %[iStride]                        \n\t"
+    "daddu      $10, $11, %[iStride]                      \n\t"
+    "gsldlc1    $f16, 0x7($8)                             \n\t"
+    "gsldlc1    $f20, 0x7($10)                            \n\t"
+    "gsldrc1    $f16, 0x0($8)                             \n\t"
+    "gsldrc1    $f20, 0x0($10)                            \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f6, $f16                                 \n\t"
+    "daddu      $9, $8, %[iStride]                        \n\t"
+    "daddu      $11, $10, %[iStride]                      \n\t"
+
+    "gsldlc1    $f16, 0x7($9)                             \n\t"
+    "gsldlc1    $f20, 0x7($11)                            \n\t"
+    "gsldrc1    $f16, 0x0($9)                             \n\t"
+    "gsldrc1    $f20, 0x0($11)                            \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f10, $f16                                \n\t"
+    "daddu      $8, $9, %[iStride]                        \n\t"
+    "daddu      $10, $11, %[iStride]                      \n\t"
+
+    "gsldlc1    $f16, 0x7($8)                             \n\t"
+    "gsldlc1    $f20, 0x7($10)                            \n\t"
+    "gsldrc1    $f16, 0x0($8)                             \n\t"
+    "gsldrc1    $f20, 0x0($10)                            \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f14, $f16                                \n\t"
+
+    // Transpose the 8x8 byte tile (byte -> halfword -> word interleaves).
+    "punpcklbh  $f24, $f2, $f6                            \n\t"
+    "punpckhbh  $f26, $f2, $f6                            \n\t"
+    "punpckhbh  $f2, $f0, $f4                             \n\t"
+    "punpcklbh  $f0, $f0, $f4                             \n\t"
+    "punpcklbh  $f28, $f10, $f14                          \n\t"
+    "punpckhbh  $f30, $f10, $f14                          \n\t"
+    "punpckhbh  $f10, $f8, $f12                           \n\t"
+    "punpcklbh  $f8, $f8, $f12                            \n\t"
+
+    "punpcklhw  $f16, $f2, $f10                           \n\t"
+    "punpckhhw  $f18, $f2, $f10                           \n\t"
+    "punpckhhw  $f2, $f0, $f8                             \n\t"
+    "punpcklhw  $f0, $f0, $f8                             \n\t"
+    "punpcklhw  $f20, $f26, $f30                          \n\t"
+    "punpckhhw  $f22, $f26, $f30                          \n\t"
+    "punpckhhw  $f26, $f24, $f28                          \n\t"
+    "punpcklhw  $f24, $f24, $f28                          \n\t"
+
+    "punpcklwd  $f4, $f2, $f26                            \n\t"
+    "punpckhwd  $f6, $f2, $f26                            \n\t"
+    "punpckhwd  $f2, $f0, $f24                            \n\t"
+    "punpcklwd  $f0, $f0, $f24                            \n\t"
+    "punpcklwd  $f8, $f18, $f22                           \n\t"
+    "punpckhwd  $f10, $f18, $f22                          \n\t"
+    "punpckhwd  $f18, $f16, $f20                          \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+
+    "mov.d      $f20, $f2                                 \n\t"
+    "mov.d      $f22, $f18                                \n\t"
+    "mov.d      $f2, $f16                                 \n\t"
+    "mov.d      $f24, $f6                                 \n\t"
+    "mov.d      $f26, $f10                                \n\t"
+    "mov.d      $f6, $f8                                  \n\t"
+    "daddiu     $11, %[tmp], 0x70                         \n\t"
+
+    // Spill the transposed source rows to tmp+0x70..0xAF (presumably the
+    // p1/p0/q0/q1 column groups — TODO confirm against the V filter layout).
+    "gssqc1     $f2, $f0, 0x0($11)                        \n\t"
+    "gssqc1     $f22, $f20, 0x10($11)                     \n\t"
+    "gssqc1     $f6, $f4, 0x20($11)                       \n\t"
+    "gssqc1     $f26, $f24, 0x30($11)                     \n\t"
+
+    // Load the four int8 tc values and splat them across halfword lanes.
+    "lb         $8, 0x3(%[pTC])                           \n\t"
+    "lb         $9, 0x2(%[pTC])                           \n\t"
+    "lb         $10, 0x1(%[pTC])                          \n\t"
+    "lb         $11, 0x0(%[pTC])                          \n\t"
+
+    "and        $12, $8, 0xFFFF                           \n\t"
+    "dmtc1      $12, $f8                                  \n\t"
+
+    "and        $9, $9, 0xFFFF                            \n\t"
+    "dmtc1      $9, $f12                                  \n\t"
+    "mov.d      $f16, $f12                                \n\t"
+
+    "and        $9, $10, 0xFFFF                           \n\t"
+    "dmtc1      $9, $f20                                  \n\t"
+    "xor        $f0, $f0, $f0                             \n\t"
+    "mov.d      $f24, $f20                                \n\t"
+    "and        $9, $11, 0xFFFF                           \n\t"
+    "punpcklhw  $f24, $f24, $f8                           \n\t"
+
+    "mov.d      $f4, $f8                                  \n\t"
+    "dmtc1      $9, $f28                                  \n\t"
+    "mov.d      $f0, $f28                                 \n\t"
+
+    "punpcklhw  $f28, $f28, $f12                          \n\t"
+    "punpcklhw  $f20, $f20, $f4                           \n\t"
+    "xor        $f4, $f4, $f4                             \n\t"
+    "xor        $f6, $f6, $f6                             \n\t"
+    "punpcklhw  $f28, $f28, $f20                          \n\t"
+    "gslqc1     $f22, $f20, 0xA0(%[tmp])                  \n\t"
+    "punpcklhw  $f0, $f0, $f16                            \n\t"
+    "punpcklhw  $f0, $f0, $f24                            \n\t"
+
+    "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
+    "punpckhhw  $f2, $f0, $f28                            \n\t"
+    "punpcklhw  $f0, $f0, $f28                            \n\t"
+    "gslqc1     $f30, $f28, 0x80(%[tmp])                  \n\t"
+    // tmp+0xD0 = -tc (lower clamp bound for the filter delta).
+    "psubh      $f8, $f4, $f0                             \n\t"
+    "psubh      $f10, $f6, $f2                            \n\t"
+    "gssqc1     $f10, $f8, 0xD0(%[tmp])                   \n\t"
+    // Splat iAlpha and iBeta to all halfword lanes (beta kept at tmp+0x50).
+    "dmtc1      %[iAlpha], $f8                            \n\t"
+    "punpcklhw  $f12, $f8, $f8                            \n\t"
+    "punpcklwd  $f16, $f12, $f12                          \n\t"
+    "mov.d      $f18, $f16                                \n\t"
+
+    "dmtc1      %[iBeta], $f8                             \n\t"
+    "punpcklhw  $f12, $f8, $f8                            \n\t"
+    "punpcklwd  $f8, $f12, $f12                           \n\t"
+    "mov.d      $f10, $f8                                 \n\t"
+
+    // Zero-extend the stored pixel rows from bytes to halfwords and spill
+    // the expanded copies back to scratch for the arithmetic below.
+    "gslqc1     $f14, $f12, 0x90(%[tmp])                  \n\t"
+    "gssqc1     $f10, $f8, 0x50(%[tmp])                   \n\t"
+    "punpckhbh  $f10, $f24, $f4                           \n\t"
+    "punpcklbh  $f8, $f24, $f4                            \n\t"
+    "punpcklbh  $f24, $f26, $f6                           \n\t"
+    "punpckhbh  $f26, $f26, $f6                           \n\t"
+
+    "gssqc1     $f10, $f8, 0x40(%[tmp])                   \n\t"
+    "gssqc1     $f26, $f24, 0xB0(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
+    "punpcklbh  $f8, $f28, $f4                            \n\t"
+    "punpckhbh  $f10, $f28, $f4                           \n\t"
+    "punpcklbh  $f28, $f30, $f6                           \n\t"
+    "punpckhbh  $f30, $f30, $f6                           \n\t"
+    "punpcklbh  $f24, $f26, $f6                           \n\t"
+    "punpckhbh  $f26, $f26, $f6                           \n\t"
+    "punpckhbh  $f14, $f12, $f4                           \n\t"
+    "punpcklbh  $f12, $f12, $f4                           \n\t"
+    "punpckhbh  $f22, $f20, $f4                           \n\t"
+    "punpcklbh  $f20, $f20, $f4                           \n\t"
+    "gssqc1     $f30, $f28, 0xF0(%[tmp])                  \n\t"
+    "gssqc1     $f26, $f24, 0xC0(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0xA0(%[tmp])                  \n\t"
+    "punpcklbh  $f24, $f26, $f6                           \n\t"
+    "punpckhbh  $f26, $f26, $f6                           \n\t"
+
+    // Constants: rounding term 4 (kept at tmp+0x30), shift counts 2 and 3.
+    "dli        $13, 0x4                                  \n\t"
+    "gssqc1     $f26, $f24, 0xE0(%[tmp])                  \n\t"
+    "dmtc1      $13, $f24                                 \n\t"
+    "punpcklhw  $f28, $f24, $f24                          \n\t"
+    "punpcklwd  $f24, $f28, $f28                          \n\t"
+    "mov.d      $f26, $f24                                \n\t"
+    "dli        $12, 0x2                                  \n\t"
+    "dli        $13, 0x3                                  \n\t"
+
+    // First half: delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3,
+    // clamped to [-tc, tc] (tc > 0 lanes only, via the pcmpgth mask).
+    "gssqc1     $f2, $f0, 0x20(%[tmp])                    \n\t"
+    "dmfc1      %[iAlpha], $f0                            \n\t"
+    "dmfc1      %[iBeta], $f2                             \n\t"
+    "gssqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
+    "gslqc1     $f30, $f28, 0x40(%[tmp])                  \n\t"
+    "psubh      $f28, $f28, $f20                          \n\t"
+    "psubh      $f30, $f30, $f22                          \n\t"
+    "pcmpgth    $f24, $f0, $f4                            \n\t"
+    "pcmpgth    $f26, $f2, $f6                            \n\t"
+
+    "dmtc1      $12, $f0                                  \n\t"
+    "dmtc1      $13, $f2                                  \n\t"
+    "gssqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
+    "gslqc1     $f6, $f4, 0xD0(%[tmp])                    \n\t"
+    "psubh      $f24, $f12, $f8                           \n\t"
+    "psubh      $f26, $f14, $f10                          \n\t"
+    "psllh      $f24, $f24, $f0                           \n\t"
+    "psllh      $f26, $f26, $f0                           \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 0x30(%[tmp])                  \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "psrah      $f24, $f24, $f2                           \n\t"
+    "psrah      $f26, $f26, $f2                           \n\t"
+    "pmaxsh     $f4, $f4, $f24                            \n\t"
+    "pmaxsh     $f6, $f6, $f26                            \n\t"
+
+    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
+    "pminsh     $f24, $f24, $f4                           \n\t"
+    "pminsh     $f26, $f26, $f6                           \n\t"
+
+    // Filter-enable mask: |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta.
+    "gssqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
+    "psubh      $f4, $f8, $f12                            \n\t"
+    "psubh      $f6, $f10, $f14                           \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+    "pcmpgth    $f24, $f16, $f4                           \n\t"
+    "pcmpgth    $f26, $f18, $f6                           \n\t"
+    "gslqc1     $f6, $f4, 0x40(%[tmp])                    \n\t"
+    "psubh      $f4, $f4, $f8                             \n\t"
+    "psubh      $f6, $f6, $f10                            \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+    "pcmpgth    $f28, $f28, $f4                           \n\t"
+    "pcmpgth    $f30, $f30, $f6                           \n\t"
+
+    "gslqc1     $f6, $f4, 0x50(%[tmp])                    \n\t"
+    "and        $f24, $f24, $f28                          \n\t"
+    "and        $f26, $f26, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
+    "psubh      $f20, $f20, $f12                          \n\t"
+    "psubh      $f22, $f22, $f14                          \n\t"
+    WELS_AbsH($f20, $f22, $f20, $f22, $f0, $f2)
+    "pcmpgth    $f4, $f4, $f20                            \n\t"
+    "pcmpgth    $f6, $f6, $f22                            \n\t"
+
+    // Second half of the lanes: same delta computation and masking.
+    "gslqc1     $f22, $f20, 0xB0(%[tmp])                  \n\t"
+    "gslqc1     $f2, $f0, 0xE0(%[tmp])                    \n\t"
+    "psubh      $f20, $f20, $f0                           \n\t"
+    "psubh      $f22, $f22, $f2                           \n\t"
+    "and        $f24, $f24, $f4                           \n\t"
+    "and        $f26, $f26, $f6                           \n\t"
+    "gslqc1     $f2, $f0, 0x60(%[tmp])                    \n\t"
+    "and        $f24, $f24, $f0                           \n\t"
+    "and        $f26, $f26, $f2                           \n\t"
+
+    "gslqc1     $f6, $f4, 0x20(%[tmp])                    \n\t"
+    "and        $f4, $f4, $f24                            \n\t"
+    "and        $f6, $f6, $f26                            \n\t"
+    "gslqc1     $f26, $f24, 0xC0(%[tmp])                  \n\t"
+    "gssqc1     $f6, $f4, 0x40(%[tmp])                    \n\t"
+    "gslqc1     $f6, $f4, 0xF0(%[tmp])                    \n\t"
+
+    "dmtc1      $12, $f0                                  \n\t"
+    "psubh      $f24, $f24, $f4                           \n\t"
+    "psubh      $f26, $f26, $f6                           \n\t"
+    "psllh      $f24, $f24, $f0                           \n\t"
+    "psllh      $f26, $f26, $f0                           \n\t"
+    "paddh      $f24, $f24, $f20                          \n\t"
+    "paddh      $f26, $f26, $f22                          \n\t"
+    "gslqc1     $f2, $f0, 0x30(%[tmp])                    \n\t"
+    "paddh      $f24, $f24, $f0                           \n\t"
+    "paddh      $f26, $f26, $f2                           \n\t"
+    "dmtc1      %[iBeta], $f2                             \n\t"
+
+    "dmtc1      $13, $f0                                  \n\t"
+    "gslqc1     $f22, $f20, 0xD0(%[tmp])                  \n\t"
+    "psrah      $f24, $f24, $f0                           \n\t"
+    "psrah      $f26, $f26, $f0                           \n\t"
+    "dmtc1      %[iAlpha], $f0                            \n\t"
+    "pmaxsh     $f20, $f20, $f24                          \n\t"
+    "pmaxsh     $f22, $f22, $f26                          \n\t"
+    "pminsh     $f0, $f0, $f20                            \n\t"
+    "pminsh     $f2, $f2, $f22                            \n\t"
+
+    "dmfc1      %[iAlpha], $f0                            \n\t"
+    "dmfc1      %[iBeta], $f2                             \n\t"
+    "gslqc1     $f22, $f20, 0xC0(%[tmp])                  \n\t"
+    "psubh      $f24, $f4, $f20                           \n\t"
+    "psubh      $f26, $f6, $f22                           \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+    "pcmpgth    $f16, $f16, $f24                          \n\t"
+    "pcmpgth    $f18, $f18, $f26                          \n\t"
+
+    "gslqc1     $f26, $f24, 0xB0(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f4                           \n\t"
+    "psubh      $f26, $f26, $f6                           \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+    "pcmpgth    $f28, $f28, $f24                          \n\t"
+    "pcmpgth    $f30, $f30, $f26                          \n\t"
+
+    "gslqc1     $f26, $f24, 0xE0(%[tmp])                  \n\t"
+    "and        $f16, $f16, $f28                          \n\t"
+    "and        $f18, $f18, $f30                          \n\t"
+
+    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f20                          \n\t"
+    "psubh      $f26, $f26, $f22                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+    "pcmpgth    $f28, $f28, $f24                          \n\t"
+    "pcmpgth    $f30, $f30, $f26                          \n\t"
+    "and        $f16, $f16, $f28                          \n\t"
+    "and        $f18, $f18, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 0x60(%[tmp])                  \n\t"
+    "dmtc1      %[iAlpha], $f0                            \n\t"
+    "dmtc1      %[iBeta], $f2                             \n\t"
+    "and        $f16, $f16, $f28                          \n\t"
+    "and        $f18, $f18, $f30                          \n\t"
+    "and        $f0, $f0, $f16                            \n\t"
+    "and        $f2, $f2, $f18                            \n\t"
+
+    // Apply the masked deltas: p0 += d, q0 -= d; saturate-pack to bytes.
+    "gslqc1     $f18, $f16, 0x40(%[tmp])                  \n\t"
+    "paddh      $f8, $f8, $f16                            \n\t"
+    "paddh      $f10, $f10, $f18                          \n\t"
+    "paddh      $f4, $f4, $f0                             \n\t"
+    "paddh      $f6, $f6, $f2                             \n\t"
+    "psubh      $f12, $f12, $f16                          \n\t"
+    "psubh      $f14, $f14, $f18                          \n\t"
+    "psubh      $f20, $f20, $f0                           \n\t"
+    "psubh      $f22, $f22, $f2                           \n\t"
+    "packushb   $f8, $f8, $f10                            \n\t"
+    "packushb   $f10, $f4, $f6                            \n\t"
+    "packushb   $f12, $f12, $f14                          \n\t"
+    "packushb   $f14, $f20, $f22                          \n\t"
+
+    "gssqc1     $f10, $f8, 0x80(%[tmp])                   \n\t"
+    "gssqc1     $f14, $f12, 0x90(%[tmp])                  \n\t"
+    "daddiu     $11, %[tmp], 0x70                         \n\t"
+
+    // Re-transpose the filtered tile back to row order.
+    "gslqc1     $f2, $f0, 0x0($11)                        \n\t"
+    "gslqc1     $f6, $f4, 0x10($11)                       \n\t"
+    "gslqc1     $f10, $f8, 0x20($11)                      \n\t"
+    "gslqc1     $f14, $f12, 0x30($11)                     \n\t"
+
+    "punpcklbh  $f24, $f2, $f6                            \n\t"
+    "punpckhbh  $f26, $f2, $f6                            \n\t"
+    "punpckhbh  $f2, $f0, $f4                             \n\t"
+    "punpcklbh  $f0, $f0, $f4                             \n\t"
+
+    "punpcklbh  $f28, $f10, $f14                          \n\t"
+    "punpckhbh  $f30, $f10, $f14                          \n\t"
+    "punpckhbh  $f10, $f8, $f12                           \n\t"
+    "punpcklbh  $f8, $f8, $f12                            \n\t"
+
+    "punpcklhw  $f16, $f2, $f10                           \n\t"
+    "punpckhhw  $f18, $f2, $f10                           \n\t"
+    "punpckhhw  $f2, $f0, $f8                             \n\t"
+    "punpcklhw  $f0, $f0, $f8                             \n\t"
+    "punpcklhw  $f20, $f26, $f30                          \n\t"
+    "punpckhhw  $f22, $f26, $f30                          \n\t"
+    "punpckhhw  $f26, $f24, $f28                          \n\t"
+    "punpcklhw  $f24, $f24, $f28                          \n\t"
+
+    "punpcklwd  $f4, $f2, $f26                            \n\t"
+    "punpckhwd  $f6, $f2, $f26                            \n\t"
+    "punpckhwd  $f2, $f0, $f24                            \n\t"
+    "punpcklwd  $f0, $f0, $f24                            \n\t"
+    "punpcklwd  $f8, $f18, $f22                           \n\t"
+    "punpckhwd  $f10, $f18, $f22                          \n\t"
+    "punpckhwd  $f18, $f16, $f20                          \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+
+    "mov.d      $f20, $f2                                 \n\t"
+    "mov.d      $f22, $f18                                \n\t"
+    "mov.d      $f2, $f16                                 \n\t"
+    "mov.d      $f24, $f6                                 \n\t"
+    "mov.d      $f26, $f10                                \n\t"
+    "mov.d      $f6, $f8                                  \n\t"
+
+    // Write back 4 bytes per row: low 32 bits go to Cb, then each register
+    // is shifted right by 32 (dsrl 0x20) to store the Cr bytes.
+    "dli        %[iAlpha], 0x20                           \n\t"
+    "daddu      $8, %[pPixCb], %[iStride]                 \n\t"
+    "gsswlc1    $f0, 0x3(%[pPixCb])                       \n\t"
+    "gsswlc1    $f20, 0x3($8)                             \n\t"
+    "gsswrc1    $f0, 0x0(%[pPixCb])                       \n\t"
+    "gsswrc1    $f20, 0x0($8)                             \n\t"
+    "daddu      $9, $8, %[iStride]                        \n\t"
+    "daddu      $8, $9, %[iStride]                        \n\t"
+    "gsswlc1    $f4, 0x3($9)                              \n\t"
+    "gsswlc1    $f24, 0x3($8)                             \n\t"
+    "gsswrc1    $f4, 0x0($9)                              \n\t"
+    "gsswrc1    $f24, 0x0($8)                             \n\t"
+    "daddu      $9, $8, %[iStride]                        \n\t"
+    "dmtc1      %[iAlpha], $f8                            \n\t"
+
+    "dsrl       $f0, $f0, $f8                             \n\t"
+    "dsrl       $f20, $f20, $f8                           \n\t"
+    "dsrl       $f4, $f4, $f8                             \n\t"
+    "dsrl       $f24, $f24, $f8                           \n\t"
+    "daddu      $10, %[pPixCr], %[iStride]                \n\t"
+    "gsswlc1    $f0, 0x3(%[pPixCr])                       \n\t"
+    "gsswlc1    $f20, 0x3($10)                            \n\t"
+    "gsswrc1    $f0, 0x0(%[pPixCr])                       \n\t"
+    "gsswrc1    $f20, 0x0($10)                            \n\t"
+    "daddu      $11, $10, %[iStride]                      \n\t"
+    "daddu      $10, $11, %[iStride]                      \n\t"
+    "gsswlc1    $f4, 0x3($11)                             \n\t"
+    "gsswlc1    $f24, 0x3($10)                            \n\t"
+    "gsswrc1    $f4, 0x0($11)                             \n\t"
+    "gsswrc1    $f24, 0x0($10)                            \n\t"
+    "daddu      $11, $10, %[iStride]                      \n\t"
+
+    "daddu      $8, $9, %[iStride]                        \n\t"
+    "gsswlc1    $f2, 0x3($9)                              \n\t"
+    "gsswlc1    $f22, 0x3($8)                             \n\t"
+    "gsswrc1    $f2, 0x0($9)                              \n\t"
+    "gsswrc1    $f22, 0x0($8)                             \n\t"
+    "daddu      $9, $8, %[iStride]                        \n\t"
+    "daddu      $8, $9, %[iStride]                        \n\t"
+    "gsswlc1    $f6, 0x3($9)                              \n\t"
+    "gsswlc1    $f26, 0x3($8)                             \n\t"
+    "gsswrc1    $f6, 0x0($9)                              \n\t"
+    "gsswrc1    $f26, 0x0($8)                             \n\t"
+
+    "dsrl       $f2, $f2, $f8                             \n\t"
+    "dsrl       $f22, $f22, $f8                           \n\t"
+    "dsrl       $f6, $f6, $f8                             \n\t"
+    "dsrl       $f26, $f26, $f8                           \n\t"
+    "daddu      $10, $11, %[iStride]                      \n\t"
+    "gsswlc1    $f2, 0x3($11)                             \n\t"
+    "gsswlc1    $f22, 0x3($10)                            \n\t"
+    "gsswrc1    $f2, 0x0($11)                             \n\t"
+    "gsswrc1    $f22, 0x0($10)                            \n\t"
+    "daddu      $11, $10, %[iStride]                      \n\t"
+    "daddu      $10, $11, %[iStride]                      \n\t"
+    "gsswlc1    $f6, 0x3($11)                             \n\t"
+    "gsswlc1    $f26, 0x3($10)                            \n\t"
+    "gsswrc1    $f6, 0x0($11)                             \n\t"
+    "gsswrc1    $f26, 0x0($10)                            \n\t"
+    // FIX: iAlpha/iBeta are written by the asm above (dli %[iAlpha] / dmfc1),
+    // so they belong in the output ("+&r") list — clobbering an input-only
+    // operand is undefined behavior in GCC extended asm.
+    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr),
+      [iAlpha]"+&r"(iAlpha), [iBeta]"+&r"(iBeta)
+    : [iStride]"r"((int)iStride), [tmp]"r"((unsigned char *)tmp),
+      [pTC]"r"((char *)pTC)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
+      "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
+      "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+/*!
+ * \brief Saturate the 24-byte non-zero-count array so every entry is at most
+ *        1 (callers only need a zero/non-zero flag per block).
+ *
+ * Builds a vector of 0x01 bytes — pcmpeqh yields all-ones, psrlh by 15 turns
+ * each halfword into 0x0001, packushb packs that to 0x01 per byte — and
+ * pminub-clamps the three 8-byte groups at offsets 0x0, 0x8 and 0x10 in place.
+ *
+ * \param pNonZeroCount  in/out array of at least 24 (0x18) count bytes
+ */
+void WelsNonZeroCount_mmi(int8_t *pNonZeroCount) {
+  __asm__ volatile(
+    ".set       arch=loongson3a                 \n\t"
+    // Unaligned loads of the three 8-byte groups (left/right halves).
+    "gsldlc1    $f0, 0x7(%[pNonZeroCount])      \n\t"
+    "gsldlc1    $f2, 0xF(%[pNonZeroCount])      \n\t"
+    "gsldlc1    $f4, 0x17(%[pNonZeroCount])     \n\t"
+    "gsldrc1    $f4, 0x10(%[pNonZeroCount])     \n\t"
+    "gsldrc1    $f0, 0x0(%[pNonZeroCount])      \n\t"
+    "gsldrc1    $f2, 0x8(%[pNonZeroCount])      \n\t"
+    // f8 = 0x0101...01 (all-ones >> 15 per halfword, packed to bytes).
+    "pcmpeqh    $f8, $f8, $f8                   \n\t"
+    "dli        $8, 0xF                         \n\t"
+    "dmtc1      $8, $f6                         \n\t"
+    "psrlh      $f8, $f8, $f6                   \n\t"
+    "packushb   $f8, $f8, $f8                   \n\t"
+
+    // Clamp each unsigned count byte to min(count, 1) and store back.
+    "pminub     $f0, $f0, $f8                   \n\t"
+    "pminub     $f2, $f2, $f8                   \n\t"
+    "pminub     $f4, $f4, $f8                   \n\t"
+    "gssdlc1    $f0, 0x7(%[pNonZeroCount])      \n\t"
+    "gssdlc1    $f2, 0xF(%[pNonZeroCount])      \n\t"
+    "gssdlc1    $f4, 0x17(%[pNonZeroCount])     \n\t"
+    "gssdrc1    $f0, 0x0(%[pNonZeroCount])      \n\t"
+    "gssdrc1    $f2, 0x8(%[pNonZeroCount])      \n\t"
+    "gssdrc1    $f4, 0x10(%[pNonZeroCount])     \n\t"
+    :
+    : [pNonZeroCount] "r"((unsigned char *)pNonZeroCount)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
+  );
+}
--- a/codec/common/src/cpu.cpp
+++ b/codec/common/src/cpu.cpp
@@ -307,7 +307,17 @@
          WELS_CPU_NEON;
 }
 
-#else /* Neither X86_ASM, HAVE_NEON nor HAVE_NEON_AARCH64 */
+#elif defined(mips)
+/* for loongson */
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) { // MIPS/Loongson variant: feature set is fixed at build time; pNumberOfLogicProcessors is never written.
+#if defined(HAVE_MMI)
+  return WELS_CPU_MMI; // Built with MMI support -> advertise it; no runtime CPU probe on this path.
+#else
+  return 0; // No optional features available.
+#endif
+}
+
+#else /* Neither X86_ASM, HAVE_NEON, HAVE_NEON_AARCH64 nor mips */
 
 uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
   return 0;
--- a/codec/common/src/deblocking_common.cpp
+++ b/codec/common/src/deblocking_common.cpp
@@ -274,3 +274,22 @@
 
 #endif
 
+#ifdef HAVE_MMI
+extern "C" {
+  void DeblockLumaLt4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) { // Horizontal (bS < 4) luma deblock built on the vertical MMI kernel via transpose.
+    ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16); // 16-byte-aligned 16x8 scratch block on the stack.
+
+    DeblockLumaTransposeH2V_mmi (pPixY - 4, iStride, &uiBuf[0]); // Gather the 8 columns straddling the edge (4 per side) into rows.
+    DeblockLumaLt4V_mmi (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc); // Filter as a vertical edge with stride 16 inside the buffer.
+    DeblockLumaTransposeV2H_mmi (pPixY - 4, iStride, &uiBuf[0]); // Transpose the filtered rows back into the picture.
+  }
+
+  void DeblockLumaEq4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) { // Horizontal (bS == 4) luma deblock built on the vertical MMI kernel via transpose.
+    ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16); // 16-byte-aligned 16x8 scratch block on the stack.
+
+    DeblockLumaTransposeH2V_mmi (pPixY - 4, iStride, &uiBuf[0]); // Gather the 8 columns straddling the edge (4 per side) into rows.
+    DeblockLumaEq4V_mmi (&uiBuf[4 * 16], 16, iAlpha, iBeta); // Filter as a vertical edge with stride 16 inside the buffer.
+    DeblockLumaTransposeV2H_mmi (pPixY - 4, iStride, &uiBuf[0]); // Transpose the filtered rows back into the picture.
+  }
+}
+#endif//HAVE_MMI
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -63,6 +63,15 @@
 endif
 OBJS += $(COMMON_OBJSARM64)
 
+COMMON_ASM_MIPS64_SRCS=\
+	$(COMMON_SRCDIR)/mips64/deblock_mmi.c\
+
+COMMON_OBJSMIPS64 += $(COMMON_ASM_MIPS64_SRCS:.c=.$(OBJ))
+ifeq ($(ASM_ARCH), mips64)
+COMMON_OBJS += $(COMMON_OBJSMIPS64)
+endif
+OBJS += $(COMMON_OBJSMIPS64)
+
 OBJS += $(COMMON_OBJS)
 
 $(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.cpp
@@ -73,6 +82,9 @@
 
 $(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.S
 	$(QUIET_CCAS)$(CCAS) $(CCASFLAGS) $(ASMFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c -o $@ $<
+
+$(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(ASMFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c -o $@ $<
 
 $(LIBPREFIX)common.$(LIBSUFFIX): $(COMMON_OBJS)
 	$(QUIET)rm -f $@
--- a/codec/decoder/core/src/deblocking.cpp
+++ b/codec/decoder/core/src/deblocking.cpp
@@ -1378,6 +1378,19 @@
     pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_AArch64_neon;
   }
 #endif
+
+#if defined(HAVE_MMI)
+  if (iCpu & WELS_CPU_MMI) {
+    pFunc->pfLumaDeblockingLT4Ver   = DeblockLumaLt4V_mmi;
+    pFunc->pfLumaDeblockingEQ4Ver   = DeblockLumaEq4V_mmi;
+    pFunc->pfLumaDeblockingLT4Hor   = DeblockLumaLt4H_mmi;
+    pFunc->pfLumaDeblockingEQ4Hor   = DeblockLumaEq4H_mmi;
+    pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_mmi;
+    pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_mmi;
+    pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_mmi;
+    pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_mmi;
+  }
+#endif//HAVE_MMI
 }
 
 } // namespace WelsDec
--- a/test/decoder/DecUT_Deblock.cpp
+++ b/test/decoder/DecUT_Deblock.cpp
@@ -146,3 +146,20 @@
 GENERATE_CHROMA_UT (ChromaEq4H_AArch64_neon, DeblockChromaEq4H_AArch64_neon_wrap, DeblockChromaEq4H_c_wrap,
                     WELS_CPU_NEON, 1)
 #endif
+
+#if defined(HAVE_MMI)
+WRAP_LUMA_FUNC (DeblockLumaEq4V_mmi)
+WRAP_LUMA_FUNC (DeblockLumaEq4H_mmi)
+WRAP_CHROMA_FUNC (DeblockChromaEq4V_mmi)
+WRAP_CHROMA_FUNC (DeblockChromaEq4H_mmi)
+
+GENERATE_LUMA_UT (LumaLt4V_mmi, DeblockLumaLt4V_mmi, DeblockLumaLt4V_c, WELS_CPU_MMI, 0)
+GENERATE_LUMA_UT (LumaLt4H_mmi, DeblockLumaLt4H_mmi, DeblockLumaLt4H_c, WELS_CPU_MMI, 1)
+GENERATE_LUMA_UT (LumaEq4V_mmi, DeblockLumaEq4V_mmi_wrap, DeblockLumaEq4V_c_wrap, WELS_CPU_MMI, 0)
+GENERATE_LUMA_UT (LumaEq4H_mmi, DeblockLumaEq4H_mmi_wrap, DeblockLumaEq4H_c_wrap, WELS_CPU_MMI, 1)
+
+GENERATE_CHROMA_UT (ChromaLt4V_mmi, DeblockChromaLt4V_mmi, DeblockChromaLt4V_c, WELS_CPU_MMI, 0)
+GENERATE_CHROMA_UT (ChromaLt4H_mmi, DeblockChromaLt4H_mmi, DeblockChromaLt4H_c, WELS_CPU_MMI, 1)
+GENERATE_CHROMA_UT (ChromaEq4V_mmi, DeblockChromaEq4V_mmi_wrap, DeblockChromaEq4V_c_wrap, WELS_CPU_MMI, 0)
+GENERATE_CHROMA_UT (ChromaEq4H_mmi, DeblockChromaEq4H_mmi_wrap, DeblockChromaEq4H_c_wrap, WELS_CPU_MMI, 1)
+#endif//HAVE_MMI
--- a/test/decoder/DecUT_DeblockCommon.cpp
+++ b/test/decoder/DecUT_DeblockCommon.cpp
@@ -540,6 +540,17 @@
   DeblockingInit (&sDBFunc, 0x000004);
   DB_FUNC_CPUFLAG (AArch64_neon)
 #endif
+
+#ifdef HAVE_MMI
+  // pure C
+  DeblockingInit (&sDBFunc, 0x00000000);
+  DB_FUNC_CPUFLAG (c)
+
+  // mmi
+  DeblockingInit (&sDBFunc, 0x00000001);
+  DB_FUNC_CPUFLAG (mmi)
+#endif
+
 }
 
 TEST (DecoderDeblocking, WelsDeblockingFilterSlice) {