ref: 906dacd34972e42819dab320960ffcfb7b84aada
parent: 9e2abda78f0fc0e6a4c2a7a3e2e3067404acdade
author: gxw <guxiwei-hf@loongson.cn>
date: Fri Aug 10 13:46:41 EDT 2018
Add optimization files in codec/encoder/core/mips Add dct_mmi.c, quant_mmi.c and score_mmi.c in codec/encoder/core/mips Change-Id: I5955558968d68c1ff62ec5788c6a1a5f5c9bb28f
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -64,7 +64,11 @@
OBJS += $(COMMON_OBJSARM64)
COMMON_ASM_MIPS_SRCS=\
+ $(COMMON_SRCDIR)/mips/copy_mb_mmi.c\
$(COMMON_SRCDIR)/mips/deblock_mmi.c\
+ $(COMMON_SRCDIR)/mips/expand_picture_mmi.c\
+ $(COMMON_SRCDIR)/mips/intra_pred_com_mmi.c\
+ $(COMMON_SRCDIR)/mips/satd_sad_mmi.c\
COMMON_OBJSMIPS += $(COMMON_ASM_MIPS_SRCS:.c=.$(OBJ))
ifeq ($(ASM_ARCH), mips)
--- a/codec/encoder/core/inc/decode_mb_aux.h
+++ b/codec/encoder/core/inc/decode_mb_aux.h
@@ -95,6 +95,11 @@
int16_t* pDctDc);
#endif
+#if defined(HAVE_MMI) /* prototypes of the Loongson MMI (_mmi) inverse-transform/reconstruction routines */
+void WelsIDctT4Rec_mmi (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct); /* pRec/pPrediction advance by iStride/iPredStride bytes per row; pDct supplies the coefficients */
+void WelsIDctFourT4Rec_mmi (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct); /* same parameter layout; presumably four 4x4 blocks per call (name-based, confirm) */
+void WelsIDctRecI16x16Dc_mmi (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDctDc); /* DC-only variant: pDctDc holds DC terms rather than full coefficient blocks (name-based, confirm) */
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -147,6 +147,33 @@
void WelsQuantFour4x4_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
void WelsQuantFour4x4Max_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
#endif
+
+#ifdef HAVE_MMI /* Loongson MMI (_mmi) variants of the encoder transform/quant helpers */
+int32_t WelsGetNoneZeroCount_mmi (int16_t* pLevel);
+
+/****************************************************************************
+ * * Scan and Score functions (Loongson MMI variants)
+ * ****************************************************************************/
+void WelsScan4x4Ac_mmi (int16_t* zig_value, int16_t* pDct);
+void WelsScan4x4DcAc_mmi (int16_t* pLevel, int16_t* pDct);
+int32_t WelsCalculateSingleCtr4x4_mmi (int16_t* pDct);
+
+/****************************************************************************
+ * * Forward DCT functions (Loongson MMI variants)
+ * ****************************************************************************/
+void WelsDctT4_mmi (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2); /* pDct is the output; the two pixel pointers are read with their own row strides */
+void WelsDctFourT4_mmi (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2); /* same layout; presumably four 4x4 blocks per call (name-based, confirm) */
+
+/****************************************************************************
+ * * Hadamard (HDM) and Quant functions (Loongson MMI variants)
+ * ****************************************************************************/
+void WelsHadamardT4Dc_mmi (int16_t* pLumaDc, int16_t* pDct); /* pLumaDc is the output, pDct the input */
+
+void WelsQuant4x4_mmi (int16_t* pDct, const int16_t* pFF, const int16_t* pMF); /* pDct is quantised in place using the pFF/pMF tables */
+void WelsQuant4x4Dc_mmi (int16_t* pDct, int16_t iFF, int16_t iMF); /* scalar iFF/iMF are applied to every coefficient */
+void WelsQuantFour4x4_mmi (int16_t* pDct, const int16_t* pFF, const int16_t* pMF); /* in-place, larger coefficient buffer than WelsQuant4x4_mmi */
+void WelsQuantFour4x4Max_mmi (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax); /* presumably also reports maxima through pMax (name-based, confirm) */
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- /dev/null
+++ b/codec/encoder/core/mips/dct_mmi.c
@@ -1,0 +1,529 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file dct_mmi.c
+ *
+ * \brief Loongson optimization
+ *
+ * \date 20/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+#define MMI_Load4x8p(r0, f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) /* load 0x40 bytes at r0 into four register pairs, then interleave them */ \
+ "gslqc1 "#f2", "#f0", 0x0("#r0") \n\t" /* bytes 0x00-0x0f -> f0/f2 */ \
+ "gslqc1 "#f10", "#f8", 0x10("#r0") \n\t" /* bytes 0x10-0x1f -> f8/f10 */ \
+ "gslqc1 "#f18", "#f16", 0x20("#r0") \n\t" /* bytes 0x20-0x2f -> f16/f18 */ \
+ "gslqc1 "#f6", "#f4", 0x30("#r0") \n\t" /* bytes 0x30-0x3f -> f4/f6 */ \
+ MMI_XSawp_DQ(f8, f10, f4, f6, f12, f14) /* MMI_XSawp_DQ is not defined in this file (presumably asmdefs_mmi.h); f12/f14 are extra registers it uses */ \
+ MMI_XSawp_DQ(f0, f2, f16, f18, f4, f6) /* second interleave reuses f4/f6 as the extra pair */
+
+#define MMI_SumSubDiv2(f0, f2, f4, f6, f8, f10, f12, f14, f16) /* halfword lanes, a = f0/f2, b = f4/f6, s = f16: f0/f2 = a + (b >> s), f12/f14 = (a >> s) - b */ \
+ "mov.d "#f8", "#f4" \n\t" /* keep unshifted b in f8/f10 */ \
+ "mov.d "#f10", "#f6" \n\t" \
+ "psrah "#f4", "#f4", "#f16" \n\t" /* f4/f6 = b >> s (arithmetic) */ \
+ "psrah "#f6", "#f6", "#f16" \n\t" \
+ "psrah "#f12", "#f0", "#f16" \n\t" /* f12/f14 = a >> s */ \
+ "psrah "#f14", "#f2", "#f16" \n\t" \
+ "paddh "#f0", "#f0", "#f4" \n\t" /* f0/f2 = a + (b >> s) */ \
+ "paddh "#f2", "#f2", "#f6" \n\t" \
+ "psubh "#f12", "#f12", "#f8" \n\t" /* f12/f14 = (a >> s) - b */ \
+ "psubh "#f14", "#f14", "#f10" \n\t"
+
+#define MMI_IDCT(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26, f28) /* one inverse-transform pass on register pairs (per the name); f28 is the shift count handed to MMI_SumSubDiv2 */ \
+ MMI_SumSub(f24, f26, f4, f6, f20, f22) /* MMI_SumSub is not defined in this file (presumably asmdefs_mmi.h) */ \
+ MMI_SumSubDiv2(f0, f2, f8, f10, f16, f18, f12, f14, f28) /* f16/f18 are its copy registers, f12/f14 receive its second result */ \
+ MMI_SumSub(f4, f6, f0, f2, f16, f18) \
+ MMI_SumSub(f24, f26, f12, f14, f16, f18)
+
+#define MMI_StoreDiff8p_6(f0, f2, f4, f6, f8, f12, r0, r1, f14) /* 8 pixels: r0[0..7] = clip_u8(r1[0..7] + ((f0/f2 + f8) >> f14)); f12 must be zero, f4/f6 are scratch */ \
+ "paddh "#f0", "#f0", "#f8" \n\t" /* add the bias in f8 to the residual halfwords */ \
+ "paddh "#f2", "#f2", "#f8" \n\t" \
+ "psrah "#f0", "#f0", "#f14" \n\t" /* arithmetic shift right by the count in f14 */ \
+ "psrah "#f2", "#f2", "#f14" \n\t" \
+ "gsldlc1 "#f4", 0x7("#r1") \n\t" /* unaligned load of 8 bytes from r1 */ \
+ "gsldrc1 "#f4", 0x0("#r1") \n\t" \
+ "punpckhbh "#f6", "#f4", "#f12" \n\t" /* bytes 4..7 widened to halfwords (interleaved with f12) */ \
+ "punpcklbh "#f4", "#f4", "#f12" \n\t" /* bytes 0..3 widened to halfwords */ \
+ "paddsh "#f4", "#f4", "#f0" \n\t" /* signed-saturating add of the residual */ \
+ "paddsh "#f6", "#f6", "#f2" \n\t" \
+ "packushb "#f4", "#f4", "#f6" \n\t" /* pack back to 8 bytes with unsigned saturation */ \
+ "gssdlc1 "#f4", 0x7("#r0") \n\t" /* unaligned store of 8 bytes to r0 */ \
+ "gssdrc1 "#f4", 0x0("#r0") \n\t"
+
+#define MMI_StoreDiff8p_5(f0, f2, f4, f6, f8, r0, r1, offset) /* 8 pixels at byte offset: r0[] = clip_u8(r1[] + f0/f2), no bias or shift; f8 must be zero, f4/f6 are scratch */ \
+ "gsldlc1 "#f4", "#offset"+0x7("#r1") \n\t" /* unaligned load of 8 bytes from r1 + offset */ \
+ "gsldrc1 "#f4", "#offset"+0x0("#r1") \n\t" \
+ "punpckhbh "#f6", "#f4", "#f8" \n\t" /* bytes 4..7 widened to halfwords */ \
+ "punpcklbh "#f4", "#f4", "#f8" \n\t" /* bytes 0..3 widened to halfwords */ \
+ "paddsh "#f4", "#f4", "#f0" \n\t" /* f0 is added to pixels 0..3, f2 to pixels 4..7 */ \
+ "paddsh "#f6", "#f6", "#f2" \n\t" \
+ "packushb "#f4", "#f4", "#f6" \n\t" /* pack to bytes with unsigned saturation */ \
+ "gssdlc1 "#f4", "#offset"+0x7("#r0") \n\t" /* unaligned store of 8 bytes to r0 + offset */ \
+ "gssdrc1 "#f4", "#offset"+0x0("#r0") \n\t"
+
+#define MMI_Load8DC(f0, f2, f4, f6, f8, f10, f12, f14, f16, r0, offset, f20) /* load 8 halfwords at r0+offset, compute (v + f16) >> f20, and replicate value k across the four lanes of the k-th register of f0,f2,...,f14 */ \
+ "gslqc1 "#f2", "#f0", "#offset"+0x0("#r0") \n\t" /* values 0..3 -> f0, values 4..7 -> f2 */ \
+ "paddh "#f0", "#f0", "#f16" \n\t" /* add the bias held in f16 */ \
+ "paddh "#f2", "#f2", "#f16" \n\t" \
+ "psrah "#f0", "#f0", "#f20" \n\t" /* arithmetic shift by the count in f20 */ \
+ "psrah "#f2", "#f2", "#f20" \n\t" \
+ "punpckhhw "#f4", "#f0", "#f0" \n\t" /* f4 = v2,v2,v3,v3 */ \
+ "punpckhwd "#f6", "#f4", "#f4" \n\t" /* f6 = v3 in all lanes */ \
+ "punpcklwd "#f4", "#f4", "#f4" \n\t" /* f4 = v2 in all lanes */ \
+ "punpcklhw "#f8", "#f2", "#f2" \n\t" /* f8 = v4,v4,v5,v5 */ \
+ "punpckhwd "#f10", "#f8", "#f8" \n\t" /* f10 = v5 in all lanes */ \
+ "punpcklwd "#f8", "#f8", "#f8" \n\t" /* f8 = v4 in all lanes */ \
+ "punpckhhw "#f12", "#f2", "#f2" \n\t" /* f12 = v6,v6,v7,v7 */ \
+ "punpckhwd "#f14", "#f12", "#f12" \n\t" /* f14 = v7 in all lanes */ \
+ "punpcklwd "#f12", "#f12", "#f12" \n\t" /* f12 = v6 in all lanes */ \
+ "punpcklhw "#f0", "#f0", "#f0" \n\t" /* f0/f2 are expanded last because they still hold the source values */ \
+ "punpckhwd "#f2", "#f0", "#f0" \n\t" /* f2 = v1 in all lanes */ \
+ "punpcklwd "#f0", "#f0", "#f0" \n\t" /* f0 = v0 in all lanes */
+
+#define MMI_StoreDiff4x8p(f0, f2, f4, f6, f8, f10, f12, r0, r1, r2, r3) /* two rows of 16 pixels: r0 = clip_u8(r1 + residual); f0/f2 cover pixels 0..7, f4/f6 pixels 8..15; f8/f10 scratch, f12 zero */ \
+ MMI_StoreDiff8p_5(f0, f2, f8, f10, f12, r0, r1, 0x0) /* first row, pixels 0..7 */ \
+ MMI_StoreDiff8p_5(f4, f6, f8, f10, f12, r0, r1, 0x8) /* first row, pixels 8..15 */ \
+ PTR_ADDU ""#r0", "#r0", "#r2" \n\t" /* advance r0 by r2 and r1 by r3 (PTR_ADDU is defined outside this file) */ \
+ PTR_ADDU ""#r1", "#r1", "#r3" \n\t" \
+ MMI_StoreDiff8p_5(f0, f2, f8, f10, f12, r0, r1, 0x0) /* second row reuses the same residual registers */ \
+ MMI_StoreDiff8p_5(f4, f6, f8, f10, f12, r0, r1, 0x8) /* r0/r1 are left pointing at the second row */
+
+#define MMI_Load4Col(f0, f2, f4, f6, f8, r0, offset) /* gather four int16 values at byte offsets +0x00, +0x20, +0x80, +0xa0 from r0+offset into 32-bit lanes: f0 = {v0, v1}, f2 = {v2, v3}; uses $8 and f4/f6/f8 as scratch */ \
+ "lh $8, "#offset"("#r0") \n\t" /* lh sign-extends the halfword */ \
+ "dmtc1 $8, "#f0" \n\t" \
+ "lh $8, "#offset"+0x20("#r0") \n\t" /* 0x20 bytes = 16 int16 further on */ \
+ "dmtc1 $8, "#f4" \n\t" \
+ "punpcklwd "#f0", "#f0", "#f4" \n\t" /* f0 = low words of f0 and f4 */ \
+ "lh $8, "#offset"+0x80("#r0") \n\t" \
+ "dmtc1 $8, "#f6" \n\t" \
+ "lh $8, "#offset"+0xa0("#r0") \n\t" \
+ "dmtc1 $8, "#f8" \n\t" \
+ "punpcklwd "#f2", "#f6", "#f8" \n\t" /* f2 = low words of f6 and f8 */
+
+#define MMI_SumSubD(f0, f2, f4, f6, f8, f10) /* 32-bit lanes, a = f0/f2, b = f4/f6: f4/f6 = a + b, f0/f2 = a - b; f8/f10 are scratch */ \
+ "mov.d "#f8", "#f4" \n\t" /* save b */ \
+ "mov.d "#f10", "#f6" \n\t" \
+ "paddw "#f4", "#f4", "#f0" \n\t" /* f4/f6 = b + a */ \
+ "paddw "#f6", "#f6", "#f2" \n\t" \
+ "psubw "#f0", "#f0", "#f8" \n\t" /* f0/f2 = a - b */ \
+ "psubw "#f2", "#f2", "#f10" \n\t"
+
+#define WELS_DD1(f0, f2, f_val_31) /* fill f0 and f2 with all-ones, then logical-shift each 32-bit lane right by f_val_31; with a count of 31 every lane becomes 1 */ \
+ "pcmpeqh "#f0", "#f0", "#f0" \n\t" /* a register compared with itself yields all-ones */ \
+ "pcmpeqh "#f2", "#f2", "#f2" \n\t" \
+ "psrlw "#f0", "#f0", "#f_val_31" \n\t" /* logical shift keeps only the top bits */ \
+ "psrlw "#f2", "#f2", "#f_val_31" \n\t"
+
+#define MMI_SumSubDiv2D(f0, f2, f4, f6, f8, f10, f12, f14, f_val_1) /* 32-bit lanes, a = f0/f2, b = f4/f6, c = f8/f10: f0/f2 = (a + b + c) >> f_val_1, f12/f14 = f0/f2 - b */ \
+ "paddw "#f0", "#f0", "#f4" \n\t" /* a + b */ \
+ "paddw "#f2", "#f2", "#f6" \n\t" \
+ "paddw "#f0", "#f0", "#f8" \n\t" /* + c (the caller in this file passes lanes of 1 as a rounding term) */ \
+ "paddw "#f2", "#f2", "#f10" \n\t" \
+ "psraw "#f0", "#f0", "#f_val_1" \n\t" /* arithmetic shift by the count in f_val_1 */ \
+ "psraw "#f2", "#f2", "#f_val_1" \n\t" \
+ "mov.d "#f12", "#f0" \n\t" /* second result starts from the shifted sum */ \
+ "mov.d "#f14", "#f2" \n\t" \
+ "psubw "#f12", "#f12", "#f4" \n\t" /* f12/f14 = shifted sum - b */ \
+ "psubw "#f14", "#f14", "#f6" \n\t"
+
+#define MMI_Trans4x4W(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) /* presumably a 4x4 transpose of 32-bit words (per the name); built from MMI_XSawp_WD / MMI_XSawp_DQ, which are defined outside this file */ \
+ MMI_XSawp_WD(f0, f2, f4, f6, f16, f18) /* word-level interleave step */ \
+ MMI_XSawp_WD(f8, f10, f12, f14, f4, f6) \
+ MMI_XSawp_DQ(f0, f2, f8, f10, f12, f14) /* doubleword-level interleave step */ \
+ MMI_XSawp_DQ(f16, f18, f4, f6, f8, f10) /* NOTE(review): result register placement follows the callers' argument order; verify against the XSawp definitions */
+
+#define MMI_SumSubMul2(f0, f2, f4, f6, f8, f10) /* halfword lanes, a = f0/f2, b = f4/f6: f0/f2 = 2a + b, f8/f10 = a - 2b */ \
+ "mov.d "#f8", "#f0" \n\t" /* copy a */ \
+ "mov.d "#f10", "#f2" \n\t" \
+ "paddh "#f0", "#f0", "#f0" \n\t" /* 2a */ \
+ "paddh "#f2", "#f2", "#f2" \n\t" \
+ "paddh "#f0", "#f0", "#f4" \n\t" /* 2a + b */ \
+ "paddh "#f2", "#f2", "#f6" \n\t" \
+ "psubh "#f8", "#f8", "#f4" \n\t" /* a - b */ \
+ "psubh "#f10", "#f10", "#f6" \n\t" \
+ "psubh "#f8", "#f8", "#f4" \n\t" /* a - 2b (b subtracted a second time) */ \
+ "psubh "#f10", "#f10", "#f6" \n\t"
+
+#define MMI_DCT(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22) /* one forward-transform pass on register pairs (per the name): three MMI_SumSub steps followed by MMI_SumSubMul2 */ \
+ MMI_SumSub(f20, f22, f8, f10, f16, f18) /* MMI_SumSub is defined outside this file; f16/f18 are passed as its last (presumably scratch) pair */ \
+ MMI_SumSub(f0, f2, f4, f6, f16, f18) \
+ MMI_SumSub(f8, f10, f4, f6, f16, f18) \
+ MMI_SumSubMul2(f20, f22, f0, f2, f12, f14) /* f20/f22 = 2*f20/f22 + f0/f2, f12/f14 = f20/f22 - 2*f0/f2 */
+
+#define MMI_Store4x8p(r0, f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) /* interleave four register pairs and store 0x40 bytes at r0 */ \
+ MMI_XSawp_DQ(f0, f2, f4, f6, f16, f18) /* MMI_XSawp_DQ is defined outside this file (presumably asmdefs_mmi.h) */ \
+ MMI_XSawp_DQ(f8, f10, f12, f14, f4, f6) \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" /* f0/f2 -> bytes 0x00-0x0f */ \
+ "gssqc1 "#f10", "#f8", 0x10("#r0") \n\t" /* f8/f10 -> bytes 0x10-0x1f */ \
+ "gssqc1 "#f18", "#f16", 0x20("#r0") \n\t" /* f16/f18 -> bytes 0x20-0x2f */ \
+ "gssqc1 "#f6", "#f4", 0x30("#r0") \n\t" /* f4/f6 -> bytes 0x30-0x3f */
+
+#define MMI_LoadDiff4P_SINGLE(f0, f2, r0, r1, f4) /* f0 = widen(r0[0..3]) - widen(r1[0..3]) as four halfwords; f4 must be zero, f2 is scratch */ \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" /* unaligned 8-byte loads; NOTE(review): only the low 4 bytes are used below, so 4 extra bytes are read from each source */ \
+ "gsldlc1 "#f2", 0x7("#r1") \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f2", 0x0("#r1") \n\t" \
+ "punpcklbh "#f0", "#f0", "#f4" \n\t" /* low 4 bytes interleaved with f4 -> halfwords */ \
+ "punpcklbh "#f2", "#f2", "#f4" \n\t" \
+ "psubh "#f0", "#f0", "#f2" \n\t" /* per-pixel difference */
+
+#define MMI_LoadDiff4x4P_SINGLE(f0, f2, f4, f6, r0, r1, r2, r3, f8, f10) /* four rows of 4-pixel differences r0[] - r2[] into f0, f2, f4, f6; r1/r3 are the row steps, f8 is scratch, f10 must be zero */ \
+ MMI_LoadDiff4P_SINGLE(f0, f8, r0, r2, f10) /* row 0 -> f0 */ \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" /* step both pointers to the next row (PTR_ADDU is defined outside this file) */ \
+ PTR_ADDU ""#r2", "#r2", "#r3" \n\t" \
+ MMI_LoadDiff4P_SINGLE(f2, f8, r0, r2, f10) /* row 1 -> f2 */ \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ PTR_ADDU ""#r2", "#r2", "#r3" \n\t" \
+ MMI_LoadDiff4P_SINGLE(f4, f8, r0, r2, f10) /* row 2 -> f4 */ \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ PTR_ADDU ""#r2", "#r2", "#r3" \n\t" \
+ MMI_LoadDiff4P_SINGLE(f6, f8, r0, r2, f10) /* row 3 -> f6; r0/r2 are left pointing at row 3 */
+
+#define MMI_DCT_SINGLE(f0, f2, f4, f6, f8, f10, f12) /* single-register counterpart of MMI_DCT (per the name); the *_SINGLE helpers are defined outside this file */ \
+ MMI_SumSub_SINGLE(f6, f0, f10) /* f10 is passed as the third (presumably scratch) register */ \
+ MMI_SumSub_SINGLE(f4, f2, f10) \
+ MMI_SumSub_SINGLE(f4, f6, f10) \
+ MMI_SumSubMul2_SINGLE(f0, f2, f8, f12) /* NOTE(review): f12 is supplied by callers as a register holding 1; confirm it is the shift count */
+
+void WelsIDctT4Rec_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred, /* 4x4 inverse transform + reconstruction (per the macro names used below): reads 0x20 bytes of pDct, processes four rows of pPred/pRec */
+ int32_t iPredStride, int16_t* pDct) { /* iStride / iPredStride are the per-row byte steps applied to pRec / pPred */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gsldlc1 $f0, 0x7(%[pDct]) \n\t" /* unaligned 8-byte loads: bytes 0x00-0x07 -> $f0 */
+ "gsldrc1 $f0, 0x0(%[pDct]) \n\t"
+ "gsldlc1 $f2, 0xF(%[pDct]) \n\t" /* bytes 0x08-0x0f -> $f2 */
+ "gsldrc1 $f2, 0x8(%[pDct]) \n\t"
+ "gsldlc1 $f4, 0x17(%[pDct]) \n\t" /* bytes 0x10-0x17 -> $f4 */
+ "gsldrc1 $f4, 0x10(%[pDct]) \n\t"
+ "gsldlc1 $f6, 0x1F(%[pDct]) \n\t" /* bytes 0x18-0x1f -> $f6 */
+ "gsldrc1 $f6, 0x18(%[pDct]) \n\t"
+
+ "dli $8, 0x1 \n\t" /* $f16 = 1, passed to MMI_IDCT_SINGLE */
+ "dmtc1 $8, $f16 \n\t"
+ "dli $8, 0x6 \n\t" /* $f18 = 6, passed to MMI_StoreDiff4P_SINGLE */
+ "dmtc1 $8, $f18 \n\t"
+
+ MMI_Trans4x4H_SINGLE($f0, $f2, $f4, $f6, $f8) /* the *_SINGLE macros are defined outside this file (presumably asmdefs_mmi.h) */
+ MMI_IDCT_SINGLE($f2, $f4, $f6, $f8, $f0, $f12, $f16) /* first pass */
+ MMI_Trans4x4H_SINGLE($f2, $f6, $f0, $f8, $f4)
+ MMI_IDCT_SINGLE($f6, $f0, $f8, $f4, $f2, $f12, $f16) /* second pass */
+
+ "xor $f14, $f14, $f14 \n\t" /* $f14 = 0 */
+ "dli $8, 0x0020 \n\t" /* build 0x0020 in all four halfword lanes of $f12 */
+ "dmtc1 $8, $f12 \n\t"
+ "punpcklhw $f12, $f12, $f12 \n\t"
+ "punpcklwd $f12, $f12, $f12 \n\t"
+
+ MMI_StoreDiff4P_SINGLE($f6, $f0, $f12, $f14, %[pRec], %[pPred], $f18) /* row 0; presumably clip(pred + ((x + 32) >> 6)) like MMI_StoreDiff8p_6 - confirm */
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t" /* step to the next output / prediction row */
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4P_SINGLE($f8, $f0, $f12, $f14, %[pRec], %[pPred], $f18) /* row 1 */
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4P_SINGLE($f2, $f0, $f12, $f14, %[pRec], %[pPred], $f18) /* row 2 */
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4P_SINGLE($f4, $f0, $f12, $f14, %[pRec], %[pPred], $f18) /* row 3 */
+ : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred) /* read-write: both pointers are advanced inside the asm */
+ : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride),
+ [pDct]"r"((short *)pDct)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", /* $8 is the integer scratch used by dli/dmtc1 */
+ "$f14", "$f16", "$f18"
+ );
+}
+
+void WelsIDctFourT4Rec_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred, /* inverse transform + reconstruction over 0x80 bytes of pDct: two passes, each storing four rows of 8 pixels */
+ int32_t iPredStride, int16_t* pDct) { /* iStride / iPredStride are the per-row byte steps applied to pRec / pPred */
+ BACKUP_REG; /* defined outside this file; presumably saves FP registers that RECOVER_REG restores - confirm */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ MMI_Load4x8p(%[pDct], $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f20, $f22) /* first 0x40 bytes of coefficients */
+
+ MMI_TransTwo4x4H($f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f12, $f14) /* defined outside this file; per the name, transposes two 4x4 halfword blocks */
+ "dli $8, 0x1 \n\t" /* $f30 = 1: shift count used inside MMI_IDCT */
+ "dmtc1 $8, $f30 \n\t"
+ MMI_IDCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,
+ $f0, $f2, $f30)
+ MMI_TransTwo4x4H($f4, $f6, $f16, $f18, $f0, $f2, $f8, $f10, $f12, $f14)
+ MMI_IDCT($f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22, $f24, $f26,
+ $f4, $f6, $f30)
+
+ "xor $f28, $f28, $f28 \n\t" /* $f28 = 0, used for byte-to-halfword widening; not touched again in this function */
+ "dli $8, 0x6 \n\t" /* $f26 = 6: final shift count */
+ "dmtc1 $8, $f26 \n\t"
+ "dli $8, 0x0020 \n\t" /* $f24 = 0x0020 in all four halfword lanes: rounding bias */
+ "dmtc1 $8, $f24 \n\t"
+ "punpcklhw $f24, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f24, $f24 \n\t"
+
+ MMI_StoreDiff8p_6($f16, $f18, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26) /* row 0: pRec = clip_u8(pPred + ((x + 32) >> 6)) */
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff8p_6($f0, $f2, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26) /* row 1 */
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff8p_6($f4, $f6, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26) /* row 2 */
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff8p_6($f8, $f10, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26) /* row 3 */
+
+ PTR_ADDIU "%[pDct], %[pDct], 0x40 \n\t" /* move on to the second 0x40 bytes of coefficients */
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_Load4x8p(%[pDct], $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f20, $f22)
+
+ MMI_TransTwo4x4H($f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f12, $f14)
+ MMI_IDCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, /* $f24/$f26 are handed to MMI_IDCT as working registers here */
+ $f0, $f2, $f30)
+ MMI_TransTwo4x4H($f4, $f6, $f16, $f18, $f0, $f2, $f8, $f10, $f12, $f14)
+ MMI_IDCT($f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22, $f24, $f26,
+ $f4, $f6, $f30)
+
+ "dli $8, 0x6 \n\t" /* rebuild the shift count and bias after MMI_IDCT used $f24/$f26 */
+ "dmtc1 $8, $f26 \n\t"
+ "dli $8, 0x0020 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+ "punpcklhw $f24, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f24, $f24 \n\t"
+
+ MMI_StoreDiff8p_6($f16, $f18, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26) /* row 4 */
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff8p_6($f0, $f2, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26) /* row 5 */
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff8p_6($f4, $f6, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26) /* row 6 */
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff8p_6($f8, $f10, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26) /* row 7 */
+ : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred), /* read-write: all three pointers are advanced inside the asm */
+ [pDct]"+&r"((short *)pDct)
+ : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG; /* counterpart of BACKUP_REG above */
+}
+
+void WelsIDctRecI16x16Dc_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred, /* DC-only reconstruction of 16 rows x 16 pixels: pRec = clip_u8(pPred + ((dc + 32) >> 6)), one DC value per 4x4 area */
+ int32_t iPredStride, int16_t* pDct) { /* pDct: 16 int16 DC values (0x20 bytes are read); strides are per-row byte steps */
+ BACKUP_REG; /* defined outside this file; presumably saves FP registers that RECOVER_REG restores - confirm */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f28, $f28, $f28 \n\t" /* $f28 = 0 for byte-to-halfword widening */
+ "dli $8, 0x0020 \n\t" /* $f24 = 0x0020 in all four halfword lanes: rounding bias */
+ "dmtc1 $8, $f24 \n\t"
+ "punpcklhw $f24, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f24, $f24 \n\t"
+ "dli $8, 0x6 \n\t" /* $f30 = 6: shift count */
+ "dmtc1 $8, $f30 \n\t"
+
+ MMI_Load8DC($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f24, /* DC values 0..7, rounded and replicated: $f0..$f6 = DC0..3, $f8..$f14 = DC4..7 */
+ %[pDct], 0x0, $f30)
+
+ MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec], /* rows 0-1 with DC0..3 ($f20/$f22 are scratch) */
+ %[pPred], %[iStride], %[iPredStride])
+
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t" /* the macro leaves the pointers on its second row; step to the next one */
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec], /* rows 2-3 with DC0..3 */
+ %[pPred], %[iStride], %[iPredStride])
+
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec], /* rows 4-5 with DC4..7 */
+ %[pPred], %[iStride], %[iPredStride])
+
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec], /* rows 6-7 with DC4..7 */
+ %[pPred], %[iStride], %[iPredStride])
+
+ MMI_Load8DC($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f24, %[pDct], 0x10, $f30) /* DC values 8..15 */
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec], /* rows 8-9 with DC8..11 */
+ %[pPred], %[iStride], %[iPredStride])
+
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec], /* rows 10-11 with DC8..11 */
+ %[pPred], %[iStride], %[iPredStride])
+
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec], /* rows 12-13 with DC12..15 */
+ %[pPred], %[iStride], %[iPredStride])
+
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec], /* rows 14-15 with DC12..15 */
+ %[pPred], %[iStride], %[iPredStride])
+ : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred), /* pRec/pPred are advanced inside the asm; pDct is only read */
+ [pDct]"+&r"((short *)pDct)
+ : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG; /* counterpart of BACKUP_REG above */
+}
+
+void WelsHadamardT4Dc_mmi( int16_t *luma_dc, int16_t *pDct) { /* gathers 16 int16 values spaced 0x20 bytes apart in pDct, runs add/sub butterflies with a rounded halving step, and writes 16 int16 results (0x20 bytes) to luma_dc */
+ BACKUP_REG; /* defined outside this file; presumably saves FP registers that RECOVER_REG restores - confirm */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ MMI_Load4Col($f4, $f6, $f20, $f24, $f0, %[pDct], 0x0) /* byte offsets 0x000, 0x020, 0x080, 0x0a0 -> $f4/$f6 as 32-bit lanes */
+ MMI_Load4Col($f8, $f10, $f20, $f24, $f0, %[pDct], 0x40) /* 0x040, 0x060, 0x0c0, 0x0e0 -> $f8/$f10 */
+ MMI_Load4Col($f12, $f14, $f20, $f24, $f0, %[pDct], 0x100) /* 0x100, 0x120, 0x180, 0x1a0 -> $f12/$f14 */
+ MMI_Load4Col($f16, $f18, $f20, $f24, $f0, %[pDct], 0x140) /* 0x140, 0x160, 0x1c0, 0x1e0 -> $f16/$f18 */
+
+ MMI_SumSubD($f4, $f6, $f8, $f10, $f28, $f30) /* first butterfly stage; $f28/$f30 are scratch */
+ MMI_SumSubD($f12, $f14, $f16, $f18, $f28, $f30)
+ MMI_SumSubD($f8, $f10, $f16, $f18, $f28, $f30) /* second butterfly stage */
+ MMI_SumSubD($f4, $f6, $f12, $f14, $f28, $f30)
+
+ MMI_Trans4x4W($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f20, $f22) /* presumably a 4x4 word transpose (per the macro name) */
+
+ MMI_SumSubD($f16, $f18, $f12, $f14, $f28, $f30) /* butterfly stage in the other direction */
+ MMI_SumSubD($f20, $f22, $f4, $f6, $f28, $f30)
+
+ "dli $8, 0x1F \n\t" /* shift count 31 for WELS_DD1 */
+ "dmtc1 $8, $f30 \n\t"
+
+ WELS_DD1($f24, $f26, $f30) /* $f24/$f26 = 1 in every 32-bit lane: rounding term */
+
+ "dli $8, 0x1 \n\t" /* $f30 = 1: shift count for the halving step */
+ "dmtc1 $8, $f30 \n\t"
+
+ MMI_SumSubDiv2D($f12, $f14, $f4, $f6, $f24, $f26, $f0, $f2, $f30) /* (a + b + 1) >> 1 and that value minus b */
+ MMI_SumSubDiv2D($f16, $f18, $f20, $f22, $f24, $f26, $f4, $f6, $f30)
+ MMI_Trans4x4W($f12, $f14, $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10)
+
+ "packsswh $f12, $f12, $f14 \n\t" /* narrow 32-bit lanes to int16 with signed saturation */
+ "packsswh $f14, $f16, $f18 \n\t"
+
+ "packsswh $f8, $f8, $f10 \n\t"
+ "packsswh $f10, $f4, $f6 \n\t"
+ "gssqc1 $f14, $f12, 0x0(%[luma_dc]) \n\t" /* results 0..7 */
+ "gssqc1 $f10, $f8, 0x10(%[luma_dc]) \n\t" /* results 8..15 */
+ :
+ : [luma_dc]"r"((short *)luma_dc), [pDct]"r"((short *)pDct) /* neither pointer is modified; output goes through memory */
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG; /* counterpart of BACKUP_REG above */
+}
+
+void WelsDctT4_mmi(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, /* forward transform (per the macro names) of the 4x4 difference pix1 - pix2; writes 0x20 bytes to pDct */
+ uint8_t *pix2, int32_t i_pix2 ) { /* i_pix1 / i_pix2 are the per-row byte steps of pix1 / pix2 */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f14, $f14, $f14 \n\t" /* $f14 = 0 for byte-to-halfword widening */
+ "dli $8, 0x1 \n\t" /* $f16 = 1, passed to MMI_DCT_SINGLE */
+ "dmtc1 $8, $f16 \n\t"
+
+ MMI_LoadDiff4x4P_SINGLE($f2, $f4, $f6, $f8, %[pix1], %[i_pix1], /* four rows of differences -> $f2, $f4, $f6, $f8 ($f0 scratch) */
+ %[pix2], %[i_pix2], $f0, $f14)
+
+ MMI_DCT_SINGLE($f2, $f4, $f6, $f8, $f10, $f12, $f16) /* first pass */
+ MMI_Trans4x4H_SINGLE($f6, $f2, $f8, $f10, $f4) /* defined outside this file; per the name, a 4x4 halfword transpose */
+
+ MMI_DCT_SINGLE($f6, $f10, $f4, $f8, $f2, $f12, $f16) /* second pass */
+ MMI_Trans4x4H_SINGLE($f4, $f6, $f8, $f2, $f10)
+
+ "gssdlc1 $f4, 0x7(%[pDct]) \n\t" /* unaligned 8-byte stores: $f4 -> bytes 0x00-0x07 */
+ "gssdlc1 $f2, 0xF(%[pDct]) \n\t" /* $f2 -> bytes 0x08-0x0f */
+ "gssdlc1 $f10, 0x17(%[pDct]) \n\t" /* $f10 -> bytes 0x10-0x17 */
+ "gssdlc1 $f8, 0x1F(%[pDct]) \n\t" /* $f8 -> bytes 0x18-0x1f */
+ "gssdrc1 $f4, 0x0(%[pDct]) \n\t"
+ "gssdrc1 $f2, 0x8(%[pDct]) \n\t"
+ "gssdrc1 $f10, 0x10(%[pDct]) \n\t"
+ "gssdrc1 $f8, 0x18(%[pDct]) \n\t"
+ : [pDct]"+&r"((short *)pDct), [pix1]"+&r"(pix1), [pix2]"+&r"(pix2) /* pix1/pix2 are advanced by the load macro; pDct itself is not changed */
+ : [i_pix1]"r"(i_pix1), [i_pix2]"r"(i_pix2)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16"
+ );
+}
+
+void WelsDctFourT4_mmi(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, /* forward transform (per the macro names) of eight rows of pix1 - pix2 in two passes of four rows; writes 0x80 bytes to pDct */
+ uint8_t *pix2, int32_t i_pix2 ) { /* i_pix1 / i_pix2 are the per-row byte steps of pix1 / pix2 */
+ BACKUP_REG; /* defined outside this file; presumably saves FP registers that RECOVER_REG restores - confirm */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f28, $f28, $f28 \n\t" /* $f28 = 0, passed to every MMI_LoadDiff8P */
+ MMI_LoadDiff8P($f0, $f2, $f24, $f26, $f28, %[pix1], %[pix2]) /* row 0; MMI_LoadDiff8P is defined outside this file (presumably an 8-pixel difference load) */
+ PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t" /* step both sources to the next row */
+ PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
+ MMI_LoadDiff8P($f4, $f6, $f24, $f26, $f28, %[pix1], %[pix2]) /* row 1 */
+ PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
+ PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
+ MMI_LoadDiff8P($f8, $f10, $f24, $f26, $f28, %[pix1], %[pix2]) /* row 2 */
+ PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
+ PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
+ MMI_LoadDiff8P($f12, $f14, $f24, $f26, $f28, %[pix1], %[pix2]) /* row 3 */
+
+ MMI_DCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f0, $f2) /* first pass */
+ MMI_TransTwo4x4H($f8, $f10, $f0, $f2, $f12, $f14, $f16, $f18, $f4, $f6) /* defined outside this file; per the name, transposes two 4x4 halfword blocks */
+ MMI_DCT($f0, $f2, $f16, $f18, $f4, $f6, $f12, $f14, $f20, $f22, $f8, $f10) /* second pass */
+ MMI_TransTwo4x4H($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f0, $f2)
+
+ MMI_Store4x8p(%[pDct], $f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22) /* first 0x40 bytes of output */
+ PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t" /* continue with rows 4..7 */
+ PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
+ MMI_LoadDiff8P($f0, $f2, $f24, $f26, $f28, %[pix1], %[pix2]) /* row 4 */
+ PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
+ PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
+ MMI_LoadDiff8P($f4, $f6, $f24, $f26, $f28, %[pix1], %[pix2]) /* row 5 */
+ PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
+ PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
+ MMI_LoadDiff8P($f8, $f10, $f24, $f26, $f28, %[pix1], %[pix2]) /* row 6 */
+ PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
+ PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
+ MMI_LoadDiff8P($f12, $f14, $f24, $f26, $f28, %[pix1], %[pix2]) /* row 7 */
+
+ MMI_DCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f0, $f2) /* same two-pass sequence as for rows 0..3 */
+ MMI_TransTwo4x4H($f8, $f10, $f0, $f2, $f12, $f14, $f16, $f18, $f4, $f6)
+ MMI_DCT($f0, $f2, $f16, $f18, $f4, $f6, $f12, $f14, $f20, $f22, $f8, $f10)
+ MMI_TransTwo4x4H($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f0, $f2)
+
+ PTR_ADDIU "%[pDct], %[pDct], 0x40 \n\t" /* second 0x40 bytes of output */
+ MMI_Store4x8p(%[pDct], $f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22)
+ : [pDct]"+&r"((short *)pDct), [pix1]"+&r"(pix1), [pix2]"+&r"(pix2) /* read-write: all three pointers are advanced inside the asm */
+ : [i_pix1]"r"(i_pix1), [i_pix2]"r"(i_pix2)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28"
+ );
+ RECOVER_REG; /* counterpart of BACKUP_REG above */
+}
--- /dev/null
+++ b/codec/encoder/core/mips/quant_mmi.c
@@ -1,0 +1,553 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file quant_mmi.c
+ *
+ * \brief Loongson optimization
+ *
+ * \date 20/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+void WelsQuant4x4_mmi(int16_t *pDct, const int16_t* ff, const int16_t *mf) { /* in-place quantisation of 16 coefficients: q = sign(x) * ((sat_u16(|x| + ff[i]) * mf[i]) >> 16), with the 8-entry ff/mf tables applied to each half */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f10, $f10, $f10 \n\t" /* NOTE(review): $f10 is written again by the gslqc1 on the next line, so this clear looks redundant */
+ "gslqc1 $f10, $f8, 0x0(%[ff]) \n\t" /* ff[0..7] -> $f8/$f10 (128-bit load; presumably needs 16-byte alignment - confirm) */
+ "gslqc1 $f14, $f12, 0x0(%[mf]) \n\t" /* mf[0..7] -> $f12/$f14 */
+
+ "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t" /* coefficients 0..7 */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t" /* sign mask: all-ones in lanes where 0 > x */
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t" /* |x| = (x ^ mask) - mask */
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t" /* + ff with unsigned saturation */
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t" /* high 16 bits of the unsigned product with mf */
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t" /* restore the sign: (q ^ mask) - mask */
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t" /* write coefficients 0..7 back */
+
+ "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t" /* coefficients 8..15: same sequence, same ff/mf registers */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
+ :
+ : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf) /* pointers are only read; the result goes through memory (hence the memory clobber) */
+ : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
+ );
+}
+
+void WelsQuant4x4Dc_mmi(int16_t *pDct, const int16_t ff, int16_t mf) { /* in-place quantisation of 16 coefficients with one scalar ff and mf: q = sign(x) * ((sat_u16(|x| + ff) * mf) >> 16) */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f10, $f10, $f10 \n\t" /* $f10 = 0: pshufh selector that picks halfword 0 for every lane */
+ "dmtc1 %[mf], $f12 \n\t"
+ "pshufh $f12, $f12, $f10 \n\t" /* replicate mf into all four halfword lanes of $f12 */
+
+ "dmtc1 %[ff], $f8 \n\t"
+ "pshufh $f8, $f8, $f10 \n\t" /* replicate ff into all four halfword lanes of $f8 */
+
+ "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t" /* coefficients 0..7 (128-bit load; presumably needs 16-byte alignment - confirm) */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t" /* sign mask: all-ones in lanes where 0 > x */
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t" /* |x| = (x ^ mask) - mask */
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t" /* + ff with unsigned saturation */
+ "paddush $f2, $f2, $f8 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t" /* high 16 bits of the unsigned product with mf */
+ "pmulhuh $f2, $f2, $f12 \n\t"
+ "xor $f0, $f0, $f4 \n\t" /* restore the sign: (q ^ mask) - mask */
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t" /* write coefficients 0..7 back */
+
+ "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t" /* coefficients 8..15: same sequence */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f8 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f12 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
+ :
+ : [pDct]"r"((short *)pDct), [ff]"r"((short)ff), [mf]"r"((short)mf) /* ff/mf arrive in integer registers and are moved to FP registers with dmtc1 */
+ : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12"
+ );
+}
+
+void WelsQuantFour4x4_mmi(int16_t *pDct, const int16_t* ff, const int16_t *mf) { /* in-place quantisation of 64 coefficients (0x80 bytes) in eight groups of 8: q = sign(x) * ((sat_u16(|x| + ff[i]) * mf[i]) >> 16), reusing the 8-entry ff/mf tables for every group */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f10, $f8, 0x0(%[ff]) \n\t" /* ff[0..7] -> $f8/$f10 (128-bit load; presumably needs 16-byte alignment - confirm) */
+ "gslqc1 $f14, $f12, 0x0(%[mf]) \n\t" /* mf[0..7] -> $f12/$f14 */
+
+ "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t" /* group 0: coefficients 0..7 */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t" /* sign mask: all-ones in lanes where 0 > x */
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t" /* |x| = (x ^ mask) - mask */
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t" /* + ff with unsigned saturation */
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t" /* high 16 bits of the unsigned product with mf */
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t" /* restore the sign: (q ^ mask) - mask */
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t" /* write group 0 back */
+
+ "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t" /* group 1: coefficients 8..15, same sequence */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x20(%[pDct]) \n\t" /* group 2: coefficients 16..23 */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x20(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x30(%[pDct]) \n\t" /* group 3: coefficients 24..31 */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x30(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x40(%[pDct]) \n\t" /* group 4: coefficients 32..39 */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x40(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x50(%[pDct]) \n\t" /* group 5: coefficients 40..47 */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x50(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x60(%[pDct]) \n\t" /* group 6: coefficients 48..55 */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x60(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x70(%[pDct]) \n\t" /* group 7: coefficients 56..63 */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x70(%[pDct]) \n\t"
+ :
+ : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf) /* pointers are only read; the result goes through memory (hence the memory clobber) */
+ : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
+ );
+}
+
+void WelsQuantFour4x4Max_mmi(int16_t *pDct, const int16_t*ff, /* Quantizes 64 int16 coefficients in place and writes four per-block maxima. */
+ const int16_t *mf, int16_t *max) { /* pDct: 128 bytes read and rewritten; ff, mf: 16 bytes read from each; max: 8 bytes written. */
+ BACKUP_REG; /* macro from asmdefs_mmi.h; presumably saves FP registers that the asm below overwrites - TODO confirm */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f10, $f8, 0x0(%[ff]) \n\t" /* $f8/$f10 = 8 halfwords of ff (rounding offsets), reused for every group */
+ "gslqc1 $f14, $f12, 0x0(%[mf]) \n\t" /* $f12/$f14 = 8 halfwords of mf (multipliers), reused for every group */
+
+ "xor $f16, $f16, $f16 \n\t" /* $f16..$f30: eight running-maximum accumulators, cleared to zero */
+ "xor $f18, $f18, $f18 \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ "xor $f22, $f22, $f22 \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+
+ "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t" /* group at byte offset 0x00: load 8 coefficients */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t" /* sign mask: all-ones in each halfword where 0 > coefficient */
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t" /* (x ^ s) - s gives the absolute value */
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t" /* add ff with unsigned saturation */
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t" /* keep the high 16 bits of the unsigned product with mf */
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f16, $f16, $f0 \n\t" /* track the maximum magnitude before the sign is restored (offsets 0x00-0x1f use $f16/$f18) */
+ "pmaxsh $f18, $f18, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t" /* (q ^ s) - s restores the original sign */
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t" /* write the quantized group back over the input */
+
+ "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t" /* group at byte offset 0x10: same sequence */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f16, $f16, $f0 \n\t"
+ "pmaxsh $f18, $f18, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x20(%[pDct]) \n\t" /* group at byte offset 0x20 */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f20, $f20, $f0 \n\t" /* offsets 0x20-0x3f accumulate into $f20/$f22 */
+ "pmaxsh $f22, $f22, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x20(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x30(%[pDct]) \n\t" /* group at byte offset 0x30 */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f20, $f20, $f0 \n\t"
+ "pmaxsh $f22, $f22, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x30(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x40(%[pDct]) \n\t" /* group at byte offset 0x40 */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f24, $f24, $f0 \n\t" /* offsets 0x40-0x5f accumulate into $f24/$f26 */
+ "pmaxsh $f26, $f26, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x40(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x50(%[pDct]) \n\t" /* group at byte offset 0x50 */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f24, $f24, $f0 \n\t"
+ "pmaxsh $f26, $f26, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x50(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x60(%[pDct]) \n\t" /* group at byte offset 0x60 */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f28, $f28, $f0 \n\t" /* offsets 0x60-0x7f accumulate into $f28/$f30 */
+ "pmaxsh $f30, $f30, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x60(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x70(%[pDct]) \n\t" /* group at byte offset 0x70 (last) */
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f28, $f28, $f0 \n\t"
+ "pmaxsh $f30, $f30, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x70(%[pDct]) \n\t"
+
+ "mov.d $f0, $f18 \n\t" /* reduction: interleave halfwords of the accumulators for offsets 0x00-0x1f and 0x20-0x3f */
+ "punpckhhw $f18, $f16, $f20 \n\t"
+ "punpcklhw $f16, $f16, $f20 \n\t"
+ "punpckhhw $f2, $f0, $f22 \n\t"
+ "punpcklhw $f0, $f0, $f22 \n\t"
+
+ "mov.d $f20, $f26 \n\t" /* same halfword interleave for the accumulators of offsets 0x40-0x5f and 0x60-0x7f */
+ "punpckhhw $f26, $f24, $f28 \n\t"
+ "punpcklhw $f24, $f24, $f28 \n\t"
+ "punpckhhw $f22, $f20, $f30 \n\t"
+ "punpcklhw $f20, $f20, $f30 \n\t"
+
+ "mov.d $f28, $f18 \n\t" /* word interleave: each register now holds one accumulator lane from all four 32-byte blocks */
+ "punpckhwd $f18, $f16, $f24 \n\t"
+ "punpcklwd $f16, $f16, $f24 \n\t"
+ "punpckhwd $f30, $f28, $f26 \n\t"
+ "punpcklwd $f28, $f28, $f26 \n\t"
+
+ "mov.d $f24, $f2 \n\t"
+ "punpckhwd $f2, $f0, $f20 \n\t"
+ "punpcklwd $f0, $f0, $f20 \n\t"
+ "punpckhwd $f26, $f24, $f22 \n\t"
+ "punpcklwd $f24, $f24, $f22 \n\t"
+
+ "mov.d $f20, $f18 \n\t" /* register shuffling so the eight transposed rows pair up for the pmaxsh chain below */
+ "mov.d $f18, $f0 \n\t"
+ "mov.d $f22, $f2 \n\t"
+
+ "mov.d $f0, $f30 \n\t"
+ "mov.d $f30, $f24 \n\t"
+ "mov.d $f2, $f26 \n\t"
+
+ "pmaxsh $f0, $f0, $f16 \n\t" /* fold the rows together with signed halfword max */
+ "pmaxsh $f2, $f2, $f18 \n\t"
+
+ "pmaxsh $f0, $f0, $f20 \n\t"
+ "pmaxsh $f2, $f2, $f22 \n\t"
+
+ "pmaxsh $f0, $f0, $f28 \n\t"
+ "pmaxsh $f2, $f2, $f30 \n\t"
+
+ "mov.d $f4, $f0 \n\t"
+ "mov.d $f6, $f2 \n\t"
+
+ "mov.d $f0, $f2 \n\t"
+ "mov.d $f2, $f6 \n\t"
+
+ "pmaxsh $f0, $f0, $f4 \n\t" /* $f0 = max of the two partial results; as traced through the interleaves, halfword k comes from the block at byte offset 0x20*k */
+ "pmaxsh $f2, $f2, $f6 \n\t"
+
+ "gssdlc1 $f0, 0x7(%[max]) \n\t" /* left/right store pair: writes the 8 bytes of $f0 to max without requiring 8-byte alignment */
+ "gssdrc1 $f0, 0x0(%[max]) \n\t"
+ :
+ : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf),
+ [max]"r"((short *)max)
+ : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",
+ "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG; /* macro from asmdefs_mmi.h; presumably the counterpart of BACKUP_REG - TODO confirm */
+}
--- /dev/null
+++ b/codec/encoder/core/mips/score_mmi.c
@@ -1,0 +1,324 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file score_mmi.c
+ *
+ * \brief Loongson optimization
+ *
+ * \date 21/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+unsigned char nozero_count_table[] __attribute__((aligned(16))) = { /* 256 entries: entry i is the number of set bits in the byte value i */
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
+
+int32_t WelsGetNoneZeroCount_mmi(int16_t *level) { /* Returns how many of the 16 int16 values at level (32 bytes read) are non-zero. */
+ int ret_val = 0;
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[level]) \n\t" /* first 8 halfwords */
+ "gslqc1 $f6, $f4, 0x10(%[level]) \n\t" /* last 8 halfwords */
+ "xor $f8, $f8, $f8 \n\t" /* $f8 = 0, the comparison operand */
+ "pcmpeqh $f0, $f0, $f8 \n\t" /* each halfword becomes all-ones where the value equals zero */
+ "pcmpeqh $f2, $f2, $f8 \n\t"
+ "pcmpeqh $f4, $f4, $f8 \n\t"
+ "pcmpeqh $f6, $f6, $f8 \n\t"
+ "packsshb $f4, $f4, $f6 \n\t" /* narrow the halfword flags to one byte per value */
+ "packsshb $f6, $f0, $f2 \n\t"
+ "pmovmskb $f0, $f4 \n\t" /* one mask bit per byte: set where the value was zero */
+ "pmovmskb $f2, $f6 \n\t"
+ "dmfc1 $8, $f0 \n\t" /* move both masks to general-purpose registers */
+ "dmfc1 $9, $f2 \n\t"
+ "xor $8, 0xFF \n\t" /* invert the 8 mask bits so set bits mark non-zero values */
+ "xor $9, 0xFF \n\t"
+ PTR_ADDU "$10, $8, %[nozero_count_table] \n\t" /* the table turns each 8-bit mask into its bit count */
+ "lbu $8, 0x0($10) \n\t"
+ PTR_ADDU "$10, $9, %[nozero_count_table] \n\t"
+ "lbu $9, 0x0($10) \n\t"
+ PTR_ADDU "%[ret_val], $8, $9 \n\t" /* result = sum of the two half counts */
+ : [ret_val] "=r"((int)ret_val)
+ : [level] "r"((unsigned char *)level),
+ [nozero_count_table] "r"((unsigned char *)nozero_count_table)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8"
+ );
+ return ret_val;
+}
+
+void WelsScan4x4DcAc_mmi(int16_t level[16], int16_t *pDct) { /* Writes the 16 halfwords of pDct to level in a permuted order (NOTE(review): presumably the 4x4 zig-zag scan - confirm against the C reference). */
+ BACKUP_REG; /* macro from asmdefs_mmi.h; presumably saves FP registers used below - TODO confirm */
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t" /* load all 32 input bytes into $f0,$f2,$f4,$f6 */
+ "gslqc1 $f6, $f4, 0x10(%[pDct]) \n\t"
+ "dli $8, 0x3 \n\t" /* $f22,$f24,$f26,$f28 = halfword indices 3,2,1,0 for pextrh */
+ "dmtc1 $8, $f22 \n\t"
+ "dli $8, 0x2 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+ "dli $8, 0x1 \n\t"
+ "dmtc1 $8, $f26 \n\t"
+ "dmtc1 $0, $f28 \n\t"
+ "pextrh $f18, $f2, $f22 \n\t" /* pull out the individual halfwords that change registers */
+ "pextrh $f20, $f4, $f24 \n\t"
+ "pextrh $f16, $f2, $f26 \n\t"
+ "pinsrh_2 $f4, $f4, $f18 \n\t" /* and drop them into their new halfword slots */
+ "pinsrh_3 $f2, $f2, $f16 \n\t"
+ "pextrh $f18, $f4, $f28 \n\t"
+ "pinsrh_1 $f2, $f2, $f18 \n\t"
+ "pinsrh_0 $f4, $f4, $f20 \n\t"
+ "dli $8, 0x93 \n\t" /* pshufh selectors: 0x93 and 0x39 are the two opposite one-position halfword rotations */
+ "dmtc1 $8, $f22 \n\t"
+ "dli $8, 0x39 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+ "punpckhwd $f10, $f0, $f2 \n\t" /* interleave 32-bit words of neighbouring registers */
+ "punpcklwd $f8, $f0, $f2 \n\t"
+ "punpckhwd $f14, $f4, $f6 \n\t"
+ "punpcklwd $f12, $f4, $f6 \n\t"
+ "mov.d $f0, $f8 \n\t"
+ "pshufh $f2, $f10, $f22 \n\t"
+ "pshufh $f4, $f12, $f24 \n\t"
+ "mov.d $f6, $f14 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[level]) \n\t" /* store all 32 output bytes */
+ "gssqc1 $f6, $f4, 0x10(%[level]) \n\t"
+ :
+ : [level] "r"((short *)level), [pDct] "r"((short *)pDct)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28"
+ );
+ RECOVER_REG; /* macro from asmdefs_mmi.h; presumably the counterpart of BACKUP_REG - TODO confirm */
+}
+
+void WelsScan4x4Ac_mmi(int16_t *zig_value, int16_t *pDct) { /* Reads 16 halfwords from pDct, permutes them, shifts the sequence by one halfword and stores 32 bytes to zig_value (NOTE(review): presumably the zig-zag scan without the leading DC term - confirm against the C reference). */
+ BACKUP_REG; /* macro from asmdefs_mmi.h; presumably saves FP registers used below - TODO confirm */
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t" /* load all 32 input bytes */
+ "gslqc1 $f6, $f4, 0x10(%[pDct]) \n\t"
+ "mov.d $f8, $f2 \n\t" /* regroup the four 64-bit quarters before interleaving */
+ "mov.d $f2, $f4 \n\t"
+ "mov.d $f10, $f6 \n\t"
+
+ "mov.d $f12, $f2 \n\t"
+ "punpckhwd $f2, $f0, $f8 \n\t" /* interleave 32-bit words */
+ "punpcklwd $f0, $f0, $f8 \n\t"
+ "punpckhwd $f14, $f12, $f10 \n\t"
+ "punpcklwd $f12, $f12, $f10 \n\t"
+
+ "dmtc1 $0, $f20 \n\t" /* $f20 = halfword index 0 for pextrh */
+ "dli $8, 0x10 \n\t" /* $f22 = 16: shift count of one halfword */
+ "dmtc1 $8, $f22 \n\t"
+ "dli $8, 0x30 \n\t" /* $f24 = 48: shift count of three halfwords */
+ "dmtc1 $8, $f24 \n\t"
+ "dli $8, 0x3 \n\t" /* $f26 = halfword index 3 for pextrh */
+ "dmtc1 $8, $f26 \n\t"
+ "dli $8, 0x93 \n\t" /* $f28, $f30 = pshufh selectors (opposite one-position rotations) */
+ "dmtc1 $8, $f28 \n\t"
+ "dli $8, 0x39 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ "pextrh $f16, $f0, $f26 \n\t" /* swap individual halfwords between registers */
+ "pextrh $f18, $f2, $f26 \n\t"
+ "pinsrh_3 $f2, $f2, $f16 \n\t"
+ "pextrh $f16, $f14, $f20 \n\t"
+ "pinsrh_0 $f14, $f14, $f18 \n\t"
+ "pextrh $f18, $f12, $f20 \n\t"
+ "pinsrh_0 $f12, $f12, $f16 \n\t"
+ "pinsrh_3 $f0, $f0, $f18 \n\t"
+
+ "mov.d $f4, $f0 \n\t" /* $f4,$f6,$f8,$f10 = the four permuted quarters */
+ "pshufh $f6, $f2, $f28 \n\t"
+ "pshufh $f8, $f12, $f30 \n\t"
+ "mov.d $f10, $f14 \n\t"
+
+ "mov.d $f12, $f8 \n\t" /* from here: shift the whole sequence down by one halfword across register boundaries */
+ "mov.d $f14, $f10 \n\t"
+ "dsrl $f4, $f4, $f22 \n\t" /* drop the lowest halfword of the first quarter */
+ "pinsrh_3 $f4, $f4, $f6 \n\t" /* refill its top slot with halfword 0 of the next quarter */
+ "dsrl $f6, $f6, $f22 \n\t"
+ "dsll $f14, $f12, $f24 \n\t" /* move halfword 0 of the third quarter into the top slot */
+ "xor $f12, $f12, $f12 \n\t"
+ "or $f4, $f4, $f12 \n\t" /* $f12 was just zeroed, so this OR leaves $f4 unchanged */
+ "or $f6, $f6, $f14 \n\t"
+ "dsrl $f8, $f8, $f22 \n\t"
+ "pinsrh_3 $f8, $f8, $f10 \n\t"
+ "dsrl $f10, $f10, $f22 \n\t" /* logical shift: the top halfword of the last quarter becomes zero */
+ "gssqc1 $f6, $f4, 0x0(%[zig_value]) \n\t" /* store all 32 output bytes */
+ "gssqc1 $f10, $f8, 0x10(%[zig_value]) \n\t"
+ :
+ : [zig_value] "r"((short *)zig_value), [pDct] "r"((short *)pDct)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG; /* macro from asmdefs_mmi.h; presumably the counterpart of BACKUP_REG - TODO confirm */
+}
+
+unsigned char i_ds_table[]__attribute__((aligned(16))) = { /* 16 entries; WelsCalculateSingleCtr4x4_mmi indexes it with the distance it computes between two mask bit positions (0..15) */
+ 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned char high_mask_table[]__attribute__((aligned(16))) = { /* 256 entries; WelsCalculateSingleCtr4x4_mmi indexes it with the upper byte of its 16-bit non-zero mask */
+ 0, 0, 0, 3, 0, 2, 3, 6, 0, 2,
+ 2, 5, 3, 5, 6, 9, 0, 1, 2, 5,
+ 2, 4, 5, 8, 3, 5, 5, 8, 6, 8,
+ 9,12, 0, 1, 1, 4, 2, 4, 5, 8,
+ 2, 4, 4, 7, 5, 7, 8,11, 3, 4,
+ 5, 8, 5, 7, 8,11, 6, 8, 8,11,
+ 9,11,12,15, 0, 1, 1, 4, 1, 3,
+ 4, 7, 2, 4, 4, 7, 5, 7, 8,11,
+ 2, 3, 4, 7, 4, 6, 7,10, 5, 7,
+ 7,10, 8,10,11,14, 3, 4, 4, 7,
+ 5, 7, 8,11, 5, 7, 7,10, 8,10,
+ 11,14, 6, 7, 8,11, 8,10,11,14,
+ 9,11,11,14,12,14,15,18, 0, 0,
+ 1, 4, 1, 3, 4, 7, 1, 3, 3, 6,
+ 4, 6, 7,10, 2, 3, 4, 7, 4, 6,
+ 7,10, 5, 7, 7,10, 8,10,11,14,
+ 2, 3, 3, 6, 4, 6, 7,10, 4, 6,
+ 6, 9, 7, 9,10,13, 5, 6, 7,10,
+ 7, 9,10,13, 8,10,10,13,11,13,
+ 14,17, 3, 4, 4, 7, 4, 6, 7,10,
+ 5, 7, 7,10, 8,10,11,14, 5, 6,
+ 7,10, 7, 9,10,13, 8,10,10,13,
+ 11,13,14,17, 6, 7, 7,10, 8,10,
+ 11,14, 8,10,10,13,11,13,14,17,
+ 9,10,11,14,11,13,14,17,12,14,
+ 14,17,15,17,18,21};
+
+unsigned char low_mask_table[]__attribute__((aligned(16))) = { /* 256 entries; WelsCalculateSingleCtr4x4_mmi indexes it with the lower byte of its 16-bit non-zero mask */
+ 0, 3, 2, 6, 2, 5, 5, 9, 1, 5,
+ 4, 8, 5, 8, 8,12, 1, 4, 4, 8,
+ 4, 7, 7,11, 4, 8, 7,11, 8,11,
+ 11,15, 1, 4, 3, 7, 4, 7, 7,11,
+ 3, 7, 6,10, 7,10,10,14, 4, 7,
+ 7,11, 7,10,10,14, 7,11,10,14,
+ 11,14,14,18, 0, 4, 3, 7, 3, 6,
+ 6,10, 3, 7, 6,10, 7,10,10,14,
+ 3, 6, 6,10, 6, 9, 9,13, 6,10,
+ 9,13,10,13,13,17, 4, 7, 6,10,
+ 7,10,10,14, 6,10, 9,13,10,13,
+ 13,17, 7,10,10,14,10,13,13,17,
+ 10,14,13,17,14,17,17,21, 0, 3,
+ 3, 7, 3, 6, 6,10, 2, 6, 5, 9,
+ 6, 9, 9,13, 3, 6, 6,10, 6, 9,
+ 9,13, 6,10, 9,13,10,13,13,17,
+ 3, 6, 5, 9, 6, 9, 9,13, 5, 9,
+ 8,12, 9,12,12,16, 6, 9, 9,13,
+ 9,12,12,16, 9,13,12,16,13,16,
+ 16,20, 3, 7, 6,10, 6, 9, 9,13,
+ 6,10, 9,13,10,13,13,17, 6, 9,
+ 9,13, 9,12,12,16, 9,13,12,16,
+ 13,16,16,20, 7,10, 9,13,10,13,
+ 13,17, 9,13,12,16,13,16,16,20,
+ 10,13,13,17,13,16,16,20,13,17,
+ 16,20,17,20,20,24};
+
+int32_t WelsCalculateSingleCtr4x4_mmi(int16_t *pDct) { /* Returns i_ds_table[gap] + low_mask_table[mask low byte] + high_mask_table[mask high byte], where mask flags the non-zero values among the 16 int16 at pDct. */
+ int32_t iSingleCtr = 0;
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t" /* load the 16 coefficients (32 bytes) */
+ "gslqc1 $f6, $f4, 0x10(%[pDct]) \n\t"
+ "packsshb $f0, $f0, $f2 \n\t" /* saturate each halfword to a signed byte: zero stays zero, non-zero stays non-zero */
+ "packsshb $f2, $f4, $f6 \n\t"
+
+ "xor $f10, $f10, $f10 \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+
+ "pcmpeqb $f0, $f0, $f8 \n\t" /* bytes become 0xff where the coefficient is zero */
+ "pcmpeqb $f2, $f2, $f8 \n\t"
+
+ "pmovmskb $f10, $f0 \n\t" /* 8-bit zero mask for the first 8 coefficients */
+ "pmovmskb $f12, $f2 \n\t" /* 8-bit zero mask for the last 8 coefficients */
+ "punpcklbh $f10, $f10, $f12 \n\t" /* combine into one 16-bit mask: first mask in the low byte, second in the high byte */
+
+ "dmfc1 $12, $f10 \n\t"
+ "dli $8, 0xffff \n\t"
+ "xor $12, $12, $8 \n\t" /* invert: $12 bits are now set where a coefficient is non-zero */
+
+ "xor %[pDct], %[pDct], %[pDct] \n\t" /* the pointer is no longer needed; its register is reused as the running sum (hence the read-write constraint below) */
+ "dli $8, 0x80 \n\t" /* $8 = probe bit 7, $9 = its index */
+ "dli $9, 0x7 \n\t"
+ "dli $10, 0x100 \n\t" /* $10 = probe bit 8, $11 = its index */
+ "dli $11, 0x8 \n\t"
+
+ "1: \n\t" /* scan downwards from bit 7 for a set bit; $9 ends at its index, or 0 if bits 7..1 are clear */
+ "and $13, $12, $8 \n\t"
+ "bnez $13, 2f \n\t"
+ "nop \n\t"
+ "daddiu $9, -0x1 \n\t"
+ "dsrl $8, 1 \n\t"
+ "bnez $9, 1b \n\t"
+ "nop \n\t"
+ "2: \n\t" /* scan upwards from bit 8 for a set bit; $11 ends at its index, or 16 if bits 8..15 are clear */
+ "and $13, $12, $10 \n\t"
+ "bnez $13, 3f \n\t"
+ "nop \n\t"
+ "daddiu $11, 0x1 \n\t"
+ "dsll $10, 1 \n\t"
+ "daddiu $13, $11, -0x10 \n\t"
+ "bltz $13, 2b \n\t"
+ "nop \n\t"
+ "3: \n\t"
+ "dsubu $11, $11, $9 \n\t" /* gap = upper index - lower index - 1, in the range 0..15 */
+ "daddiu $11, -0x1 \n\t"
+ PTR_ADDU "$8, %[i_ds_table], $11 \n\t"
+ "lb $10, 0x0($8) \n\t" /* table values are small and non-negative, so the signed load is harmless */
+ PTR_ADDU "%[pDct], %[pDct], $10 \n\t" /* sum = i_ds_table[gap] */
+ "move $11, $12 \n\t"
+ "dli $10, 0xff \n\t"
+ "and $12, $10 \n\t" /* $12 = low byte of the mask */
+ "dsrl $11, 0x8 \n\t"
+ "and $11, $10 \n\t" /* $11 = high byte of the mask */
+ PTR_ADDU "$8, %[low_mask_table], $12 \n\t"
+ "lb $10, 0x0($8) \n\t"
+ PTR_ADDU "%[pDct], %[pDct], $10 \n\t" /* sum += low_mask_table[low byte] */
+ PTR_ADDU "$8, %[high_mask_table], $11 \n\t"
+ "lb $10, 0x0($8) \n\t"
+ PTR_ADDU "%[iSingleCtr], %[pDct], $10 \n\t" /* result = sum + high_mask_table[high byte] */
+ : [iSingleCtr] "=r"(iSingleCtr),
+ [pDct] "+r"(pDct) /* read-write: the asm overwrites this register, which an input-only operand must never do */
+ : [i_ds_table] "r"((unsigned char *)i_ds_table),
+ [high_mask_table] "r"((unsigned char *)high_mask_table),
+ [low_mask_table] "r"((unsigned char *)low_mask_table)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
+ "$f6", "$f8", "$f10", "$f12"
+ );
+ return iSingleCtr;
+}
--- a/codec/encoder/core/src/decode_mb_aux.cpp
+++ b/codec/encoder/core/src/decode_mb_aux.cpp
@@ -302,5 +302,13 @@
pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_AArch64_neon;
}
#endif
+
+#if defined(HAVE_MMI)
+ if (uiCpuFlag & WELS_CPU_MMI) {
+ pFuncList->pfIDctT4 = WelsIDctT4Rec_mmi;
+ pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_mmi;
+ pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_mmi;
+ }
+#endif//HAVE_MMI
}
}
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -592,9 +592,24 @@
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_mmi;
pFuncList->pfCopy8x16Aligned = WelsCopy8x16_mmi;
+ pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_mmi;
+ pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_mmi;
+
+ pFuncList->pfQuantization4x4 = WelsQuant4x4_mmi;
+ pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_mmi;
+ pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_mmi;
+ pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_mmi;
+
pFuncList->pfCopy16x16Aligned = WelsCopy16x16_mmi;
pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_mmi;
pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_mmi;
+
+ pFuncList->pfScan4x4 = WelsScan4x4DcAc_mmi;
+ pFuncList->pfScan4x4Ac = WelsScan4x4Ac_mmi;
+ pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_mmi;
+
+ pFuncList->pfDctT4 = WelsDctT4_mmi;
+ pFuncList->pfDctFourT4 = WelsDctFourT4_mmi;
}
#endif//HAVE_MMI
}
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -79,10 +79,24 @@
endif
OBJS += $(ENCODER_OBJSARM64)
+ENCODER_ASM_MIPS_SRCS=\
+ $(ENCODER_SRCDIR)/core/mips/dct_mmi.c\
+ $(ENCODER_SRCDIR)/core/mips/quant_mmi.c\
+ $(ENCODER_SRCDIR)/core/mips/score_mmi.c\
+
+ENCODER_OBJSMIPS += $(ENCODER_ASM_MIPS_SRCS:.c=.$(OBJ))
+ifeq ($(ASM_ARCH), mips)
+ENCODER_OBJS += $(ENCODER_OBJSMIPS)
+endif
+OBJS += $(ENCODER_OBJSMIPS)
+
OBJS += $(ENCODER_OBJS)
$(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.cpp
$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c $(CXX_O) $<
+
+$(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.c
+ $(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c $(CXX_O) $<
$(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.asm
$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $@ $<
--- a/test/common/targets.mk
+++ b/test/common/targets.mk
@@ -2,8 +2,8 @@
COMMON_UNITTEST_CPP_SRCS=\
$(COMMON_UNITTEST_SRCDIR)/CWelsListTest.cpp\
$(COMMON_UNITTEST_SRCDIR)/ExpandPicture.cpp\
- $(COMMON_UNITTEST_SRCDIR)/WelsThreadPoolTest.cpp\
$(COMMON_UNITTEST_SRCDIR)/WelsTaskListTest.cpp\
+ $(COMMON_UNITTEST_SRCDIR)/WelsThreadPoolTest.cpp\
COMMON_UNITTEST_OBJS += $(COMMON_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))
--- a/test/encoder/EncUT_DecodeMbAux.cpp
+++ b/test/encoder/EncUT_DecodeMbAux.cpp
@@ -246,6 +246,11 @@
}
#endif
#endif
+#if defined(HAVE_MMI)
+TEST (DecodeMbAuxTest, WelsIDctT4Rec_mmi) { // Runs the shared TestIDctT4Rec helper against the MMI implementation.
+ TestIDctT4Rec<int16_t> (WelsIDctT4Rec_mmi); // NOTE(review): no WELS_CPU_MMI check here, unlike the quant tests - confirm whether the helper checks CPU support itself
+}
+#endif
template<typename clip_t>
void WelsIDctT8Anchor (uint8_t* p_dst, int16_t dct[4][16]) {
WelsIDctT4Anchor<clip_t> (&p_dst[0], dct[0]);
@@ -367,6 +372,42 @@
14); //2^14 limit, (2^15+32) will cause overflow for SSE2.
WelsIDctRecI16x16DcAnchor (iRefDst, iRefDct);
WelsIDctRecI16x16Dc_sse2 (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
+ int ok = -1;
+ for (int i = 0; i < 16; i++) {
+ for (int j = 0; j < 16; j++) {
+ if (iRec[i * FDEC_STRIDE + j] != iRefDst[i * FDEC_STRIDE + j]) {
+ ok = i * 16 + j;
+ break;
+ }
+ }
+ }
+ EXPECT_EQ (ok, -1);
+ }
+}
+#endif
+#if defined(HAVE_MMI)
+TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_mmi) { // Runs the shared TestIDctFourT4Rec helper against the MMI implementation.
+ TestIDctFourT4Rec<int16_t> (WelsIDctFourT4Rec_mmi); // NOTE(review): no WELS_CPU_MMI check at this call site - confirm the helper handles unsupported CPUs
+}
+TEST (DecodeMbAuxTest, WelsIDctRecI16x16Dc_mmi) {
+ int32_t iCpuCores = 0;
+ uint32_t uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
+
+ if (uiCpuFeatureFlag & WELS_CPU_MMI) {
+ uint8_t iRefDst[16 * FDEC_STRIDE];
+ int16_t iRefDct[4][4];
+ ENFORCE_STACK_ALIGN_1D (int16_t, iDct, 16, 16);
+ ENFORCE_STACK_ALIGN_1D (uint8_t, iPred, 16 * FDEC_STRIDE, 16);
+ ENFORCE_STACK_ALIGN_1D (uint8_t, iRec, 16 * FDEC_STRIDE, 16);
+ for (int i = 0; i < 16; i++)
+ for (int j = 0; j < 16; j++)
+ iRefDst[i * FDEC_STRIDE + j] = iPred[i * FDEC_STRIDE + j] = rand() & 255;
+ for (int i = 0; i < 4; i++)
+ for (int j = 0; j < 4; j++)
+ iRefDct[i][j] = iDct[i * 4 + j] = (rand() & ((1 << 15) - 1)) - (1 <<
+ 14); //2^14 limit, (2^15+32) will cause overflow for SSE2.
+ WelsIDctRecI16x16DcAnchor (iRefDst, iRefDct);
+ WelsIDctRecI16x16Dc_mmi (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
int ok = -1;
for (int i = 0; i < 16; i++) {
for (int j = 0; j < 16; j++) {
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -315,6 +315,11 @@
TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);
}
#endif
+#ifdef HAVE_MMI
+TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_mmi) { // Runs the shared TestGetNoneZeroCount helper against the MMI implementation.
+ TestGetNoneZeroCount (WelsGetNoneZeroCount_mmi); // NOTE(review): called without a WELS_CPU_MMI check - confirm this is intended
+}
+#endif
#define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign)
#define NEW_QUANT(pDct, ff, mf) (((ff)+ WELS_ABS_LC(pDct))*(mf)) >>16
#define WELS_NEW_QUANT(pDct,ff,mf) WELS_ABS_LC(NEW_QUANT(pDct, ff, mf))
@@ -478,6 +483,24 @@
}
#endif //HAVE_AVX2
#endif
+#ifdef HAVE_MMI
+TEST (EncodeMbAuxTest, WelsQuant4x4_mmi) { // Checks WelsQuant4x4_mmi through the shared TestWelsQuant4x4 helper.
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_MMI) // when the MMI flag is not reported the body is skipped and the test passes without checking anything
+ TestWelsQuant4x4 (WelsQuant4x4_mmi);
+}
+TEST (EncodeMbAuxTest, WelsQuant4x4Dc_mmi) { // Checks WelsQuant4x4Dc_mmi through the shared TestWelsQuant4x4Dc helper.
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_MMI) // when the MMI flag is not reported the body is skipped and the test passes without checking anything
+ TestWelsQuant4x4Dc (WelsQuant4x4Dc_mmi);
+}
+TEST (EncodeMbAuxTest, WelsQuantFour4x4_mmi) { // Checks WelsQuantFour4x4_mmi through the shared TestWelsQuantFour4x4 helper.
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_MMI) // when the MMI flag is not reported the body is skipped and the test passes without checking anything
+ TestWelsQuantFour4x4 (WelsQuantFour4x4_mmi);
+}
+TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_mmi) { // Checks WelsQuantFour4x4Max_mmi through the shared TestWelsQuantFour4x4Max helper.
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_MMI) // when the MMI flag is not reported the body is skipped and the test passes without checking anything
+ TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_mmi);
+}
+#endif //HAVE_MMI
int32_t WelsHadamardQuant2x2SkipAnchor (int16_t* rs, int16_t ff, int16_t mf) {
int16_t pDct[4], s[4];
int16_t threshold = ((1 << 16) - 1) / mf - ff;
@@ -604,6 +627,23 @@
iDct[i] = (rand() & 32767) - 16384;
WelsHadamardT4Dc_c (iLumaDcC, iDct);
WelsHadamardT4Dc_sse2 (iLumaDcS, iDct);
+ for (int i = 0; i < 16; i++)
+ EXPECT_EQ (iLumaDcC[i], iLumaDcS[i]);
+ FREE_MEMORY (iDct);
+ FREE_MEMORY (iLumaDcC);
+ FREE_MEMORY (iLumaDcS);
+}
+#endif
+#ifdef HAVE_MMI
+TEST (EncodeMbAuxTest, WelsHadamardT4Dc_mmi) {
+ CMemoryAlign cMemoryAlign (0);
+ ALLOC_MEMORY (int16_t, iDct, 128 * 16);
+ ALLOC_MEMORY (int16_t, iLumaDcC, 16);
+ ALLOC_MEMORY (int16_t, iLumaDcS, 16);
+ for (int i = 0; i < 128 * 16; i++)
+ iDct[i] = (rand() & 32767) - 16384;
+ WelsHadamardT4Dc_c (iLumaDcC, iDct);
+ WelsHadamardT4Dc_mmi (iLumaDcS, iDct);
for (int i = 0; i < 16; i++)
EXPECT_EQ (iLumaDcC[i], iLumaDcS[i]);
FREE_MEMORY (iDct);
--- a/test/encoder/targets.mk
+++ b/test/encoder/targets.mk
@@ -17,8 +17,8 @@
$(ENCODER_UNITTEST_SRCDIR)/EncUT_ParameterSetStrategy.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_Reconstruct.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_Sample.cpp\
- $(ENCODER_UNITTEST_SRCDIR)/EncUT_SVC_me.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_SliceBufferReallocate.cpp\
+ $(ENCODER_UNITTEST_SRCDIR)/EncUT_SVC_me.cpp\
ENCODER_UNITTEST_OBJS += $(ENCODER_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))