ref: 59e065b6edfa4f62edf23b7c0365b99f5ef86b5e
parent: bff5aa982708c5d471671c4c8edbfdb607018960
author: Shiyou Yin <yinshiyou-hf@loongson.cn>
date: Tue Aug 22 04:44:36 EDT 2017
vpx_dsp: loongson: optimize vpx_mseWxH_c (16x16, 16x8, 8x16, 8x8) with MMI.

Change-Id: I2c782d18d9004414ba61b77238e0caf3e022d8f2
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1540,4 +1540,12 @@
::testing::Values(SseParams(2, 2,
&vpx_get4x4sse_cs_vsx)));
#endif // HAVE_VSX
+
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(MMI, VpxMseTest,
+ ::testing::Values(MseParams(4, 4, &vpx_mse16x16_mmi),
+ MseParams(4, 3, &vpx_mse16x8_mmi),
+ MseParams(3, 4, &vpx_mse8x16_mmi),
+ MseParams(3, 3, &vpx_mse8x8_mmi)));
+#endif // HAVE_MMI
} // namespace
--- /dev/null
+++ b/vpx_dsp/mips/variance_mmi.c
@@ -1,0 +1,146 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
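+/* Accumulate the sum of squared differences for one 8-pixel row:
+ * load 8 bytes each from a and b, take per-byte absolute differences
+ * (pasubub), widen them to 16 bits against the zero register ftmp0,
+ * then square and pairwise-add with pmaddhw, accumulating two 32-bit
+ * partial sums in ftmp8. */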
+#define VARIANCE_SSE_8 \
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" \
+ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
+ "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
+ "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+
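+/* Same as VARIANCE_SSE_8, extended to a 16-pixel row by also
+ * processing bytes 8..15 of a and b. */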
+#define VARIANCE_SSE_16 \
+ VARIANCE_SSE_8 \
+ "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" \
+ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
+ "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
+ "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+
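+/* Sum of squared errors over `high` rows of 16 pixels. As with the C
+ * version, the vpx_mse* functions return the raw SSE (no mean is
+ * subtracted); the result is stored in *sse and also returned. */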
+static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, uint32_t *sse,
+ uint64_t high) {
+ double ftmp[12];
+ uint32_t tmp[1];
+
+ *sse = 0;
+
+ __asm__ volatile (
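+ /* tmp0 = 32, moved into ftmp11 as the lane-fold shift amount; tmp0 is
+    then reloaded with the row count from *high. ftmp0 is the zero
+    register, ftmp8 the SSE accumulator. */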
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+
+ "1: \n\t"
+ VARIANCE_SSE_16
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+
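+ /* Fold the two 32-bit lanes of ftmp8 (logical shift right by 32, then
+    add) and store the low word to *sse. */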
+ "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [a]"+&r"(a), [b]"+&r"(b)
+ : [a_stride]"r"((mips_reg)a_stride), [b_stride]"r"((mips_reg)b_stride),
+ [high]"r"(&high), [sse]"r"(sse)
+ : "memory"
+ );
+
+ return *sse;
+}
+
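+/* Instantiate vpx_mse16x16_mmi and vpx_mse16x8_mmi from the shared
+ * helper; n is the number of rows. */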
+#define vpx_mse16xN(n) \
+ uint32_t vpx_mse16x##n##_mmi(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ return vpx_mse16x(a, a_stride, b, b_stride, sse, n); \
+ }
+
+vpx_mse16xN(16);
+vpx_mse16xN(8);
+
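+/* 8-pixel-wide variant of vpx_mse16x: the same flow, but each loop
+ * iteration consumes one 8-pixel row via VARIANCE_SSE_8. */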
+static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, uint32_t *sse,
+ uint64_t high) {
+ double ftmp[12];
+ uint32_t tmp[1];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+
+ "1: \n\t"
+ VARIANCE_SSE_8
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+
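+ /* Fold the two 32-bit partial sums and store the low word, as in
+    vpx_mse16x above. */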
+ "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [a]"+&r"(a), [b]"+&r"(b)
+ : [a_stride]"r"((mips_reg)a_stride), [b_stride]"r"((mips_reg)b_stride),
+ [high]"r"(&high), [sse]"r"(sse)
+ : "memory"
+ );
+
+ return *sse;
+}
+
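+/* Instantiate vpx_mse8x16_mmi and vpx_mse8x8_mmi. */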
+#define vpx_mse8xN(n) \
+ uint32_t vpx_mse8x##n##_mmi(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ return vpx_mse8x(a, a_stride, b, b_stride, sse, n); \
+ }
+
+vpx_mse8xN(16);
+vpx_mse8xN(8);
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -351,6 +351,8 @@
DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
+DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c
+
DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1101,16 +1101,16 @@
specialize qw/vpx_get8x8var sse2 neon msa/;
add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse16x16 sse2 avx2 neon msa/;
+ specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi/;
add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse16x8 sse2 msa/;
+ specialize qw/vpx_mse16x8 sse2 msa mmi/;
add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse8x16 sse2 msa/;
+ specialize qw/vpx_mse8x16 sse2 msa mmi/;
add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse8x8 sse2 msa/;
+ specialize qw/vpx_mse8x8 sse2 msa mmi/;
add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
specialize qw/vpx_get_mb_ss sse2 msa vsx/;
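
For reference (not part of this change), the scalar result the MMI paths
must reproduce is equivalent to the following sketch of the generic
vpx_mseWxH_c pattern; the helper name mse_ref is hypothetical:

#include <stdint.h>

/* Sum of squared errors over a w x h block; vpx_mse* both stores the
 * SSE in *sse and returns it (the mean is not subtracted). */
static uint32_t mse_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                        int b_stride, int w, int h, uint32_t *sse) {
  uint32_t sum = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      sum += (uint32_t)(diff * diff);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = sum;
  return sum;
}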