ref: 2c7b7424c5c3043b363ac23498c4b5dc3505f990
parent: 3ec20445b28ceccb0a32727f81ef2659596aaf33
parent: f4150163a24008a12d4e1ceb9b5f7ee3a8f80360
author: Shiyou Yin <yinshiyou-hf@loongson.cn>
date: Thu Sep 7 20:55:14 EDT 2017
Merge "vpxdsp: [loongson] optimize sad functions with mmi"
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -990,4 +990,59 @@
};
INSTANTIATE_TEST_CASE_P(VSX, SADx4Test, ::testing::ValuesIn(x4d_vsx_tests));
#endif // HAVE_VSX
+
+//------------------------------------------------------------------------------
+// Loongson functions
+#if HAVE_MMI
+const SadMxNParam mmi_tests[] = {  // Loongson MMI single-reference SAD kernels; args are presumably (width, height, function), matching the function names
+ SadMxNParam(64, 64, &vpx_sad64x64_mmi),
+ SadMxNParam(64, 32, &vpx_sad64x32_mmi),
+ SadMxNParam(32, 64, &vpx_sad32x64_mmi),
+ SadMxNParam(32, 32, &vpx_sad32x32_mmi),
+ SadMxNParam(32, 16, &vpx_sad32x16_mmi),
+ SadMxNParam(16, 32, &vpx_sad16x32_mmi),
+ SadMxNParam(16, 16, &vpx_sad16x16_mmi),
+ SadMxNParam(16, 8, &vpx_sad16x8_mmi),
+ SadMxNParam(8, 16, &vpx_sad8x16_mmi),
+ SadMxNParam(8, 8, &vpx_sad8x8_mmi),
+ SadMxNParam(8, 4, &vpx_sad8x4_mmi),
+ SadMxNParam(4, 8, &vpx_sad4x8_mmi),
+ SadMxNParam(4, 4, &vpx_sad4x4_mmi),
+};
+INSTANTIATE_TEST_CASE_P(MMI, SADTest, ::testing::ValuesIn(mmi_tests));  // instantiates SADTest (declared outside this hunk) once per table entry, under the "MMI" prefix
+
+const SadMxNAvgParam avg_mmi_tests[] = {  // Loongson MMI SAD-with-second-predictor ("avg") kernels; args are presumably (width, height, function)
+ SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_mmi),
+ SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_mmi),
+ SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_mmi),
+ SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_mmi),
+ SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_mmi),
+ SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_mmi),
+ SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_mmi),
+ SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_mmi),
+ SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_mmi),
+ SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_mmi),
+ SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_mmi),
+ SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_mmi),
+ SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_mmi),
+};
+INSTANTIATE_TEST_CASE_P(MMI, SADavgTest, ::testing::ValuesIn(avg_mmi_tests));  // instantiates SADavgTest (declared outside this hunk) once per table entry
+
+const SadMxNx4Param x4d_mmi_tests[] = {  // Loongson MMI four-reference ("x4d") SAD kernels; args are presumably (width, height, function)
+ SadMxNx4Param(64, 64, &vpx_sad64x64x4d_mmi),
+ SadMxNx4Param(64, 32, &vpx_sad64x32x4d_mmi),
+ SadMxNx4Param(32, 64, &vpx_sad32x64x4d_mmi),
+ SadMxNx4Param(32, 32, &vpx_sad32x32x4d_mmi),
+ SadMxNx4Param(32, 16, &vpx_sad32x16x4d_mmi),
+ SadMxNx4Param(16, 32, &vpx_sad16x32x4d_mmi),
+ SadMxNx4Param(16, 16, &vpx_sad16x16x4d_mmi),
+ SadMxNx4Param(16, 8, &vpx_sad16x8x4d_mmi),
+ SadMxNx4Param(8, 16, &vpx_sad8x16x4d_mmi),
+ SadMxNx4Param(8, 8, &vpx_sad8x8x4d_mmi),
+ SadMxNx4Param(8, 4, &vpx_sad8x4x4d_mmi),
+ SadMxNx4Param(4, 8, &vpx_sad4x8x4d_mmi),
+ SadMxNx4Param(4, 4, &vpx_sad4x4x4d_mmi),
+};
+INSTANTIATE_TEST_CASE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests));  // instantiates SADx4Test (declared outside this hunk) once per table entry
+#endif // HAVE_MMI
} // namespace
--- /dev/null
+++ b/vpx_dsp/mips/sad_mmi.c
@@ -1,0 +1,805 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/asmdefs_mmi.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#define SAD_SRC_REF_ABS_SUB_64 /* One 64-byte row: ftmp5 += sum |src[i] - ref[i]|, i = 0..63. Clobbers ftmp1-ftmp4; does not advance src/ref. */ \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" /* bytes 0-15: unaligned 8-byte loads (left/right pair) */ \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" /* per-byte |src - ref| */ \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" /* horizontal sum of the 8 byte differences */ \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" /* accumulate into the running total */ \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" /* bytes 16-31 */ \
+ "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" /* bytes 32-47 */ \
+ "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" /* bytes 48-63 */ \
+ "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_REF_ABS_SUB_32 /* One 32-byte row: ftmp5 += sum |src[i] - ref[i]|, i = 0..31. Clobbers ftmp1-ftmp4; does not advance src/ref. */ \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" /* bytes 0-15: unaligned 8-byte loads (left/right pair) */ \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" /* per-byte |src - ref| */ \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" /* horizontal sum of the 8 byte differences */ \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" /* accumulate into the running total */ \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" /* bytes 16-31 */ \
+ "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_REF_ABS_SUB_16 /* One 16-byte row: ftmp5 += sum |src[i] - ref[i]|, i = 0..15. Clobbers ftmp1-ftmp4; does not advance src/ref. */ \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" /* unaligned 8-byte loads (left/right pair) */ \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" /* per-byte |src - ref| */ \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" /* horizontal sum of the 8 byte differences */ \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" /* accumulate into the running total */ \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_REF_ABS_SUB_8 /* One 8-byte row: ftmp3 += sum |src[i] - ref[i]|, i = 0..7. Clobbers ftmp1-ftmp2; note the accumulator is ftmp3, not ftmp5. */ \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" /* unaligned 8-byte load of the source row */ \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" /* unaligned 8-byte load of the reference row */ \
+ "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" /* per-byte |src - ref| */ \
+ "biadd %[ftmp1], %[ftmp1] \n\t" /* horizontal sum of the 8 byte differences */ \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+
+#if _MIPS_SIM == _ABIO32  // O32 variant: 4-byte loads go through a general-purpose register and mtc1
+#define SAD_SRC_REF_ABS_SUB_4 /* One 4-byte row: ftmp3 += sum |src[i] - ref[i]|, i = 0..3. Needs a GPR operand named tmp0 in the enclosing asm statement. */ \
+ "ulw %[tmp0], 0x00(%[src]) \n\t" /* unaligned 32-bit load of the source row */ \
+ "mtc1 %[tmp0], %[ftmp1] \n\t" \
+ "ulw %[tmp0], 0x00(%[ref]) \n\t" /* unaligned 32-bit load of the reference row */ \
+ "mtc1 %[tmp0], %[ftmp2] \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" /* per-byte |src - ref| */ \
+ "mthc1 $0, %[ftmp1] \n\t" /* zero the upper 32 bits so biadd only sums the 4 valid bytes */ \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */
+#define SAD_SRC_REF_ABS_SUB_4 /* Same contract as the O32 variant, but loads 4 bytes straight into the FP register (no tmp0 needed). */ \
+ "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" /* unaligned 32-bit load of the source row */ \
+ "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" /* unaligned 32-bit load of the reference row */ \
+ "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" /* per-byte |src - ref| */ \
+ "mthc1 $0, %[ftmp1] \n\t" /* zero the upper 32 bits so biadd only sums the 4 valid bytes */ \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+#endif /* _MIPS_SIM == _ABIO32 */
+
+#define SAD_SRC_AVGREF_ABS_SUB_64 /* One 64-byte row: ftmp5 += sum |src[i] - pavgb(second_pred[i], ref[i])|, i = 0..63. Clobbers ftmp1-ftmp4; advances no pointer. */ \
+ "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" /* bytes 0-15: load second_pred and ref */ \
+ "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" /* per-byte average of second_pred and ref */ \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" /* then load src and compare against the averaged prediction */ \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" /* per-byte absolute difference */ \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" /* horizontal sum of the 8 byte differences */ \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" /* accumulate into the running total */ \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" /* bytes 16-31 */ \
+ "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x27(%[second_pred]) \n\t" /* bytes 32-47 */ \
+ "gsldrc1 %[ftmp1], 0x20(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x2f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x28(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x37(%[second_pred]) \n\t" /* bytes 48-63 */ \
+ "gsldrc1 %[ftmp1], 0x30(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x3f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x38(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_AVGREF_ABS_SUB_32 /* One 32-byte row: ftmp5 += sum |src[i] - pavgb(second_pred[i], ref[i])|, i = 0..31. Clobbers ftmp1-ftmp4; advances no pointer. */ \
+ "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" /* bytes 0-15: load second_pred and ref */ \
+ "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" /* per-byte average of second_pred and ref */ \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" /* then load src and compare against the averaged prediction */ \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" /* per-byte absolute difference */ \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" /* horizontal sum of the 8 byte differences */ \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" /* accumulate into the running total */ \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" /* bytes 16-31 */ \
+ "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_AVGREF_ABS_SUB_16 /* One 16-byte row: ftmp5 += sum |src[i] - pavgb(second_pred[i], ref[i])|, i = 0..15. Clobbers ftmp1-ftmp4; advances no pointer. */ \
+ "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" /* load second_pred and ref */ \
+ "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \
+ "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \
+ "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \
+ "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" /* per-byte average of second_pred and ref */ \
+ "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" /* then load src and compare against the averaged prediction */ \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \
+ "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" /* per-byte absolute difference */ \
+ "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "biadd %[ftmp1], %[ftmp1] \n\t" /* horizontal sum of the 8 byte differences */ \
+ "biadd %[ftmp2], %[ftmp2] \n\t" \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" /* accumulate into the running total */ \
+ "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+#define SAD_SRC_AVGREF_ABS_SUB_8 /* One 8-byte row: ftmp3 += sum |src[i] - pavgb(second_pred[i], ref[i])|, i = 0..7. Clobbers ftmp1-ftmp2; accumulator is ftmp3. */ \
+ "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" /* unaligned 8-byte load of second_pred */ \
+ "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" /* unaligned 8-byte load of ref */ \
+ "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \
+ "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" /* per-byte average of second_pred and ref */ \
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" /* unaligned 8-byte load of src */ \
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" /* per-byte absolute difference */ \
+ "biadd %[ftmp1], %[ftmp1] \n\t" /* horizontal sum of the 8 byte differences */ \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+
+#if _MIPS_SIM == _ABIO32  // O32 variant: 4-byte loads go through a general-purpose register and mtc1
+#define SAD_SRC_AVGREF_ABS_SUB_4 /* One 4-byte row: ftmp3 += sum |src[i] - pavgb(second_pred[i], ref[i])|, i = 0..3. Needs a GPR operand named tmp0. */ \
+ "ulw %[tmp0], 0x00(%[second_pred]) \n\t" /* unaligned 32-bit load of second_pred */ \
+ "mtc1 %[tmp0], %[ftmp1] \n\t" \
+ "ulw %[tmp0], 0x00(%[ref]) \n\t" /* unaligned 32-bit load of ref */ \
+ "mtc1 %[tmp0], %[ftmp2] \n\t" \
+ "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" /* per-byte average of second_pred and ref */ \
+ "ulw %[tmp0], 0x00(%[src]) \n\t" /* load only the 4 source bytes of this row (was an 8-byte load that read past the block) */ \
+ "mtc1 %[tmp0], %[ftmp1] \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" /* per-byte absolute difference */ \
+ "mthc1 $0, %[ftmp1] \n\t" /* zero the upper 32 bits so biadd only sums the 4 valid bytes */ \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */
+#define SAD_SRC_AVGREF_ABS_SUB_4 /* Same contract as the O32 variant, but loads 4 bytes straight into the FP register (no tmp0 needed). */ \
+ "gslwlc1 %[ftmp1], 0x03(%[second_pred]) \n\t" /* unaligned 32-bit load of second_pred */ \
+ "gslwrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \
+ "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" /* unaligned 32-bit load of ref */ \
+ "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \
+ "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" /* per-byte average of second_pred and ref */ \
+ "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" /* load only the 4 source bytes of this row (was an 8-byte load that read past the block) */ \
+ "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" \
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" /* per-byte absolute difference */ \
+ "mthc1 $0, %[ftmp1] \n\t" /* zero the upper 32 bits so biadd only sums the 4 valid bytes */ \
+ "biadd %[ftmp1], %[ftmp1] \n\t" \
+ "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+#endif /* _MIPS_SIM == _ABIO32 */
+
+// Generates vpx_sad<m>x<n>x<k>_mmi(): sad_array[j] = SAD of src against the block starting j bytes after ref_array, for j = 0..k-1.
+// TODO carried over: depending on call sites, pass **ref_array to avoid & in subsequent call and de-dup with 4D below.
+#define sadMxNxK_mmi(m, n, k) \
+ void vpx_sad##m##x##n##x##k##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref_array, int ref_stride, \
+ uint32_t *sad_array) { \
+ int idx; \
+ for (idx = 0; idx < k; ++idx) { \
+ sad_array[idx] = vpx_sad##m##x##n##_mmi(src, src_stride, ref_array + idx, ref_stride); \
+ } \
+ }
+
+// Generates vpx_sad<m>x<n>x4d_mmi(): sad_array[j] = SAD of src against ref_array[j], j = 0..3 (appears equivalent to the macro above when k == 4 and refs is const).
+#define sadMxNx4D_mmi(m, n) \
+ void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int idx; \
+ for (idx = 0; idx < 4; ++idx) { \
+ sad_array[idx] = vpx_sad##m##x##n##_mmi(src, src_stride, ref_array[idx], ref_stride); \
+ } \
+ }
+// Returns the sum of absolute differences between two 64-byte-wide blocks of `counter` rows, computed with Loongson MMI.
+static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride,  // source block and its row pitch in bytes
+ const uint8_t *ref, int ref_stride,  // reference block and its row pitch in bytes
+ int counter) {  // row count; must be even and > 0 (the loop consumes two rows per pass)
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;  // FP scratch registers; ftmp5 is the accumulator
+ mips_reg l_counter = counter;  // register-width copy that the asm decrements
+
+ __asm__ volatile (
+ "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"  // accumulator = 0
+ "1: \n\t"
+ // The loop body is unrolled twice (two rows per pass) to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_64
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_64
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"  // move the low 32 bits of the total to the result
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ : "memory"  // the asm reads through src/ref, which are passed as plain registers
+ );
+ return sad;
+}
+
+#define vpx_sad64xN(H) /* Defines vpx_sad64x<H>_mmi(): SAD of a 64-wide, H-row block, delegating to vpx_sad64x(). */ \
+ unsigned int vpx_sad64x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad64x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad64xN(64);  // vpx_sad64x64_mmi
+vpx_sad64xN(32);  // vpx_sad64x32_mmi
+sadMxNx4D_mmi(64, 64);  // vpx_sad64x64x4d_mmi
+sadMxNx4D_mmi(64, 32);  // vpx_sad64x32x4d_mmi
+// Returns the SAD between a 64-byte-wide source block and the per-byte average of ref and second_pred, over `counter` rows.
+static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride,  // source block and its row pitch in bytes
+ const uint8_t *ref, int ref_stride,  // reference block and its row pitch in bytes
+ const uint8_t *second_pred,  // second predictor, read contiguously: 64 bytes per row
+ int counter) {  // row count; must be even and > 0 (the loop consumes two rows per pass)
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;  // FP scratch registers; ftmp5 is the accumulator
+ mips_reg l_counter = counter;  // register-width copy that the asm decrements
+ mips_reg l_second_pred = (mips_reg)second_pred;  // real lvalue for the "+&r" operand (a cast expression is not one)
+ __asm__ volatile (
+ "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"  // accumulator = 0
+ "1: \n\t"
+ // The loop body is unrolled twice (two rows per pass) to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_64
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x40)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_64
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x40)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"  // move the low 32 bits of the total to the result
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"(l_second_pred),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ : "memory"  // the asm reads through src/ref/second_pred, which are passed as plain registers
+ );
+ return sad;
+}
+
+#define vpx_sad_avg64xN(H) /* Defines vpx_sad64x<H>_avg_mmi(): averaged-prediction SAD of a 64-wide, H-row block via vpx_sad_avg64x(). */ \
+ unsigned int vpx_sad64x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg64x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg64xN(64);  // vpx_sad64x64_avg_mmi
+vpx_sad_avg64xN(32);  // vpx_sad64x32_avg_mmi
+// Returns the sum of absolute differences between two 32-byte-wide blocks of `counter` rows, computed with Loongson MMI.
+static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride,  // source block and its row pitch in bytes
+ const uint8_t *ref, int ref_stride,  // reference block and its row pitch in bytes
+ int counter) {  // row count; must be even and > 0 (the loop consumes two rows per pass)
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;  // FP scratch registers; ftmp5 is the accumulator
+ mips_reg l_counter = counter;  // register-width copy that the asm decrements
+
+ __asm__ volatile (
+ "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"  // accumulator = 0
+ "1: \n\t"
+ // The loop body is unrolled twice (two rows per pass) to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_32
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_32
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"  // move the low 32 bits of the total to the result
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ : "memory"  // the asm reads through src/ref, which are passed as plain registers
+ );
+ return sad;
+}
+
+#define vpx_sad32xN(H) /* Defines vpx_sad32x<H>_mmi(): SAD of a 32-wide, H-row block, delegating to vpx_sad32x(). */ \
+ unsigned int vpx_sad32x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad32x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad32xN(64);  // vpx_sad32x64_mmi
+vpx_sad32xN(32);  // vpx_sad32x32_mmi
+vpx_sad32xN(16);  // vpx_sad32x16_mmi
+sadMxNx4D_mmi(32, 64);  // vpx_sad32x64x4d_mmi
+sadMxNx4D_mmi(32, 32);  // vpx_sad32x32x4d_mmi
+sadMxNx4D_mmi(32, 16);  // vpx_sad32x16x4d_mmi
+// Returns the SAD between a 32-byte-wide source block and the per-byte average of ref and second_pred, over `counter` rows.
+static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride,  // source block and its row pitch in bytes
+ const uint8_t *ref, int ref_stride,  // reference block and its row pitch in bytes
+ const uint8_t *second_pred,  // second predictor, read contiguously: 32 bytes per row
+ int counter) {  // row count; must be even and > 0 (the loop consumes two rows per pass)
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;  // FP scratch registers; ftmp5 is the accumulator
+ mips_reg l_counter = counter;  // register-width copy that the asm decrements
+ mips_reg l_second_pred = (mips_reg)second_pred;  // real lvalue for the "+&r" operand (a cast expression is not one)
+ __asm__ volatile (
+ "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"  // accumulator = 0
+ "1: \n\t"
+ // The loop body is unrolled twice (two rows per pass) to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_32
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x20)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_32
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x20)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"  // move the low 32 bits of the total to the result
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"(l_second_pred),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ : "memory"  // the asm reads through src/ref/second_pred, which are passed as plain registers
+ );
+ return sad;
+}
+
+#define vpx_sad_avg32xN(H) /* Defines vpx_sad32x<H>_avg_mmi(): averaged-prediction SAD of a 32-wide, H-row block via vpx_sad_avg32x(). */ \
+ unsigned int vpx_sad32x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg32x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg32xN(64);  // vpx_sad32x64_avg_mmi
+vpx_sad_avg32xN(32);  // vpx_sad32x32_avg_mmi
+vpx_sad_avg32xN(16);  // vpx_sad32x16_avg_mmi
+// Returns the sum of absolute differences between two 16-byte-wide blocks of `counter` rows, computed with Loongson MMI.
+static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,  // source block and its row pitch in bytes
+ const uint8_t *ref, int ref_stride,  // reference block and its row pitch in bytes
+ int counter) {  // row count; must be even and > 0 (the loop consumes two rows per pass)
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;  // FP scratch registers; ftmp5 is the accumulator
+ mips_reg l_counter = counter;  // register-width copy that the asm decrements
+
+ __asm__ volatile (
+ "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"  // accumulator = 0
+ "1: \n\t"
+ // The loop body is unrolled twice (two rows per pass) to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_16
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_16
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"  // move the low 32 bits of the total to the result
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ : "memory"  // the asm reads through src/ref, which are passed as plain registers
+ );
+ return sad;
+}
+
+#define vpx_sad16xN(H) /* Defines vpx_sad16x<H>_mmi(): SAD of a 16-wide, H-row block, delegating to vpx_sad16x(). */ \
+ unsigned int vpx_sad16x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad16x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad16xN(32);  // vpx_sad16x32_mmi
+vpx_sad16xN(16);  // vpx_sad16x16_mmi
+vpx_sad16xN(8);  // vpx_sad16x8_mmi
+sadMxNxK_mmi(16, 16, 3);  // vpx_sad16x16x3_mmi
+sadMxNxK_mmi(16, 16, 8);  // vpx_sad16x16x8_mmi
+sadMxNxK_mmi(16, 8, 3);  // vpx_sad16x8x3_mmi
+sadMxNxK_mmi(16, 8, 8);  // vpx_sad16x8x8_mmi
+sadMxNx4D_mmi(16, 32);  // vpx_sad16x32x4d_mmi
+sadMxNx4D_mmi(16, 16);  // vpx_sad16x16x4d_mmi
+sadMxNx4D_mmi(16, 8);  // vpx_sad16x8x4d_mmi
+// Returns the SAD between a 16-byte-wide source block and the per-byte average of ref and second_pred, over `counter` rows.
+static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride,  // source block and its row pitch in bytes
+ const uint8_t *ref, int ref_stride,  // reference block and its row pitch in bytes
+ const uint8_t *second_pred,  // second predictor, read contiguously: 16 bytes per row
+ int counter) {  // row count; must be even and > 0 (the loop consumes two rows per pass)
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;  // FP scratch registers; ftmp5 is the accumulator
+ mips_reg l_counter = counter;  // register-width copy that the asm decrements
+ mips_reg l_second_pred = (mips_reg)second_pred;  // real lvalue for the "+&r" operand (a cast expression is not one)
+ __asm__ volatile (
+ "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"  // accumulator = 0
+ "1: \n\t"
+ // The loop body is unrolled twice (two rows per pass) to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_16
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x10)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_16
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x10)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp5] \n\t"  // move the low 32 bits of the total to the result
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+ [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"(l_second_pred),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ : "memory"  // the asm reads through src/ref/second_pred, which are passed as plain registers
+ );
+ return sad;
+}
+
+#define vpx_sad_avg16xN(H) /* Defines vpx_sad16x<H>_avg_mmi(): averaged-prediction SAD of a 16-wide, H-row block via vpx_sad_avg16x(). */ \
+ unsigned int vpx_sad16x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg16x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg16xN(32);  // vpx_sad16x32_avg_mmi
+vpx_sad_avg16xN(16);  // vpx_sad16x16_avg_mmi
+vpx_sad_avg16xN(8);  // vpx_sad16x8_avg_mmi
+// Returns the sum of absolute differences between two 8-byte-wide blocks of `counter` rows, computed with Loongson MMI.
+static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,  // source block and its row pitch in bytes
+ const uint8_t *ref, int ref_stride,  // reference block and its row pitch in bytes
+ int counter) {  // row count; must be even and > 0 (the loop consumes two rows per pass)
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3;  // FP scratch registers; ftmp3 is the accumulator
+ mips_reg l_counter = counter;  // register-width copy that the asm decrements
+
+ __asm__ volatile (
+ "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"  // accumulator = 0
+ "1: \n\t"
+ // The loop body is unrolled twice (two rows per pass) to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_8
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_8
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp3] \n\t"  // move the low 32 bits of the total to the result
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ : "memory"  // the asm reads through src/ref, which are passed as plain registers
+ );
+ return sad;
+}
+
+#define vpx_sad8xN(H) /* Defines vpx_sad8x<H>_mmi(): SAD of an 8-wide, H-row block, delegating to vpx_sad8x(). */ \
+ unsigned int vpx_sad8x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad8x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad8xN(16);  // vpx_sad8x16_mmi
+vpx_sad8xN(8);  // vpx_sad8x8_mmi
+vpx_sad8xN(4);  // vpx_sad8x4_mmi
+sadMxNxK_mmi(8, 16, 3);  // vpx_sad8x16x3_mmi
+sadMxNxK_mmi(8, 16, 8);  // vpx_sad8x16x8_mmi
+sadMxNxK_mmi(8, 8, 3);  // vpx_sad8x8x3_mmi
+sadMxNxK_mmi(8, 8, 8);  // vpx_sad8x8x8_mmi
+sadMxNx4D_mmi(8, 16);  // vpx_sad8x16x4d_mmi
+sadMxNx4D_mmi(8, 8);  // vpx_sad8x8x4d_mmi
+sadMxNx4D_mmi(8, 4);  // vpx_sad8x4x4d_mmi
+// Returns the SAD between an 8-byte-wide source block and the per-byte average of ref and second_pred, over `counter` rows.
+static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride,  // source block and its row pitch in bytes
+ const uint8_t *ref, int ref_stride,  // reference block and its row pitch in bytes
+ const uint8_t *second_pred,  // second predictor, read contiguously: 8 bytes per row
+ int counter) {  // row count; must be even and > 0 (the loop consumes two rows per pass)
+ unsigned int sad;
+ double ftmp1, ftmp2, ftmp3;  // FP scratch registers; ftmp3 is the accumulator
+ mips_reg l_counter = counter;  // register-width copy that the asm decrements
+ mips_reg l_second_pred = (mips_reg)second_pred;  // real lvalue for the "+&r" operand (a cast expression is not one)
+ __asm__ volatile (
+ "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"  // accumulator = 0
+ "1: \n\t"
+ // The loop body is unrolled twice (two rows per pass) to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_8
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x08)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_8
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x08)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp3] \n\t"  // move the low 32 bits of the total to the result
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"(l_second_pred),
+ [sad]"=&r"(sad)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ : "memory"  // the asm reads through src/ref/second_pred, which are passed as plain registers
+ );
+ return sad;
+}
+
+#define vpx_sad_avg8xN(H) /* Defines vpx_sad8x<H>_avg_mmi(): averaged-prediction SAD of an 8-wide, H-row block via vpx_sad_avg8x(). */ \
+ unsigned int vpx_sad8x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg8x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg8xN(16);  // vpx_sad8x16_avg_mmi
+vpx_sad_avg8xN(8);  // vpx_sad8x8_avg_mmi
+vpx_sad_avg8xN(4);  // vpx_sad8x4_avg_mmi
+// Returns the sum of absolute differences between two 4-byte-wide blocks of `counter` rows, computed with Loongson MMI.
+static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,  // source block and its row pitch in bytes
+ const uint8_t *ref, int ref_stride,  // reference block and its row pitch in bytes
+ int counter) {  // row count; must be even and > 0 (the loop consumes two rows per pass)
+ unsigned int sad, tmp0;  // tmp0: GPR scratch used by the O32 form of SAD_SRC_REF_ABS_SUB_4 (unused otherwise)
+ double ftmp1, ftmp2, ftmp3;  // FP scratch registers; ftmp3 is the accumulator
+ mips_reg l_counter = counter;  // register-width copy that the asm decrements
+
+ __asm__ volatile (
+ "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"  // accumulator = 0
+ "1: \n\t"
+ // The loop body is unrolled twice (two rows per pass) to reduce loop overhead.
+ SAD_SRC_REF_ABS_SUB_4
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_REF_ABS_SUB_4
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp3] \n\t"  // move the low 32 bits of the total to the result
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+ [sad]"=&r"(sad), [tmp0]"=&r"(tmp0)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ : "memory"  // the asm reads through src/ref, which are passed as plain registers
+ );
+ return sad;
+}
+
+#define vpx_sad4xN(H) /* Defines vpx_sad4x<H>_mmi(): SAD of a 4-wide, H-row block, delegating to vpx_sad4x(). */ \
+ unsigned int vpx_sad4x##H##_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return vpx_sad4x(src, src_stride, ref, ref_stride, H); \
+ }
+
+vpx_sad4xN(8);  // vpx_sad4x8_mmi
+vpx_sad4xN(4);  // vpx_sad4x4_mmi
+sadMxNxK_mmi(4, 4, 3);  // vpx_sad4x4x3_mmi
+sadMxNxK_mmi(4, 4, 8);  // vpx_sad4x4x8_mmi
+sadMxNx4D_mmi(4, 8);  // vpx_sad4x8x4d_mmi
+sadMxNx4D_mmi(4, 4);  // vpx_sad4x4x4d_mmi
+// Returns the SAD between a 4-byte-wide source block and the per-byte average of ref and second_pred, over `counter` rows.
+static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride,  // source block and its row pitch in bytes
+ const uint8_t *ref, int ref_stride,  // reference block and its row pitch in bytes
+ const uint8_t *second_pred,  // second predictor, read contiguously: 4 bytes per row
+ int counter) {  // row count; must be even and > 0 (the loop consumes two rows per pass)
+ unsigned int sad, tmp0;  // tmp0: GPR scratch used by the O32 form of SAD_SRC_AVGREF_ABS_SUB_4 (unused otherwise)
+ double ftmp1, ftmp2, ftmp3;  // FP scratch registers; ftmp3 is the accumulator
+ mips_reg l_counter = counter;  // register-width copy that the asm decrements
+ mips_reg l_second_pred = (mips_reg)second_pred;  // real lvalue for the "+&r" operand (a cast expression is not one)
+ __asm__ volatile (
+ "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"  // accumulator = 0
+ "1: \n\t"
+ // The loop body is unrolled twice (two rows per pass) to reduce loop overhead.
+ SAD_SRC_AVGREF_ABS_SUB_4
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x04)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ SAD_SRC_AVGREF_ABS_SUB_4
+ MMI_ADDIU(%[second_pred], %[second_pred], 0x04)
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[ref], %[ref], %[ref_stride])
+ MMI_ADDIU(%[counter], %[counter], -0x02)
+ "bnez %[counter], 1b \n\t"
+ "mfc1 %[sad], %[ftmp3] \n\t"  // move the low 32 bits of the total to the result
+ : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+ [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+ [second_pred]"+&r"(l_second_pred),
+ [sad]"=&r"(sad), [tmp0]"=&r"(tmp0)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [ref_stride]"r"((mips_reg)ref_stride)
+ : "memory"  // the asm reads through src/ref/second_pred, which are passed as plain registers
+ );
+ return sad;
+}
+
+#define vpx_sad_avg4xN(H) /* Defines vpx_sad4x<H>_avg_mmi(): averaged-prediction SAD of a 4-wide, H-row block via vpx_sad_avg4x(). */ \
+ unsigned int vpx_sad4x##H##_avg_mmi(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return vpx_sad_avg4x(src, src_stride, ref, ref_stride, second_pred, H); \
+ }
+
+vpx_sad_avg4xN(8);  // vpx_sad4x8_avg_mmi
+vpx_sad_avg4xN(4);  // vpx_sad4x4_avg_mmi
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -318,6 +318,7 @@
DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c
+DSP_SRCS-$(HAVE_MMI) += mips/sad_mmi.c
DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c
DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -696,43 +696,43 @@
# Single block SAD
#
add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx/;
+specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx/;
+specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx/;
+specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx/;
+specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx/;
+specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x32 neon msa sse2 vsx/;
+specialize qw/vpx_sad16x32 neon msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x16 neon msa sse2 vsx/;
+specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x8 neon msa sse2 vsx/;
+specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x16 neon msa sse2/;
+specialize qw/vpx_sad8x16 neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x8 neon msa sse2/;
+specialize qw/vpx_sad8x8 neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x4 neon msa sse2/;
+specialize qw/vpx_sad8x4 neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x8 neon msa sse2/;
+specialize qw/vpx_sad4x8 neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x4 neon msa sse2/;
+specialize qw/vpx_sad4x4 neon msa sse2 mmi/;
#
# Avg
@@ -778,43 +778,43 @@
} # CONFIG_VP9_ENCODER
add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx/;
+specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx/;
+specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx/;
+specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx/;
+specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx/;
+specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x32_avg neon msa sse2 vsx/;
+specialize qw/vpx_sad16x32_avg neon msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x16_avg neon msa sse2 vsx/;
+specialize qw/vpx_sad16x16_avg neon msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x8_avg neon msa sse2 vsx/;
+specialize qw/vpx_sad16x8_avg neon msa sse2 vsx mmi/;
add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x16_avg neon msa sse2/;
+specialize qw/vpx_sad8x16_avg neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x8_avg neon msa sse2/;
+specialize qw/vpx_sad8x8_avg neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x4_avg neon msa sse2/;
+specialize qw/vpx_sad8x4_avg neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x8_avg neon msa sse2/;
+specialize qw/vpx_sad4x8_avg neon msa sse2 mmi/;
add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x4_avg neon msa sse2/;
+specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/;
#
# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
@@ -821,77 +821,77 @@
#
# Blocks of 3
add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x3 sse3 ssse3 msa/;
+specialize qw/vpx_sad16x16x3 sse3 ssse3 msa mmi/;
add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x3 sse3 ssse3 msa/;
+specialize qw/vpx_sad16x8x3 sse3 ssse3 msa mmi/;
add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x3 sse3 msa/;
+specialize qw/vpx_sad8x16x3 sse3 msa mmi/;
add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x3 sse3 msa/;
+specialize qw/vpx_sad8x8x3 sse3 msa mmi/;
add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x3 sse3 msa/;
+specialize qw/vpx_sad4x4x3 sse3 msa mmi/;
# Blocks of 8
add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x8 sse4_1 msa/;
+specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/;
add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x8 sse4_1 msa/;
+specialize qw/vpx_sad16x8x8 sse4_1 msa mmi/;
add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x8 sse4_1 msa/;
+specialize qw/vpx_sad8x16x8 sse4_1 msa mmi/;
add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x8 sse4_1 msa/;
+specialize qw/vpx_sad8x8x8 sse4_1 msa mmi/;
add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x8 sse4_1 msa/;
+specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/;
#
# Multi-block SAD, comparing a reference to N independent blocks
#
add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x4d avx2 neon msa sse2 vsx/;
+specialize qw/vpx_sad64x64x4d avx2 neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x32x4d neon msa sse2 vsx/;
+specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x64x4d neon msa sse2 vsx/;
+specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx/;
+specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x16x4d neon msa sse2 vsx/;
+specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x32x4d neon msa sse2 vsx/;
+specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x4d neon msa sse2 vsx/;
+specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x4d neon msa sse2 vsx/;
+specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/;
add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x4d neon msa sse2/;
+specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/;
add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x4d neon msa sse2/;
+specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/;
add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x4x4d neon msa sse2/;
+specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/;
add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x8x4d neon msa sse2/;
+specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/;
add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x4d neon msa sse2/;
+specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;
add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
specialize qw/vpx_sum_squares_2d_i16 sse2 msa/;