ref: ece1989fa2da10eecf1cb1dde1d5f8afa9480b8b
parent: 34e48d611553989954a5410562fbfec21abe544d
parent: 9e4647c7abc3ec69fe3ddd7537d7e9954c4d3596
author: Shiyou Yin <yinshiyou-hf@loongson.cn>
date: Fri Aug 25 02:44:02 EDT 2017
Merge "vpx_dsp:loongson optimize vpx_varianceWxH_c,vpx_sub_pixel_varianceWxH_c and vpx_sub_pixel_avg_varianceWxH_c with mmi."
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1547,5 +1547,55 @@
MseParams(4, 3, &vpx_mse16x8_mmi),
MseParams(3, 4, &vpx_mse8x16_mmi),
MseParams(3, 3, &vpx_mse8x8_mmi)));
+
+INSTANTIATE_TEST_CASE_P(
+ MMI, VpxVarianceTest,
+ ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_mmi),
+ VarianceParams(6, 5, &vpx_variance64x32_mmi),
+ VarianceParams(5, 6, &vpx_variance32x64_mmi),
+ VarianceParams(5, 5, &vpx_variance32x32_mmi),
+ VarianceParams(5, 4, &vpx_variance32x16_mmi),
+ VarianceParams(4, 5, &vpx_variance16x32_mmi),
+ VarianceParams(4, 4, &vpx_variance16x16_mmi),
+ VarianceParams(4, 3, &vpx_variance16x8_mmi),
+ VarianceParams(3, 4, &vpx_variance8x16_mmi),
+ VarianceParams(3, 3, &vpx_variance8x8_mmi),
+ VarianceParams(3, 2, &vpx_variance8x4_mmi),
+ VarianceParams(2, 3, &vpx_variance4x8_mmi),
+ VarianceParams(2, 2, &vpx_variance4x4_mmi)));
+
+INSTANTIATE_TEST_CASE_P(
+ MMI, VpxSubpelVarianceTest,
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_mmi, 0),
+ SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_mmi, 0),
+ SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_mmi, 0),
+ SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_mmi, 0),
+ SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_mmi, 0),
+ SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_mmi, 0),
+ SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_mmi, 0),
+ SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_mmi, 0),
+ SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_mmi, 0),
+ SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_mmi, 0),
+ SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_mmi, 0),
+ SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_mmi, 0),
+ SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_mmi, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+ MMI, VpxSubpelAvgVarianceTest,
+ ::testing::Values(
+ SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_mmi, 0),
+ SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_mmi, 0),
+ SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_mmi, 0),
+ SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_mmi, 0),
+ SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_mmi, 0),
+ SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_mmi, 0),
+ SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_mmi, 0),
+ SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_mmi, 0),
+ SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_mmi, 0),
+ SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_mmi, 0),
+ SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_mmi, 0),
+ SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_mmi, 0),
+ SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_mmi, 0)));
#endif // HAVE_MMI
} // namespace
--- a/vpx_dsp/mips/variance_mmi.c
+++ b/vpx_dsp/mips/variance_mmi.c
@@ -9,10 +9,97 @@
*/
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/variance.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/asmdefs_mmi.h"
+static const uint8_t bilinear_filters[8][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+/* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64, vpx_variance64x32 and
+   vpx_variance32x64; the 16-bit sum in VARIANCE_SSE_SUM_8 would overflow. */
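+/* With VARIANCE_SSE_SUM_8 each 16-bit accumulator lane gathers W * H / 4 byte
+   values, e.g. up to 255 * 64 * 64 / 4 = 261,120 for a 64x64 block, which no
+   longer fits in a halfword; the variant below therefore widens the
+   differences to 32-bit words before accumulating. */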
+#define VARIANCE_SSE_SUM_8_FOR_W64 \
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
+ \
+ /* sum */ \
+ "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
+ "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \
+ "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \
+ "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \
+ "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \
+ "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \
+ "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \
+ "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \
+ "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \
+ "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" \
+ \
+ /* *sse */ \
+ "pmuluw %[ftmp1], %[ftmp3], %[ftmp3] \n\t" \
+ "pmuluw %[ftmp2], %[ftmp5], %[ftmp5] \n\t" \
+ "pmuluw %[ftmp7], %[ftmp4], %[ftmp4] \n\t" \
+ "pmuluw %[ftmp8], %[ftmp6], %[ftmp6] \n\t" \
+ "paddw %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \
+ "paddw %[ftmp10], %[ftmp10], %[ftmp2] \n\t" \
+ "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \
+ "paddw %[ftmp10], %[ftmp10], %[ftmp8] \n\t" \
+ "dsrl %[ftmp3], %[ftmp3], %[ftmp11] \n\t" \
+ "dsrl %[ftmp5], %[ftmp5], %[ftmp11] \n\t" \
+ "dsrl %[ftmp4], %[ftmp4], %[ftmp11] \n\t" \
+ "dsrl %[ftmp6], %[ftmp6], %[ftmp11] \n\t" \
+ "pmuluw %[ftmp1], %[ftmp3], %[ftmp3] \n\t" \
+ "pmuluw %[ftmp2], %[ftmp5], %[ftmp5] \n\t" \
+ "pmuluw %[ftmp7], %[ftmp4], %[ftmp4] \n\t" \
+ "pmuluw %[ftmp8], %[ftmp6], %[ftmp6] \n\t" \
+ "paddw %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \
+ "paddw %[ftmp10], %[ftmp10], %[ftmp2] \n\t" \
+ "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \
+ "paddw %[ftmp10], %[ftmp10], %[ftmp8] \n\t"
+
+#define VARIANCE_SSE_SUM_4 \
+ /* sse */ \
+ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
+ "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \
+ "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \
+ \
+ /* sum */ \
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
+ "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
+
+#define VARIANCE_SSE_SUM_8 \
+ /* sse */ \
+ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
+ "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
+ "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
+ "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \
+ \
+ /* sum */ \
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \
+ "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \
+ "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t"
+
#define VARIANCE_SSE_8 \
"gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
"gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
@@ -40,6 +127,780 @@
"paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
"paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \
+ /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \
+ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \
+ /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \
+ "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
+ "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \
+ /* calculate: temp2[0] ~ temp2[3] */ \
+ "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \
+ \
+ /* store: temp2[0] ~ temp2[3] */ \
+ "and %[ftmp2], %[ftmp2], %[mask] \n\t" \
+ "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
+ "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \
+ /* calculate: temp2[0] ~ temp2[3] */ \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
+ "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
+ \
+ /* store: temp2[0] ~ temp2[3] */ \
+ "and %[ftmp4], %[ftmp4], %[mask] \n\t" \
+ "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
+ "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \
+ /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
+ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \
+ "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
+ "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \
+ "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
+ "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \
+ "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \
+ /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \
+ "punpcklbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \
+ "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \
+ "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \
+ "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \
+ "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \
+ "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \
+ "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \
+ "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \
+ /* calculate: temp2[0] ~ temp2[3] */ \
+ "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \
+ \
+ /* calculate: temp2[4] ~ temp2[7] */ \
+ "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \
+ "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \
+ "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \
+ "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \
+ \
+ /* store: temp2[0] ~ temp2[7] */ \
+ "and %[ftmp2], %[ftmp2], %[mask] \n\t" \
+ "and %[ftmp3], %[ftmp3], %[mask] \n\t" \
+ "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
+ "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \
+ "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \
+ /* calculate: temp2[0] ~ temp2[3] */ \
+ "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \
+ "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \
+ \
+ /* calculate: temp2[4] ~ temp2[7] */ \
+ "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \
+ "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \
+ "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \
+ "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \
+ \
+ /* store: temp2[0] ~ temp2[7] */ \
+ "and %[ftmp8], %[ftmp8], %[mask] \n\t" \
+ "and %[ftmp9], %[ftmp9], %[mask] \n\t" \
+ "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
+ "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \
+ "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \
+ /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \
+ \
+ /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \
+ "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \
+ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \
+ "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \
+ "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
+ "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \
+ "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
+ "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
+ "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \
+ "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \
+ /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \
+ \
+ /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \
+ "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \
+ "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \
+ "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \
+ "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \
+ "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \
+ "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \
+ "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \
+ "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \
+ "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \
+ "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \
+ "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \
+ "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \
+ \
+ /* calculate: temp2[8] ~ temp2[11] */ \
+ "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \
+ "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
+ "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \
+ \
+ /* calculate: temp2[12] ~ temp2[15] */ \
+ "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \
+ "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \
+ "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
+ "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \
+ \
+ /* store: temp2[8] ~ temp2[15] */ \
+ "and %[ftmp4], %[ftmp4], %[mask] \n\t" \
+ "and %[ftmp5], %[ftmp5], %[mask] \n\t" \
+ "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
+ "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \
+ "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \
+ \
+ /* calculate: temp2[8] ~ temp2[11] */ \
+ "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \
+ "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \
+ "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \
+ \
+ /* calculate: temp2[12] ~ temp2[15] */ \
+ "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \
+ "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \
+ "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \
+ "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \
+ "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \
+ \
+ /* store: temp2[8] ~ temp2[15] */ \
+ "and %[ftmp10], %[ftmp10], %[mask] \n\t" \
+ "and %[ftmp11], %[ftmp11], %[mask] \n\t" \
+ "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \
+ "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \
+ "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t"
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to
+// implement the first pass of a 2-D separable filter.
+//
+// Produces uint16_t output to retain precision for the next pass. The two
+// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride); it is the offset required to move from one input to
+// the next.
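+//
+// For example, offset 4 selects the { 64, 64 } taps from bilinear_filters, so
+// each output is ROUND_POWER_OF_TWO(64 * a[0] + 64 * a[pixel_step],
+// FILTER_BITS), i.e. the rounded average of two neighbouring pixels
+// (FILTER_BITS is 7 here, matching the 0x40 rounding constant and the
+// shift-by-7 used in the assembly macros above), while offset 0 selects
+// { 128, 0 } and simply copies the source.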
+static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ b[j] = ROUND_POWER_OF_TWO(
+ (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+
+ ++a;
+ }
+
+ a += src_pixels_per_line - output_width;
+ b += output_width;
+ }
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to
+// implement the second pass of a 2-D separable filter.
+//
+// Requires 16-bit input as produced by var_filter_block2d_bil_first_pass. The
+// two filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride); it is the offset required to move from one input to
+// the next. Output is 8-bit.
+static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ b[j] = ROUND_POWER_OF_TWO(
+ (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+ ++a;
+ }
+
+ a += src_pixels_per_line - output_width;
+ b += output_width;
+ }
+}
+
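+// Note: these two C helpers are used by the generic SUBPIX_VAR and
+// SUBPIX_AVG_VAR macros near the end of this file; the 16x16, 16x8 and all
+// 8- and 4-pixel-wide sub-pixel variance functions instead filter directly in
+// MMI assembly via var_filter_block2d_bil_16x/_8x/_4x.
+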
+static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[12];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x27(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x20(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x27(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x20(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x2f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x28(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x2f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x28(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x37(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x30(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x37(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x30(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x3f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x38(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x3f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x38(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+ "mfc1 %[tmp1], %[ftmp9] \n\t"
+ "mfhc1 %[tmp2], %[ftmp9] \n\t"
+ "addu %[sum], %[tmp1], %[tmp2] \n\t"
+ "swc1 %[ftmp10], 0x00(%[sse]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+ [tmp2]"=&r"(tmp[2]),
+ [a]"+&r"(a), [b]"+&r"(b),
+ [sum]"=&r"(sum)
+ : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
+ [high]"r"(&high), [sse]"r"(sse)
+ : "memory"
+ );
+
+ return *sse - (((int64_t)sum * sum) / (64 * high));
+}
+
+#define VPX_VARIANCE64XN(n) \
+ uint32_t vpx_variance64x##n##_mmi(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ return vpx_variance64x(a, a_stride, b, b_stride, sse, n); \
+ }
+
+VPX_VARIANCE64XN(64)
+VPX_VARIANCE64XN(32)
+
+uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, uint32_t *sse) {
+ int sum;
+ double ftmp[12];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "li %[tmp0], 0x40 \n\t"
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8_FOR_W64
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+ "mfc1 %[tmp1], %[ftmp9] \n\t"
+ "mfhc1 %[tmp2], %[ftmp9] \n\t"
+ "addu %[sum], %[tmp1], %[tmp2] \n\t"
+ "swc1 %[ftmp10], 0x00(%[sse]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+ [tmp2]"=&r"(tmp[2]),
+ [a]"+&r"(a), [b]"+&r"(b),
+ [sum]"=&r"(sum)
+ : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
+ [sse]"r"(sse)
+ : "memory"
+ );
+
+ return *sse - (((int64_t)sum * sum) / 2048);
+}
+
+static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[13];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8
+ "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8
+ "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8
+ "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+
+ "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "paddw %[ftmp10], %[ftmp10], %[ftmp3] \n\t"
+ "paddw %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
+ "psubw %[ftmp10], %[ftmp10], %[ftmp5] \n\t"
+ "psubw %[ftmp10], %[ftmp10], %[ftmp6] \n\t"
+ "dsrl %[ftmp0], %[ftmp10], %[ftmp11] \n\t"
+ "paddw %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+ "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
+
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [a]"+&r"(a), [b]"+&r"(b)
+ : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
+ [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+ : "memory"
+ );
+
+ return *sse - (((int64_t)sum * sum) / (32 * high));
+}
+
+#define VPX_VARIANCE32XN(n) \
+ uint32_t vpx_variance32x##n##_mmi(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ return vpx_variance32x(a, a_stride, b, b_stride, sse, n); \
+ }
+
+VPX_VARIANCE32XN(32)
+VPX_VARIANCE32XN(16)
+
+static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[13];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8
+ "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+
+ "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "paddw %[ftmp10], %[ftmp10], %[ftmp3] \n\t"
+ "paddw %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
+ "psubw %[ftmp10], %[ftmp10], %[ftmp5] \n\t"
+ "psubw %[ftmp10], %[ftmp10], %[ftmp6] \n\t"
+ "dsrl %[ftmp0], %[ftmp10], %[ftmp11] \n\t"
+ "paddw %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+ "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
+
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [a]"+&r"(a), [b]"+&r"(b)
+ : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
+ [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+ : "memory"
+ );
+
+ return *sse - (((int64_t)sum * sum) / (16 * high));
+}
+
+#define VPX_VARIANCE16XN(n) \
+ uint32_t vpx_variance16x##n##_mmi(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ return vpx_variance16x(a, a_stride, b, b_stride, sse, n); \
+ }
+
+VPX_VARIANCE16XN(32)
+VPX_VARIANCE16XN(16)
+VPX_VARIANCE16XN(8)
+
+static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[13];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
+ VARIANCE_SSE_SUM_8
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+
+ "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "paddw %[ftmp10], %[ftmp10], %[ftmp3] \n\t"
+ "paddw %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
+ "psubw %[ftmp10], %[ftmp10], %[ftmp5] \n\t"
+ "psubw %[ftmp10], %[ftmp10], %[ftmp6] \n\t"
+ "dsrl %[ftmp0], %[ftmp10], %[ftmp11] \n\t"
+ "paddw %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+ "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
+
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [a]"+&r"(a), [b]"+&r"(b)
+ : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
+ [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+ : "memory"
+ );
+
+ return *sse - (((int64_t)sum * sum) / (8 * high));
+}
+
+#define VPX_VARIANCE8XN(n) \
+ uint32_t vpx_variance8x##n##_mmi(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ return vpx_variance8x(a, a_stride, b, b_stride, sse, n); \
+ }
+
+VPX_VARIANCE8XN(16)
+VPX_VARIANCE8XN(8)
+VPX_VARIANCE8XN(4)
+
+static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse, int high) {
+ int sum;
+ double ftmp[12];
+ uint32_t tmp[3];
+
+ *sse = 0;
+
+ __asm__ volatile (
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp10] \n\t"
+ MMI_L(%[tmp0], %[high], 0x00)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
+ "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
+ "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"
+ VARIANCE_SSE_SUM_4
+
+ "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ MMI_ADDU(%[b], %[b], %[b_stride])
+ "bnez %[tmp0], 1b \n\t"
+
+ "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t"
+ "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t"
+ "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+
+ "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t"
+ "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
+ "paddw %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
+ "paddw %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
+ "psubw %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
+ "psubw %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
+ "dsrl %[ftmp0], %[ftmp7], %[ftmp10] \n\t"
+ "paddw %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]),
+ [tmp0]"=&r"(tmp[0]),
+ [a]"+&r"(a), [b]"+&r"(b)
+ : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
+ [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+ : "memory"
+ );
+
+ return *sse - (((int64_t)sum * sum) / (4 * high));
+}
+
+#define VPX_VARIANCE4XN(n) \
+ uint32_t vpx_variance4x##n##_mmi(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ return vpx_variance4x(a, a_stride, b, b_stride, sse, n); \
+ }
+
+VPX_VARIANCE4XN(8)
+VPX_VARIANCE4XN(4)
+
static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride, uint32_t *sse,
uint64_t high) {
@@ -144,3 +1005,298 @@
vpx_mse8xN(16);
vpx_mse8xN(8);
+
+#define SUBPIX_VAR(W, H) \
+ uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+ \
+ return vpx_variance##W##x##H##_mmi(temp2, W, b, b_stride, sse); \
+ }
+
+SUBPIX_VAR(64, 64)
+SUBPIX_VAR(64, 32)
+SUBPIX_VAR(32, 64)
+SUBPIX_VAR(32, 32)
+SUBPIX_VAR(32, 16)
+SUBPIX_VAR(16, 32)
+
+static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ uint8_t *temp2, int counter) {
+ uint8_t *temp2_ptr = temp2;
+ mips_reg l_counter = counter;
+ double ftmp[15];
+ mips_reg tmp[2];
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
+ DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+
+ const uint8_t *filter_x = bilinear_filters[xoffset];
+ const uint8_t *filter_y = bilinear_filters[yoffset];
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x07)
+ MMI_MTC1(%[tmp0], %[ftmp14])
+ "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
+ "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
+ "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
+ "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
+
+ // fdata3: fdata3[0] ~ fdata3[15]
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
+
+ // fdata3 +a_stride*1: fdata3[0] ~ fdata3[15]
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
+ // temp2: temp2[0] ~ temp2[15]
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
+
+ // fdata3 +a_stride*2: fdata3[0] ~ fdata3[15]
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
+ // temp2+16*1: temp2[0] ~ temp2[15]
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
+
+ "1: \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
+
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
+ "addiu %[counter], %[counter], -0x01 \n\t"
+ "bnez %[counter], 1b \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
+ [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
+ [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+ [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
+ [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr),
+ [counter]"+&r"(l_counter)
+ : [filter_x0] "f"((uint64_t)filter_x[0]),
+ [filter_x1] "f"((uint64_t)filter_x[1]),
+ [filter_y0] "f"((uint64_t)filter_y[0]),
+ [filter_y1] "f"((uint64_t)filter_y[1]),
+ [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),
+ [mask] "f"(mask)
+ : "memory"
+ );
+}
+
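+// The assembly above writes two rows of temp2 before entering its loop and two
+// more rows per iteration, so callers pass (H - 2) / 2 as the loop counter.
+// The 8- and 4-pixel-wide helpers below follow the same pattern.
+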
+#define SUBPIX_VAR16XN(H) \
+ uint32_t vpx_sub_pixel_variance16x##H##_mmi( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint8_t temp2[16 * H]; \
+ var_filter_block2d_bil_16x(a, a_stride, xoffset, yoffset, temp2, \
+ (H - 2) / 2); \
+ \
+ return vpx_variance16x##H##_mmi(temp2, 16, b, b_stride, sse); \
+ }
+
+SUBPIX_VAR16XN(16)
+SUBPIX_VAR16XN(8)
+
+static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ uint8_t *temp2, int counter) {
+ uint8_t *temp2_ptr = temp2;
+ mips_reg l_counter = counter;
+ double ftmp[15];
+ mips_reg tmp[2];
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
+ DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+ const uint8_t *filter_x = bilinear_filters[xoffset];
+ const uint8_t *filter_y = bilinear_filters[yoffset];
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x07)
+ MMI_MTC1(%[tmp0], %[ftmp14])
+ "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
+ "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
+ "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
+ "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
+
+ // fdata3: fdata3[0] ~ fdata3[7]
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
+
+ // fdata3 +a_stride*1: fdata3[0] ~ fdata3[7]
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
+ // temp2: temp2[0] ~ temp2[7]
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
+
+ // fdata3 +a_stride*2: fdata3[0] ~ fdata3[7]
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
+ // temp2+8*1: temp2[0] ~ temp2[7]
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
+
+ "1: \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
+
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
+ "addiu %[counter], %[counter], -0x01 \n\t"
+ "bnez %[counter], 1b \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
+ [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
+ [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+ [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
+ [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr),
+ [counter]"+&r"(l_counter)
+ : [filter_x0] "f"((uint64_t)filter_x[0]),
+ [filter_x1] "f"((uint64_t)filter_x[1]),
+ [filter_y0] "f"((uint64_t)filter_y[0]),
+ [filter_y1] "f"((uint64_t)filter_y[1]),
+ [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),
+ [mask] "f"(mask)
+ : "memory"
+ );
+}
+
+#define SUBPIX_VAR8XN(H) \
+ uint32_t vpx_sub_pixel_variance8x##H##_mmi( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint8_t temp2[8 * H]; \
+ var_filter_block2d_bil_8x(a, a_stride, xoffset, yoffset, temp2, \
+ (H - 2) / 2); \
+ \
+ return vpx_variance8x##H##_mmi(temp2, 8, b, b_stride, sse); \
+ }
+
+SUBPIX_VAR8XN(16)
+SUBPIX_VAR8XN(8)
+SUBPIX_VAR8XN(4)
+
+static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ uint8_t *temp2, int counter) {
+ uint8_t *temp2_ptr = temp2;
+ mips_reg l_counter = counter;
+ double ftmp[7];
+ mips_reg tmp[2];
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
+ DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+ const uint8_t *filter_x = bilinear_filters[xoffset];
+ const uint8_t *filter_y = bilinear_filters[yoffset];
+
+ __asm__ volatile (
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x07)
+ MMI_MTC1(%[tmp0], %[ftmp6])
+ "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
+ "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
+ "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
+ "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
+ // fdata3: fdata3[0] ~ fdata3[3]
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
+
+ // fdata3 +a_stride*1: fdata3[0] ~ fdata3[3]
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
+    // temp2: temp2[0] ~ temp2[3]
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
+
+ // fdata3 +a_stride*2: fdata3[0] ~ fdata3[3]
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
+    // temp2+4*1: temp2[0] ~ temp2[3]
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
+
+ "1: \n\t"
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
+
+ MMI_ADDU(%[a], %[a], %[a_stride])
+ VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
+ MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
+ VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
+ "addiu %[counter], %[counter], -0x01 \n\t"
+ "bnez %[counter], 1b \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [a] "+&r"(a),
+ [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter)
+ : [filter_x0] "f"((uint64_t)filter_x[0]),
+ [filter_x1] "f"((uint64_t)filter_x[1]),
+ [filter_y0] "f"((uint64_t)filter_y[0]),
+ [filter_y1] "f"((uint64_t)filter_y[1]),
+ [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),
+ [mask] "f"(mask)
+ : "memory"
+ );
+}
+
+#define SUBPIX_VAR4XN(H) \
+ uint32_t vpx_sub_pixel_variance4x##H##_mmi( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint8_t temp2[4 * H]; \
+ var_filter_block2d_bil_4x(a, a_stride, xoffset, yoffset, temp2, \
+ (H - 2) / 2); \
+ \
+ return vpx_variance4x##H##_mmi(temp2, 4, b, b_stride, sse); \
+ }
+
+SUBPIX_VAR4XN(8)
+SUBPIX_VAR4XN(4)
+
+#define SUBPIX_AVG_VAR(W, H) \
+ uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+ \
+ vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \
+ \
+ return vpx_variance##W##x##H##_mmi(temp3, W, b, b_stride, sse); \
+ }
+
+SUBPIX_AVG_VAR(64, 64)
+SUBPIX_AVG_VAR(64, 32)
+SUBPIX_AVG_VAR(32, 64)
+SUBPIX_AVG_VAR(32, 32)
+SUBPIX_AVG_VAR(32, 16)
+SUBPIX_AVG_VAR(16, 32)
+SUBPIX_AVG_VAR(16, 16)
+SUBPIX_AVG_VAR(16, 8)
+SUBPIX_AVG_VAR(8, 16)
+SUBPIX_AVG_VAR(8, 8)
+SUBPIX_AVG_VAR(8, 4)
+SUBPIX_AVG_VAR(4, 8)
+SUBPIX_AVG_VAR(4, 4)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1053,43 +1053,43 @@
# Variance
#
add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;
+ specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance64x32 sse2 avx2 neon msa/;
+ specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance32x64 sse2 neon msa/;
+ specialize qw/vpx_variance32x64 sse2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance32x32 sse2 avx2 neon msa/;
+ specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance32x16 sse2 avx2 neon msa/;
+ specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance16x32 sse2 neon msa/;
+ specialize qw/vpx_variance16x32 sse2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance16x16 sse2 avx2 neon msa/;
+ specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance16x8 sse2 neon msa/;
+ specialize qw/vpx_variance16x8 sse2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance8x16 sse2 neon msa/;
+ specialize qw/vpx_variance8x16 sse2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance8x8 sse2 neon msa/;
+ specialize qw/vpx_variance8x8 sse2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance8x4 sse2 neon msa/;
+ specialize qw/vpx_variance8x4 sse2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance4x8 sse2 neon msa/;
+ specialize qw/vpx_variance4x8 sse2 neon msa mmi/;
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance4x4 sse2 neon msa/;
+ specialize qw/vpx_variance4x4 sse2 neon msa mmi/;
#
# Specialty Variance
@@ -1125,82 +1125,82 @@
# Subpixel Variance
#
add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance64x32 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance64x32 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance32x64 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance32x64 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance32x16 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance32x16 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance16x32 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance16x32 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance16x16 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance16x8 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance16x8 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance8x16 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance8x16 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance8x8 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance8x4 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance8x4 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance4x8 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance4x8 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance4x4 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance4x4 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance32x64 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance32x64 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance32x32 neon avx2 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance32x32 neon avx2 msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance32x16 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance32x16 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance16x32 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance16x32 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance16x16 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance16x16 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance16x8 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance16x8 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance8x16 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance8x16 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance8x8 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance8x8 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance8x4 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance8x4 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance4x8 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance4x8 neon msa mmi sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance4x4 neon msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance4x4 neon msa mmi sse2 ssse3/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
--- a/vpx_ports/asmdefs_mmi.h
+++ b/vpx_ports/asmdefs_mmi.h
@@ -40,10 +40,10 @@
"dsll " #reg1 ", " #reg2 ", " #shift " \n\t"
#define MMI_MTC1(reg, fp) \
- "dmtc1 " #reg " " #fp " \n\t"
+ "dmtc1 " #reg ", " #fp " \n\t"
#define MMI_LI(reg, immediate) \
- "dli " #reg " " #immediate " \n\t"
+ "dli " #reg ", " #immediate " \n\t"
#else
#define mips_reg int32_t
@@ -69,10 +69,10 @@
"sll " #reg1 ", " #reg2 ", " #shift " \n\t"
#define MMI_MTC1(reg, fp) \
- "mtc1 " #reg " " #fp " \n\t"
+ "mtc1 " #reg ", " #fp " \n\t"
#define MMI_LI(reg, immediate) \
- "li " #reg " " #immediate " \n\t"
+ "li " #reg ", " #immediate " \n\t"
#endif /* HAVE_MIPS64 */