shithub: libvpx

--- a/test/variance_test.cc

+++ b/test/variance_test.cc

@@ -1547,5 +1547,55 @@

                                           MseParams(4, 3, &vpx_mse16x8_mmi),

                                           MseParams(3, 4, &vpx_mse8x16_mmi),

                                           MseParams(3, 3, &vpx_mse8x8_mmi)));

+INSTANTIATE_TEST_CASE_P(

+    MMI, VpxVarianceTest,  // args look like (log2 w, log2 h, fn), e.g. 6, 6 <-> 64x64 -- confirm

+    ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_mmi),

+                      VarianceParams(6, 5, &vpx_variance64x32_mmi),

+                      VarianceParams(5, 6, &vpx_variance32x64_mmi),

+                      VarianceParams(5, 5, &vpx_variance32x32_mmi),

+                      VarianceParams(5, 4, &vpx_variance32x16_mmi),

+                      VarianceParams(4, 5, &vpx_variance16x32_mmi),

+                      VarianceParams(4, 4, &vpx_variance16x16_mmi),

+                      VarianceParams(4, 3, &vpx_variance16x8_mmi),

+                      VarianceParams(3, 4, &vpx_variance8x16_mmi),

+                      VarianceParams(3, 3, &vpx_variance8x8_mmi),

+                      VarianceParams(3, 2, &vpx_variance8x4_mmi),

+                      VarianceParams(2, 3, &vpx_variance4x8_mmi),

+                      VarianceParams(2, 2, &vpx_variance4x4_mmi)));

+INSTANTIATE_TEST_CASE_P(

+    MMI, VpxSubpelVarianceTest,  // args look like (log2 w, log2 h, fn, 0); meaning of the 0 not visible here

+    ::testing::Values(

+        SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_mmi, 0),

+        SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_mmi, 0),

+        SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_mmi, 0),

+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_mmi, 0),

+        SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_mmi, 0),

+        SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_mmi, 0),

+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_mmi, 0),

+        SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_mmi, 0),

+        SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_mmi, 0),

+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_mmi, 0),

+        SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_mmi, 0),

+        SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_mmi, 0),

+        SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_mmi, 0)));

+INSTANTIATE_TEST_CASE_P(

+    MMI, VpxSubpelAvgVarianceTest,  // args look like (log2 w, log2 h, fn, 0); meaning of the 0 not visible here

+    ::testing::Values(

+        SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_mmi, 0),

+        SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_mmi, 0),

+        SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_mmi, 0),

+        SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_mmi, 0),

+        SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_mmi, 0),

+        SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_mmi, 0),

+        SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_mmi, 0),

+        SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_mmi, 0),

+        SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_mmi, 0),

+        SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_mmi, 0),

+        SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_mmi, 0),

+        SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_mmi, 0),

+        SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_mmi, 0)));

 #endif  // HAVE_MMI

 }  // namespace

--- a/vpx_dsp/mips/variance_mmi.c

+++ b/vpx_dsp/mips/variance_mmi.c

@@ -9,10 +9,97 @@

*/

 #include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/variance.h"

 #include "vpx_ports/mem.h"

 #include "vpx/vpx_integer.h"

 #include "vpx_ports/asmdefs_mmi.h"

+static const uint8_t bilinear_filters[8][2] = {  /* 8 two-tap weight pairs; each pair sums to 128 */

+  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },

+  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },

+};

+/* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64, vpx_variance64x32 and

+   vpx_variance32x64. VARIANCE_SSE_SUM_8 sums with paddh and would overflow. */

+#define VARIANCE_SSE_SUM_8_FOR_W64                                  \

+  "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]            \n\t" \

+  "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t" \

+                                                                    \

+  /* sum */                                                         \

+  "punpcklhw  %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t" \

+  "punpckhhw  %[ftmp2],   %[ftmp3],       %[ftmp0]            \n\t" \

+  "punpcklhw  %[ftmp7],   %[ftmp5],       %[ftmp0]            \n\t" \

+  "punpckhhw  %[ftmp8],   %[ftmp5],       %[ftmp0]            \n\t" \

+  "psubw      %[ftmp3],   %[ftmp1],       %[ftmp7]            \n\t" \

+  "psubw      %[ftmp5],   %[ftmp2],       %[ftmp8]            \n\t" \

+  "punpcklhw  %[ftmp1],   %[ftmp4],       %[ftmp0]            \n\t" \

+  "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp0]            \n\t" \

+  "punpcklhw  %[ftmp7],   %[ftmp6],       %[ftmp0]            \n\t" \

+  "punpckhhw  %[ftmp8],   %[ftmp6],       %[ftmp0]            \n\t" \

+  "psubw      %[ftmp4],   %[ftmp1],       %[ftmp7]            \n\t" \

+  "psubw      %[ftmp6],   %[ftmp2],       %[ftmp8]            \n\t" \

+  "paddw      %[ftmp9],   %[ftmp9],       %[ftmp3]            \n\t" \

+  "paddw      %[ftmp9],   %[ftmp9],       %[ftmp4]            \n\t" \

+  "paddw      %[ftmp9],   %[ftmp9],       %[ftmp5]            \n\t" \

+  "paddw      %[ftmp9],   %[ftmp9],       %[ftmp6]            \n\t" \

+                                                                    \

+  /* *sse */                                                        \

+  "pmuluw     %[ftmp1],   %[ftmp3],       %[ftmp3]            \n\t" \

+  "pmuluw     %[ftmp2],   %[ftmp5],       %[ftmp5]            \n\t" \

+  "pmuluw     %[ftmp7],   %[ftmp4],       %[ftmp4]            \n\t" \

+  "pmuluw     %[ftmp8],   %[ftmp6],       %[ftmp6]            \n\t" \

+  "paddw      %[ftmp10],  %[ftmp10],      %[ftmp1]            \n\t" \

+  "paddw      %[ftmp10],  %[ftmp10],      %[ftmp2]            \n\t" \

+  "paddw      %[ftmp10],  %[ftmp10],      %[ftmp7]            \n\t" \

+  "paddw      %[ftmp10],  %[ftmp10],      %[ftmp8]            \n\t" \

+  "dsrl       %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t" \

+  "dsrl       %[ftmp5],   %[ftmp5],       %[ftmp11]           \n\t" \

+  "dsrl       %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t" \

+  "dsrl       %[ftmp6],   %[ftmp6],       %[ftmp11]           \n\t" \

+  "pmuluw     %[ftmp1],   %[ftmp3],       %[ftmp3]            \n\t" \

+  "pmuluw     %[ftmp2],   %[ftmp5],       %[ftmp5]            \n\t" \

+  "pmuluw     %[ftmp7],   %[ftmp4],       %[ftmp4]            \n\t" \

+  "pmuluw     %[ftmp8],   %[ftmp6],       %[ftmp6]            \n\t" \

+  "paddw      %[ftmp10],  %[ftmp10],      %[ftmp1]            \n\t" \

+  "paddw      %[ftmp10],  %[ftmp10],      %[ftmp2]            \n\t" \

+  "paddw      %[ftmp10],  %[ftmp10],      %[ftmp7]            \n\t" \

+  "paddw      %[ftmp10],  %[ftmp10],      %[ftmp8]            \n\t"

+#define VARIANCE_SSE_SUM_4                                          \

+  /* sse */                                                         \

+  "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \

+  "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \

+  "pmaddhw    %[ftmp5],   %[ftmp4],       %[ftmp4]            \n\t" \

+  "paddw      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t" \

+                                                                    \

+  /* sum */                                                         \

+  "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "punpcklbh  %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t" \

+  "paddh      %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t" \

+  "paddh      %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t"

+#define VARIANCE_SSE_SUM_8                                          \

+  /* sse */                                                         \

+  "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \

+  "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \

+  "punpckhbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t" \

+  "pmaddhw    %[ftmp6],   %[ftmp4],       %[ftmp4]            \n\t" \

+  "pmaddhw    %[ftmp7],   %[ftmp5],       %[ftmp5]            \n\t" \

+  "paddw      %[ftmp8],   %[ftmp8],       %[ftmp6]            \n\t" \

+  "paddw      %[ftmp8],   %[ftmp8],       %[ftmp7]            \n\t" \

+                                                                    \

+  /* sum */                                                         \

+  "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]            \n\t" \

+  "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t" \

+  "paddh      %[ftmp10],  %[ftmp10],      %[ftmp3]            \n\t" \

+  "paddh      %[ftmp10],  %[ftmp10],      %[ftmp4]            \n\t" \

+  "paddh      %[ftmp12],  %[ftmp12],      %[ftmp5]            \n\t" \

+  "paddh      %[ftmp12],  %[ftmp12],      %[ftmp6]            \n\t"

 #define VARIANCE_SSE_8                                              \

   "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t" \

   "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t" \

@@ -40,6 +127,780 @@

   "paddw      %[ftmp8],   %[ftmp8],       %[ftmp6]            \n\t" \

   "paddw      %[ftmp8],   %[ftmp8],       %[ftmp7]            \n\t"

+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A                       \

+  /* calculate fdata3[0]~fdata3[3], store at ftmp2*/                \

+  "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t" \

+  "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t" \

+  "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "gsldlc1    %[ftmp1],   0x08(%[a])                          \n\t" \

+  "gsldrc1    %[ftmp1],   0x01(%[a])                          \n\t" \

+  "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "pmullh     %[ftmp2],   %[ftmp2],       %[filter_x0]        \n\t" \

+  "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp3],   %[ftmp3],       %[filter_x1]        \n\t" \

+  "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t" \

+  "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"

+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B                       \

+  /* calculate fdata3[0]~fdata3[3], store at ftmp4*/                \

+  "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t" \

+  "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t" \

+  "punpcklbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "gsldlc1    %[ftmp1],   0x08(%[a])                          \n\t" \

+  "gsldrc1    %[ftmp1],   0x01(%[a])                          \n\t" \

+  "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_x0]        \n\t" \

+  "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp5],   %[ftmp5],       %[filter_x1]        \n\t" \

+  "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t" \

+  "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t"

+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A                      \

+  /* calculate: temp2[0] ~ temp2[3] */                              \

+  "pmullh     %[ftmp2],   %[ftmp2],       %[filter_y0]        \n\t" \

+  "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp1],   %[ftmp4],       %[filter_y1]        \n\t" \

+  "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t" \

+  "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t" \

+                                                                    \

+  /* store: temp2[0] ~ temp2[3] */                                  \

+  "and        %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \

+  "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t" \

+  "gssdrc1    %[ftmp2],   0x00(%[temp2_ptr])                  \n\t"

+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B                      \

+  /* calculate: temp2[0] ~ temp2[3] */                              \

+  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_y0]        \n\t" \

+  "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp1],   %[ftmp2],       %[filter_y1]        \n\t" \

+  "paddh      %[ftmp4],   %[ftmp4],       %[ftmp1]            \n\t" \

+  "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t" \

+                                                                    \

+  /* store: temp2[0] ~ temp2[3] */                                  \

+  "and        %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \

+  "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t" \

+  "gssdrc1    %[ftmp4],   0x00(%[temp2_ptr])                  \n\t"

+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A                       \

+  /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/      \

+  "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t" \

+  "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t" \

+  "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "gsldlc1    %[ftmp1],   0x08(%[a])                          \n\t" \

+  "gsldrc1    %[ftmp1],   0x01(%[a])                          \n\t" \

+  "punpcklbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "pmullh     %[ftmp2],   %[ftmp2],       %[filter_x0]        \n\t" \

+  "pmullh     %[ftmp3],   %[ftmp3],       %[filter_x0]        \n\t" \

+  "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \

+  "paddh      %[ftmp3],   %[ftmp3],       %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_x1]        \n\t" \

+  "pmullh     %[ftmp5],   %[ftmp5],       %[filter_x1]        \n\t" \

+  "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \

+  "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t" \

+  "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp14]           \n\t" \

+  "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp14]           \n\t"

+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B                       \

+  /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/      \

+  "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t" \

+  "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t" \

+  "punpcklbh  %[ftmp8],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "punpckhbh  %[ftmp9],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "gsldlc1    %[ftmp1],   0x08(%[a])                          \n\t" \

+  "gsldrc1    %[ftmp1],   0x01(%[a])                          \n\t" \

+  "punpcklbh  %[ftmp10],  %[ftmp1],       %[ftmp0]            \n\t" \

+  "punpckhbh  %[ftmp11],  %[ftmp1],       %[ftmp0]            \n\t" \

+  "pmullh     %[ftmp8],   %[ftmp8],       %[filter_x0]        \n\t" \

+  "pmullh     %[ftmp9],   %[ftmp9],       %[filter_x0]        \n\t" \

+  "paddh      %[ftmp8],   %[ftmp8],       %[ff_ph_40]         \n\t" \

+  "paddh      %[ftmp9],   %[ftmp9],       %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp10],  %[ftmp10],      %[filter_x1]        \n\t" \

+  "pmullh     %[ftmp11],  %[ftmp11],      %[filter_x1]        \n\t" \

+  "paddh      %[ftmp8],   %[ftmp8],       %[ftmp10]           \n\t" \

+  "paddh      %[ftmp9],   %[ftmp9],       %[ftmp11]           \n\t" \

+  "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp14]           \n\t" \

+  "psrlh      %[ftmp9],   %[ftmp9],       %[ftmp14]           \n\t"

+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A                      \

+  /* calculate: temp2[0] ~ temp2[3] */                              \

+  "pmullh     %[ftmp2],   %[ftmp2],       %[filter_y0]        \n\t" \

+  "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp1],   %[ftmp8],       %[filter_y1]        \n\t" \

+  "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t" \

+  "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp14]           \n\t" \

+                                                                    \

+  /* calculate: temp2[4] ~ temp2[7] */                              \

+  "pmullh     %[ftmp3],   %[ftmp3],       %[filter_y0]        \n\t" \

+  "paddh      %[ftmp3],   %[ftmp3],       %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp1],   %[ftmp9],       %[filter_y1]        \n\t" \

+  "paddh      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t" \

+  "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp14]           \n\t" \

+                                                                    \

+  /* store: temp2[0] ~ temp2[7] */                                  \

+  "and        %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \

+  "and        %[ftmp3],   %[ftmp3],       %[mask]             \n\t" \

+  "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t" \

+  "gssdlc1    %[ftmp2],   0x07(%[temp2_ptr])                  \n\t" \

+  "gssdrc1    %[ftmp2],   0x00(%[temp2_ptr])                  \n\t"

+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B                      \

+  /* calculate: temp2[0] ~ temp2[3] */                              \

+  "pmullh     %[ftmp8],   %[ftmp8],       %[filter_y0]        \n\t" \

+  "paddh      %[ftmp8],   %[ftmp8],       %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp1],   %[ftmp2],       %[filter_y1]        \n\t" \

+  "paddh      %[ftmp8],   %[ftmp8],       %[ftmp1]            \n\t" \

+  "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp14]           \n\t" \

+                                                                    \

+  /* calculate: temp2[4] ~ temp2[7] */                              \

+  "pmullh     %[ftmp9],   %[ftmp9],       %[filter_y0]        \n\t" \

+  "paddh      %[ftmp9],   %[ftmp9],       %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp1],   %[ftmp3],       %[filter_y1]        \n\t" \

+  "paddh      %[ftmp9],   %[ftmp9],       %[ftmp1]            \n\t" \

+  "psrlh      %[ftmp9],   %[ftmp9],       %[ftmp14]           \n\t" \

+                                                                    \

+  /* store: temp2[0] ~ temp2[7] */                                  \

+  "and        %[ftmp8],   %[ftmp8],       %[mask]             \n\t" \

+  "and        %[ftmp9],   %[ftmp9],       %[mask]             \n\t" \

+  "packushb   %[ftmp8],   %[ftmp8],       %[ftmp9]            \n\t" \

+  "gssdlc1    %[ftmp8],   0x07(%[temp2_ptr])                  \n\t" \

+  "gssdrc1    %[ftmp8],   0x00(%[temp2_ptr])                  \n\t"

+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A                      \

+  /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/      \

+  VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A                             \

+                                                                    \

+  /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/     \

+  "gsldlc1    %[ftmp1],   0x0f(%[a])                          \n\t" \

+  "gsldrc1    %[ftmp1],   0x08(%[a])                          \n\t" \

+  "punpcklbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "gsldlc1    %[ftmp1],   0x10(%[a])                          \n\t" \

+  "gsldrc1    %[ftmp1],   0x09(%[a])                          \n\t" \

+  "punpcklbh  %[ftmp6],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t" \

+  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_x0]        \n\t" \

+  "pmullh     %[ftmp5],   %[ftmp5],       %[filter_x0]        \n\t" \

+  "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \

+  "paddh      %[ftmp5],   %[ftmp5],       %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp6],   %[ftmp6],       %[filter_x1]        \n\t" \

+  "pmullh     %[ftmp7],   %[ftmp7],       %[filter_x1]        \n\t" \

+  "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t" \

+  "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t" \

+  "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp14]           \n\t" \

+  "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp14]           \n\t"

+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B                      \

+  /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/      \

+  VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B                             \

+                                                                    \

+  /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/   \

+  "gsldlc1    %[ftmp1],   0x0f(%[a])                          \n\t" \

+  "gsldrc1    %[ftmp1],   0x08(%[a])                          \n\t" \

+  "punpcklbh  %[ftmp10],  %[ftmp1],       %[ftmp0]            \n\t" \

+  "punpckhbh  %[ftmp11],  %[ftmp1],       %[ftmp0]            \n\t" \

+  "gsldlc1    %[ftmp1],   0x10(%[a])                          \n\t" \

+  "gsldrc1    %[ftmp1],   0x09(%[a])                          \n\t" \

+  "punpcklbh  %[ftmp12],  %[ftmp1],       %[ftmp0]            \n\t" \

+  "punpckhbh  %[ftmp13],  %[ftmp1],       %[ftmp0]            \n\t" \

+  "pmullh     %[ftmp10],  %[ftmp10],      %[filter_x0]        \n\t" \

+  "pmullh     %[ftmp11],  %[ftmp11],      %[filter_x0]        \n\t" \

+  "paddh      %[ftmp10],  %[ftmp10],      %[ff_ph_40]         \n\t" \

+  "paddh      %[ftmp11],  %[ftmp11],      %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp12],  %[ftmp12],      %[filter_x1]        \n\t" \

+  "pmullh     %[ftmp13],  %[ftmp13],      %[filter_x1]        \n\t" \

+  "paddh      %[ftmp10],  %[ftmp10],      %[ftmp12]           \n\t" \

+  "paddh      %[ftmp11],  %[ftmp11],      %[ftmp13]           \n\t" \

+  "psrlh      %[ftmp10],  %[ftmp10],      %[ftmp14]           \n\t" \

+  "psrlh      %[ftmp11],  %[ftmp11],      %[ftmp14]           \n\t"

+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A                     \

+  VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A                            \

+                                                                    \

+  /* calculate: temp2[8] ~ temp2[11] */                             \

+  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_y0]        \n\t" \

+  "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp1],   %[ftmp10],      %[filter_y1]        \n\t" \

+  "paddh      %[ftmp4],   %[ftmp4],       %[ftmp1]            \n\t" \

+  "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp14]           \n\t" \

+                                                                    \

+  /* calculate: temp2[12] ~ temp2[15] */                            \

+  "pmullh     %[ftmp5],   %[ftmp5],       %[filter_y0]        \n\t" \

+  "paddh      %[ftmp5],   %[ftmp5],       %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp1],   %[ftmp11],       %[filter_y1]       \n\t" \

+  "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \

+  "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp14]           \n\t" \

+                                                                    \

+  /* store: temp2[8] ~ temp2[15] */                                 \

+  "and        %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \

+  "and        %[ftmp5],   %[ftmp5],       %[mask]             \n\t" \

+  "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t" \

+  "gssdlc1    %[ftmp4],   0x0f(%[temp2_ptr])                  \n\t" \

+  "gssdrc1    %[ftmp4],   0x08(%[temp2_ptr])                  \n\t"

+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B                     \

+  VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B                            \

+                                                                    \

+  /* calculate: temp2[8] ~ temp2[11] */                             \

+  "pmullh     %[ftmp10],  %[ftmp10],      %[filter_y0]        \n\t" \

+  "paddh      %[ftmp10],  %[ftmp10],      %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp1],   %[ftmp4],       %[filter_y1]        \n\t" \

+  "paddh      %[ftmp10],  %[ftmp10],      %[ftmp1]            \n\t" \

+  "psrlh      %[ftmp10],  %[ftmp10],      %[ftmp14]           \n\t" \

+                                                                    \

+  /* calculate: temp2[12] ~ temp2[15] */                            \

+  "pmullh     %[ftmp11],  %[ftmp11],      %[filter_y0]        \n\t" \

+  "paddh      %[ftmp11],  %[ftmp11],      %[ff_ph_40]         \n\t" \

+  "pmullh     %[ftmp1],   %[ftmp5],       %[filter_y1]        \n\t" \

+  "paddh      %[ftmp11],  %[ftmp11],      %[ftmp1]            \n\t" \

+  "psrlh      %[ftmp11],  %[ftmp11],      %[ftmp14]           \n\t" \

+                                                                    \

+  /* store: temp2[8] ~ temp2[15] */                                 \

+  "and        %[ftmp10],  %[ftmp10],      %[mask]             \n\t" \

+  "and        %[ftmp11],  %[ftmp11],      %[mask]             \n\t" \

+  "packushb   %[ftmp10],  %[ftmp10],      %[ftmp11]           \n\t" \

+  "gssdlc1    %[ftmp10],  0x0f(%[temp2_ptr])                  \n\t" \

+  "gssdrc1    %[ftmp10],  0x08(%[temp2_ptr])                  \n\t"

+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal

+// or vertical direction to produce the filtered output block. Used to implement

+// the first pass of the 2-D separable filter.

+//

+// Produces uint16_t output to retain precision for the next pass. The two filter

+// taps should sum to FILTER_WEIGHT. pixel_step picks the direction: 1 filters

+// horizontally, the stride filters vertically; it is the offset from one input

+// tap to the other. Each result is rounded by ROUND_POWER_OF_TWO(, FILTER_BITS).

+static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,

+                                              unsigned int src_pixels_per_line,

+                                              int pixel_step,

+                                              unsigned int output_height,

+                                              unsigned int output_width,

+                                              const uint8_t *filter) {

+  unsigned int i, j;

+  for (i = 0; i < output_height; ++i) {

+    for (j = 0; j < output_width; ++j) {

+      b[j] = ROUND_POWER_OF_TWO(

+          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);

+      ++a;

+    }

+    a += src_pixels_per_line - output_width;  // skip to the start of the next source row

+    b += output_width;  // output rows are packed back to back

+  }

+}

+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal

+// or vertical direction to produce the filtered output block. Used to implement

+// the second pass of the 2-D separable filter.

+//

+// Requires 16-bit input as produced by var_filter_block2d_bil_first_pass. The

+// two filter taps should sum to FILTER_WEIGHT. pixel_step picks the direction:

+// 1 filters horizontally, the stride filters vertically; it is the offset from

+// one input tap to the other. Each result is rounded by

+// ROUND_POWER_OF_TWO(, FILTER_BITS) and stored as 8-bit output.

+static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,

+                                               unsigned int src_pixels_per_line,

+                                               unsigned int pixel_step,

+                                               unsigned int output_height,

+                                               unsigned int output_width,

+                                               const uint8_t *filter) {

+  unsigned int i, j;

+  for (i = 0; i < output_height; ++i) {

+    for (j = 0; j < output_width; ++j) {

+      b[j] = ROUND_POWER_OF_TWO(

+          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);

+      ++a;

+    }

+    a += src_pixels_per_line - output_width;  // skip to the start of the next input row

+    b += output_width;  // output rows are packed back to back

+  }

+}

+static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride,

+                                       const uint8_t *b, int b_stride,

+                                       uint32_t *sse, int high) {

+  int sum;  /* sum of (a - b) over the block; written by the asm below */

+  double ftmp[12];  /* scratch storage bound to the "f" (FP register) operands */

+  uint32_t tmp[3];  /* scratch storage bound to the "r" (general register) operands */

+  *sse = 0;

+  __asm__ volatile (

+    "li         %[tmp0],    0x20                                \n\t"  /* 0x20 = 32 */

+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"  /* ftmp11 = 32: dsrl shift count used by VARIANCE_SSE_SUM_8_FOR_W64 */

+    MMI_L(%[tmp0], %[high], 0x00)  /* tmp0 = row counter; presumably loads `high` through its address (MMI_L is defined elsewhere) */

+    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"  /* ftmp0 = 0: zero operand for the unpack instructions */

+    "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"  /* ftmp9 = 0: sum accumulator */

+    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"  /* ftmp10 = 0: sse accumulator */

+    "1:                                                         \n\t"  /* per-row loop: eight 8-byte groups at offsets 0x00..0x38 */

+    "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x07(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x00(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8_FOR_W64

+    "gsldlc1    %[ftmp1],   0x0f(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x08(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x0f(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x08(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8_FOR_W64

+    "gsldlc1    %[ftmp1],   0x17(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x10(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x17(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x10(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8_FOR_W64

+    "gsldlc1    %[ftmp1],   0x1f(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x18(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x1f(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x18(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8_FOR_W64

+    "gsldlc1    %[ftmp1],   0x27(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x20(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x27(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x20(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8_FOR_W64

+    "gsldlc1    %[ftmp1],   0x2f(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x28(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x2f(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x28(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8_FOR_W64

+    "gsldlc1    %[ftmp1],   0x37(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x30(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x37(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x30(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8_FOR_W64

+    "gsldlc1    %[ftmp1],   0x3f(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x38(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x3f(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x38(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8_FOR_W64

+    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"  /* one row done */

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    MMI_ADDU(%[b], %[b], %[b_stride])

+    "bnez       %[tmp0],    1b                                  \n\t"  /* repeat while rows remain */

+    "mfc1       %[tmp1],    %[ftmp9]                            \n\t"  /* low 32 bits of the sum accumulator */

+    "mfhc1      %[tmp2],    %[ftmp9]                            \n\t"  /* high 32 bits of the sum accumulator */

+    "addu       %[sum],     %[tmp1],        %[tmp2]             \n\t"  /* sum = low half + high half */

+    "swc1       %[ftmp10],  0x00(%[sse])                        \n\t"  /* *sse = low 32 bits of the sse accumulator */

+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),

+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),

+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),

+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),

+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),

+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),

+      [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),

+      [tmp2]"=&r"(tmp[2]),

+      [a]"+&r"(a),                      [b]"+&r"(b),

+      [sum]"=&r"(sum)

+    : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),

+      [high]"r"(&high), [sse]"r"(sse)

+    : "memory"

+  );

+  return *sse - (((int64_t)sum * sum) / (64 * high));  /* the block holds 64 * high pixels */

+}

+// Generates vpx_variance64x<ROWS>_mmi(): a fixed-height entry point that

+// forwards its arguments to vpx_variance64x() with ROWS as the last argument.

+#define VPX_VARIANCE64XN(ROWS)                                         \

+  uint32_t vpx_variance64x##ROWS##_mmi(const uint8_t *a, int a_stride, \

+                                       const uint8_t *b, int b_stride, \

+                                       uint32_t *sse) {                \

+    const uint32_t variance =                                          \

+        vpx_variance64x(a, a_stride, b, b_stride, sse, ROWS);          \

+    return variance;                                                   \

+  }

+VPX_VARIANCE64XN(64)

+VPX_VARIANCE64XN(32)

+// Variance of a 32-pixel-wide, 64-row block.

+//

+// Walks 64 rows of `a` and `b` (advancing by a_stride / b_stride per row),

+// writes the low word of the ftmp10 accumulator to *sse and returns

+// *sse - sum * sum / 2048, where 2048 = 32 * 64 is the number of pixels.

+//

+// NOTE(review): the per-8-pixel work is done by VARIANCE_SSE_SUM_8_FOR_W64,

+// which is defined outside this chunk; that ftmp10 holds the squared-error

+// total and ftmp9 the signed difference total is inferred from how the two

+// registers are read after the loop - confirm against the macro.

+uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b,

+                               int b_stride, uint32_t *sse) {

+  int sum;

+  double ftmp[12];

+  uint32_t tmp[3];

+  *sse = 0;

+  __asm__ volatile (

+    // ftmp11 = 32. It is not referenced directly below; presumably it is a

+    // shift count consumed inside the accumulation macro - confirm.

+    "li         %[tmp0],    0x20                                \n\t"

+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"

+    // tmp0 = 64: row counter for the loop.

+    "li         %[tmp0],    0x40                                \n\t"

+    // Clear ftmp0 and the two accumulators read after the loop.

+    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"

+    "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"

+    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"

+    "1:                                                         \n\t"

+    // One row: four left/right load pairs fetch 8 bytes each from a and b at

+    // byte offsets 0x00, 0x08, 0x10 and 0x18, each followed by the macro.

+    "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x07(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x00(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8_FOR_W64

+    "gsldlc1    %[ftmp1],   0x0f(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x08(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x0f(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x08(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8_FOR_W64

+    "gsldlc1    %[ftmp1],   0x17(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x10(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x17(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x10(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8_FOR_W64

+    "gsldlc1    %[ftmp1],   0x1f(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x18(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x1f(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x18(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8_FOR_W64

+    // Next row: decrement the counter, step both pointers by their strides

+    // and loop while the counter is non-zero.

+    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    MMI_ADDU(%[b], %[b], %[b_stride])

+    "bnez       %[tmp0],    1b                                  \n\t"

+    // sum = low word of ftmp9 + high word of ftmp9;

+    // *sse = low word of ftmp10.

+    "mfc1       %[tmp1],    %[ftmp9]                            \n\t"

+    "mfhc1      %[tmp2],    %[ftmp9]                            \n\t"

+    "addu       %[sum],     %[tmp1],        %[tmp2]             \n\t"

+    "swc1       %[ftmp10],  0x00(%[sse])                        \n\t"

+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),

+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),

+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),

+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),

+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),

+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),

+      [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),

+      [tmp2]"=&r"(tmp[2]),

+      [a]"+&r"(a),                      [b]"+&r"(b),

+      [sum]"=&r"(sum)

+    : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),

+      [sse]"r"(sse)

+    : "memory"

+  );

+  // 2048 = 32 * 64 pixels; the product is widened to 64 bits before dividing.

+  return *sse - (((int64_t)sum * sum) / 2048);

+}

+// Variance of a 32-pixel-wide block with `high` rows.

+//

+// Writes the folded ftmp8 accumulator to *sse, the folded difference of the

+// ftmp10 and ftmp12 accumulators to `sum`, and returns

+// *sse - sum * sum / (32 * high).

+//

+// NOTE(review): VARIANCE_SSE_SUM_8 and MMI_L are defined outside this chunk.

+// That ftmp8 collects squared differences and ftmp10 / ftmp12 collect the

+// 16-bit positive / negative partial sums is inferred from the reduction code

+// after the loop - confirm against the macro. Also confirm that the load

+// width of MMI_L matches `int high` on every supported ABI (the neighbouring

+// vpx_mse16x takes `uint64_t high`).

+static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride,

+                                       const uint8_t *b, int b_stride,

+                                       uint32_t *sse, int high) {

+  int sum;

+  double ftmp[13];

+  uint32_t tmp[3];

+  *sse = 0;

+  __asm__ volatile (

+    // ftmp11 = 32: shift count used by the dsrl reductions after the loop.

+    "li         %[tmp0],    0x20                                \n\t"

+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"

+    // tmp0 = row counter, loaded from the address of `high`.

+    MMI_L(%[tmp0], %[high], 0x00)

+    // Clear ftmp0 (used as the zero operand below) and the accumulators.

+    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"

+    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"

+    "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"

+    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"

+    "xor        %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"

+    "1:                                                         \n\t"

+    // One row: four 8-byte load pairs from a and b at byte offsets 0x00,

+    // 0x08, 0x10 and 0x18, each followed by the accumulation macro.

+    "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x07(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x00(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8

+    "gsldlc1    %[ftmp1],   0x0f(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x08(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x0f(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x08(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8

+    "gsldlc1    %[ftmp1],   0x17(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x10(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x17(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x10(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8

+    "gsldlc1    %[ftmp1],   0x1f(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x18(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x1f(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x18(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8

+    // Next row: decrement the counter and step both pointers by their strides.

+    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    MMI_ADDU(%[b], %[b], %[b_stride])

+    "bnez       %[tmp0],    1b                                  \n\t"

+    // *sse = high word of ftmp8 + low word of ftmp8.

+    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"

+    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"

+    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"

+    // Interleave the halfwords of ftmp10 and ftmp12 with zero (ftmp0), which

+    // widens each 16-bit lane to a 32-bit word.

+    "punpcklhw  %[ftmp3],   %[ftmp10],      %[ftmp0]            \n\t"

+    "punpckhhw  %[ftmp4],   %[ftmp10],      %[ftmp0]            \n\t"

+    "punpcklhw  %[ftmp5],   %[ftmp12],      %[ftmp0]            \n\t"

+    "punpckhhw  %[ftmp6],   %[ftmp12],      %[ftmp0]            \n\t"

+    // ftmp10 = (widened ftmp10 lanes) - (widened ftmp12 lanes), per word.

+    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"

+    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp3]            \n\t"

+    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp4]            \n\t"

+    "psubw      %[ftmp10],  %[ftmp10],      %[ftmp5]            \n\t"

+    "psubw      %[ftmp10],  %[ftmp10],      %[ftmp6]            \n\t"

+    // sum = high word of ftmp10 + low word of ftmp10.

+    "dsrl       %[ftmp0],   %[ftmp10],      %[ftmp11]           \n\t"

+    "paddw      %[ftmp0],   %[ftmp0],       %[ftmp10]           \n\t"

+    "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"

+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),

+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),

+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),

+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),

+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),

+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),

+      [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),

+      [a]"+&r"(a),                      [b]"+&r"(b)

+    : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),

+      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)

+    : "memory"

+  );

+  // 32 * high is the pixel count; the product is widened to 64 bits first.

+  return *sse - (((int64_t)sum * sum) / (32 * high));

+}

+// Generates vpx_variance32x<ROWS>_mmi(): a fixed-height entry point that

+// forwards its arguments to vpx_variance32x() with ROWS as the row count.

+#define VPX_VARIANCE32XN(ROWS)                                         \

+  uint32_t vpx_variance32x##ROWS##_mmi(const uint8_t *a, int a_stride, \

+                                       const uint8_t *b, int b_stride, \

+                                       uint32_t *sse) {                \

+    const uint32_t variance =                                          \

+        vpx_variance32x(a, a_stride, b, b_stride, sse, ROWS);          \

+    return variance;                                                   \

+  }

+VPX_VARIANCE32XN(32)

+VPX_VARIANCE32XN(16)

+// Variance of a 16-pixel-wide block with `high` rows.

+//

+// Writes the folded ftmp8 accumulator to *sse, the folded difference of the

+// ftmp10 and ftmp12 accumulators to `sum`, and returns

+// *sse - sum * sum / (16 * high).

+//

+// NOTE(review): VARIANCE_SSE_SUM_8 and MMI_L are defined outside this chunk;

+// the accumulator roles (ftmp8 = squared differences, ftmp10 / ftmp12 =

+// positive / negative 16-bit partial sums) are inferred from the reduction

+// after the loop - confirm. Also confirm that the load width of MMI_L matches

+// `int high` on every supported ABI.

+static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride,

+                                       const uint8_t *b, int b_stride,

+                                       uint32_t *sse, int high) {

+  int sum;

+  double ftmp[13];

+  uint32_t tmp[3];

+  *sse = 0;

+  __asm__ volatile (

+    // ftmp11 = 32: shift count used by the dsrl reductions after the loop.

+    "li         %[tmp0],    0x20                                \n\t"

+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"

+    // tmp0 = row counter, loaded from the address of `high`.

+    MMI_L(%[tmp0], %[high], 0x00)

+    // Clear ftmp0 (used as the zero operand below) and the accumulators.

+    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"

+    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"

+    "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"

+    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"

+    "xor        %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"

+    "1:                                                         \n\t"

+    // One row: two 8-byte load pairs from a and b at byte offsets 0x00 and

+    // 0x08, each followed by the accumulation macro.

+    "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x07(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x00(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8

+    "gsldlc1    %[ftmp1],   0x0f(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x08(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x0f(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x08(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8

+    // Next row: decrement the counter and step both pointers by their strides.

+    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    MMI_ADDU(%[b], %[b], %[b_stride])

+    "bnez       %[tmp0],    1b                                  \n\t"

+    // *sse = high word of ftmp8 + low word of ftmp8.

+    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"

+    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"

+    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"

+    // Interleave the halfwords of ftmp10 and ftmp12 with zero (ftmp0), which

+    // widens each 16-bit lane to a 32-bit word.

+    "punpcklhw  %[ftmp3],   %[ftmp10],      %[ftmp0]            \n\t"

+    "punpckhhw  %[ftmp4],   %[ftmp10],      %[ftmp0]            \n\t"

+    "punpcklhw  %[ftmp5],   %[ftmp12],      %[ftmp0]            \n\t"

+    "punpckhhw  %[ftmp6],   %[ftmp12],      %[ftmp0]            \n\t"

+    // ftmp10 = (widened ftmp10 lanes) - (widened ftmp12 lanes), per word.

+    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"

+    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp3]            \n\t"

+    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp4]            \n\t"

+    "psubw      %[ftmp10],  %[ftmp10],      %[ftmp5]            \n\t"

+    "psubw      %[ftmp10],  %[ftmp10],      %[ftmp6]            \n\t"

+    // sum = high word of ftmp10 + low word of ftmp10.

+    "dsrl       %[ftmp0],   %[ftmp10],      %[ftmp11]           \n\t"

+    "paddw      %[ftmp0],   %[ftmp0],       %[ftmp10]           \n\t"

+    "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"

+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),

+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),

+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),

+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),

+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),

+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),

+      [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),

+      [a]"+&r"(a),                      [b]"+&r"(b)

+    : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),

+      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)

+    : "memory"

+  );

+  // 16 * high is the pixel count; the product is widened to 64 bits first.

+  return *sse - (((int64_t)sum * sum) / (16 * high));

+}

+// Generates vpx_variance16x<ROWS>_mmi(): a fixed-height entry point that

+// forwards its arguments to vpx_variance16x() with ROWS as the row count.

+#define VPX_VARIANCE16XN(ROWS)                                         \

+  uint32_t vpx_variance16x##ROWS##_mmi(const uint8_t *a, int a_stride, \

+                                       const uint8_t *b, int b_stride, \

+                                       uint32_t *sse) {                \

+    const uint32_t variance =                                          \

+        vpx_variance16x(a, a_stride, b, b_stride, sse, ROWS);          \

+    return variance;                                                   \

+  }

+VPX_VARIANCE16XN(32)

+VPX_VARIANCE16XN(16)

+VPX_VARIANCE16XN(8)

+// Variance of an 8-pixel-wide block with `high` rows.

+//

+// Writes the folded ftmp8 accumulator to *sse, the folded difference of the

+// ftmp10 and ftmp12 accumulators to `sum`, and returns

+// *sse - sum * sum / (8 * high).

+//

+// NOTE(review): VARIANCE_SSE_SUM_8 and MMI_L are defined outside this chunk;

+// the accumulator roles (ftmp8 = squared differences, ftmp10 / ftmp12 =

+// positive / negative 16-bit partial sums) are inferred from the reduction

+// after the loop - confirm. Also confirm that the load width of MMI_L matches

+// `int high` on every supported ABI.

+static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride,

+                                      const uint8_t *b, int b_stride,

+                                      uint32_t *sse, int high) {

+  int sum;

+  double ftmp[13];

+  uint32_t tmp[3];

+  *sse = 0;

+  __asm__ volatile (

+    // ftmp11 = 32: shift count used by the dsrl reductions after the loop.

+    "li         %[tmp0],    0x20                                \n\t"

+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"

+    // tmp0 = row counter, loaded from the address of `high`.

+    MMI_L(%[tmp0], %[high], 0x00)

+    // Clear ftmp0 (used as the zero operand below) and the accumulators.

+    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"

+    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"

+    "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"

+    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"

+    "xor        %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"

+    "1:                                                         \n\t"

+    // One row: a single 8-byte load pair from a and from b, then the

+    // accumulation macro.

+    "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x07(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x00(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_8

+    // Next row: decrement the counter and step both pointers by their strides.

+    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    MMI_ADDU(%[b], %[b], %[b_stride])

+    "bnez       %[tmp0],    1b                                  \n\t"

+    // *sse = high word of ftmp8 + low word of ftmp8.

+    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"

+    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"

+    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"

+    // Interleave the halfwords of ftmp10 and ftmp12 with zero (ftmp0), which

+    // widens each 16-bit lane to a 32-bit word.

+    "punpcklhw  %[ftmp3],   %[ftmp10],      %[ftmp0]            \n\t"

+    "punpckhhw  %[ftmp4],   %[ftmp10],      %[ftmp0]            \n\t"

+    "punpcklhw  %[ftmp5],   %[ftmp12],      %[ftmp0]            \n\t"

+    "punpckhhw  %[ftmp6],   %[ftmp12],      %[ftmp0]            \n\t"

+    // ftmp10 = (widened ftmp10 lanes) - (widened ftmp12 lanes), per word.

+    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"

+    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp3]            \n\t"

+    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp4]            \n\t"

+    "psubw      %[ftmp10],  %[ftmp10],      %[ftmp5]            \n\t"

+    "psubw      %[ftmp10],  %[ftmp10],      %[ftmp6]            \n\t"

+    // sum = high word of ftmp10 + low word of ftmp10.

+    "dsrl       %[ftmp0],   %[ftmp10],      %[ftmp11]           \n\t"

+    "paddw      %[ftmp0],   %[ftmp0],       %[ftmp10]           \n\t"

+    "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"

+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),

+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),

+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),

+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),

+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),

+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),

+      [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),

+      [a]"+&r"(a),                      [b]"+&r"(b)

+    : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),

+      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)

+    : "memory"

+  );

+  // 8 * high is the pixel count; the product is widened to 64 bits first.

+  return *sse - (((int64_t)sum * sum) / (8 * high));

+}

+// Generates vpx_variance8x<ROWS>_mmi(): a fixed-height entry point that

+// forwards its arguments to vpx_variance8x() with ROWS as the row count.

+#define VPX_VARIANCE8XN(ROWS)                                         \

+  uint32_t vpx_variance8x##ROWS##_mmi(const uint8_t *a, int a_stride, \

+                                      const uint8_t *b, int b_stride, \

+                                      uint32_t *sse) {                \

+    const uint32_t variance =                                         \

+        vpx_variance8x(a, a_stride, b, b_stride, sse, ROWS);          \

+    return variance;                                                  \

+  }

+VPX_VARIANCE8XN(16)

+VPX_VARIANCE8XN(8)

+VPX_VARIANCE8XN(4)

+// Variance of a 4-pixel-wide block with `high` rows.

+//

+// Writes the folded ftmp6 accumulator to *sse, the folded difference of the

+// ftmp7 and ftmp8 accumulators to `sum`, and returns

+// *sse - sum * sum / (4 * high).

+//

+// NOTE(review): VARIANCE_SSE_SUM_4 and MMI_L are defined outside this chunk;

+// the accumulator roles (ftmp6 = squared differences, ftmp7 / ftmp8 =

+// positive / negative 16-bit partial sums) are inferred from the reduction

+// after the loop - confirm. Also confirm that the load width of MMI_L matches

+// `int high` on every supported ABI.

+// NOTE(review): each row is fetched with an 8-byte load pair although the

+// block is 4 pixels wide; how many of those bytes the macro consumes is not

+// visible here - confirm that reading 4 bytes past the end of the last row is

+// acceptable for every caller's buffer.

+static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride,

+                                      const uint8_t *b, int b_stride,

+                                      uint32_t *sse, int high) {

+  int sum;

+  double ftmp[12];

+  uint32_t tmp[3];

+  *sse = 0;

+  __asm__ volatile (

+    // ftmp10 = 32: shift count used by the dsrl reductions after the loop.

+    "li         %[tmp0],    0x20                                \n\t"

+    "mtc1       %[tmp0],    %[ftmp10]                           \n\t"

+    // tmp0 = row counter, loaded from the address of `high`.

+    MMI_L(%[tmp0], %[high], 0x00)

+    // Clear ftmp0 (used as the zero operand below) and the accumulators.

+    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"

+    "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"

+    "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"

+    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"

+    "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"

+    "1:                                                         \n\t"

+    // One row: a single load pair from a and from b, then the accumulation

+    // macro.

+    "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t"

+    "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t"

+    "gsldlc1    %[ftmp2],   0x07(%[b])                          \n\t"

+    "gsldrc1    %[ftmp2],   0x00(%[b])                          \n\t"

+    VARIANCE_SSE_SUM_4

+    // Next row: decrement the counter and step both pointers by their strides.

+    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    MMI_ADDU(%[b], %[b], %[b_stride])

+    "bnez       %[tmp0],    1b                                  \n\t"

+    // *sse = high word of ftmp6 + low word of ftmp6.

+    "dsrl       %[ftmp9],   %[ftmp6],       %[ftmp10]           \n\t"

+    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp6]            \n\t"

+    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"

+    // Interleave the halfwords of ftmp7 and ftmp8 with zero (ftmp0), which

+    // widens each 16-bit lane to a 32-bit word.

+    "punpcklhw  %[ftmp3],   %[ftmp7],       %[ftmp0]            \n\t"

+    "punpckhhw  %[ftmp4],   %[ftmp7],       %[ftmp0]            \n\t"

+    "punpcklhw  %[ftmp5],   %[ftmp8],       %[ftmp0]            \n\t"

+    "punpckhhw  %[ftmp6],   %[ftmp8],       %[ftmp0]            \n\t"

+    // ftmp7 = (widened ftmp7 lanes) - (widened ftmp8 lanes), per word.

+    "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"

+    "paddw      %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"

+    "paddw      %[ftmp7],   %[ftmp7],       %[ftmp4]            \n\t"

+    "psubw      %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"

+    "psubw      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"

+    // sum = high word of ftmp7 + low word of ftmp7.

+    "dsrl       %[ftmp0],   %[ftmp7],       %[ftmp10]           \n\t"

+    "paddw      %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"

+    "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"

+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),

+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),

+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),

+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),

+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),

+      [ftmp10]"=&f"(ftmp[10]),

+      [tmp0]"=&r"(tmp[0]),

+      [a]"+&r"(a),                      [b]"+&r"(b)

+    : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),

+      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)

+    : "memory"

+  );

+  // 4 * high is the pixel count; the product is widened to 64 bits first.

+  return *sse - (((int64_t)sum * sum) / (4 * high));

+}

+// Generates vpx_variance4x<ROWS>_mmi(): a fixed-height entry point that

+// forwards its arguments to vpx_variance4x() with ROWS as the row count.

+#define VPX_VARIANCE4XN(ROWS)                                         \

+  uint32_t vpx_variance4x##ROWS##_mmi(const uint8_t *a, int a_stride, \

+                                      const uint8_t *b, int b_stride, \

+                                      uint32_t *sse) {                \

+    const uint32_t variance =                                         \

+        vpx_variance4x(a, a_stride, b, b_stride, sse, ROWS);          \

+    return variance;                                                  \

+  }

+VPX_VARIANCE4XN(8)

+VPX_VARIANCE4XN(4)

 static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride,

                                   const uint8_t *b, int b_stride, uint32_t *sse,

                                   uint64_t high) {

@@ -144,3 +1005,298 @@

 vpx_mse8xN(16);

 vpx_mse8xN(8);

+// Generates vpx_sub_pixel_variance<COLS>x<ROWS>_mmi(). The generated function

+// runs the first filter pass over ROWS + 1 source rows with the xoffset

+// filter, runs the second pass over that output with the yoffset filter to

+// produce a COLS x ROWS 8-bit block, and returns the variance of that block

+// against b via vpx_variance<COLS>x<ROWS>_mmi().

+#define SUBPIX_VAR(COLS, ROWS)                                               \

+  uint32_t vpx_sub_pixel_variance##COLS##x##ROWS##_mmi(                      \

+      const uint8_t *a, int a_stride, int xoffset, int yoffset,              \

+      const uint8_t *b, int b_stride, uint32_t *sse) {                       \

+    uint16_t first_pass_out[(ROWS + 1) * COLS];                              \

+    uint8_t filtered[ROWS * COLS];                                           \

+    const uint8_t *const x_filter = bilinear_filters[xoffset];               \

+    const uint8_t *const y_filter = bilinear_filters[yoffset];               \

+                                                                             \

+    var_filter_block2d_bil_first_pass(a, first_pass_out, a_stride, 1,        \

+                                      ROWS + 1, COLS, x_filter);             \

+    var_filter_block2d_bil_second_pass(first_pass_out, filtered, COLS, COLS, \

+                                       ROWS, COLS, y_filter);                \

+                                                                             \

+    return vpx_variance##COLS##x##ROWS##_mmi(filtered, COLS, b, b_stride,    \

+                                             sse);                           \

+  }

+SUBPIX_VAR(64, 64)

+SUBPIX_VAR(64, 32)

+SUBPIX_VAR(32, 64)

+SUBPIX_VAR(32, 32)

+SUBPIX_VAR(32, 16)

+SUBPIX_VAR(16, 32)

+// Two-pass bilinear filter for a 16-pixel-wide block.

+//

+// a / a_stride     : source pixels and the byte step between source rows.

+// xoffset, yoffset : indices into bilinear_filters[] selecting the two-tap

+//                    filters for the first and the second pass.

+// temp2            : output buffer; the write pointer advances 16 bytes per

+//                    output row.

+// counter          : number of loop iterations. Two output rows are produced

+//                    before the loop and two per iteration, so the block has

+//                    2 + 2 * counter rows (callers pass (H - 2) / 2). The

+//                    counter is decremented before it is tested, so it must

+//                    be at least 1.

+//

+// The VAR_FILTER_BLOCK2D_BIL_*_16_{A,B} macros are defined outside this

+// chunk. The call pattern ping-pongs between the A and B first-pass macros for

+// successive source rows and pairs each with the opposite second-pass macro.

+// The loop must therefore start with FIRST_PASS_B / SECOND_PASS_A, because the

+// prologue ends with FIRST_PASS_A / SECOND_PASS_B. This is the same order the

+// 8- and 4-pixel-wide variants use; the previous A-first order repeated the

+// prologue's last pair on loop entry.

+//

+// NOTE(review): the pshufh instructions write to filter_x0 / filter_x1 /

+// filter_y0 / filter_y1, which are declared as input-only operands - confirm

+// this is safe with the compilers in use.

+static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride,

+                                              int xoffset, int yoffset,

+                                              uint8_t *temp2, int counter) {

+  uint8_t *temp2_ptr = temp2;

+  mips_reg l_counter = counter;

+  double ftmp[15];

+  mips_reg tmp[2];

+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };

+  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };

+  const uint8_t *filter_x = bilinear_filters[xoffset];

+  const uint8_t *filter_y = bilinear_filters[yoffset];

+  __asm__ volatile (

+    // ftmp0 = 0 (also the pshufh selector); ftmp14 = 7.

+    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"

+    MMI_LI(%[tmp0], 0x07)

+    MMI_MTC1(%[tmp0], %[ftmp14])

+    // Replicate halfword 0 of each filter tap into all four halfword lanes.

+    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"

+    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"

+    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"

+    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"

+    // fdata3: fdata3[0] ~ fdata3[15]

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A

+    // fdata3 +a_stride*1: fdata3[0] ~ fdata3[15]

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B

+    // temp2: temp2[0] ~ temp2[15]

+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A

+    // fdata3 +a_stride*2: fdata3[0] ~ fdata3[15]

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A

+    // temp2+16*1: temp2[0] ~ temp2[15]

+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)

+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B

+    "1:                                                         \n\t"

+    // Odd source row: refresh the B side, emit an output row with pass A.

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B

+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)

+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A

+    // Even source row: refresh the A side, emit an output row with pass B.

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A

+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)

+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B

+    "addiu      %[counter], %[counter],     -0x01               \n\t"

+    "bnez       %[counter], 1b                                  \n\t"

+    : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),

+      [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),

+      [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),

+      [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),

+      [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),

+      [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),

+      [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr),

+      [counter]"+&r"(l_counter)

+    : [filter_x0] "f"((uint64_t)filter_x[0]),

+      [filter_x1] "f"((uint64_t)filter_x[1]),

+      [filter_y0] "f"((uint64_t)filter_y[0]),

+      [filter_y1] "f"((uint64_t)filter_y[1]),

+      [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),

+      [mask] "f"(mask)

+    : "memory"

+  );

+}

+// Generates vpx_sub_pixel_variance16x<ROWS>_mmi(). The generated function

+// filters the source into a 16 x ROWS 8-bit block with

+// var_filter_block2d_bil_16x() (loop count (ROWS - 2) / 2) and returns the

+// variance of that block against b via vpx_variance16x<ROWS>_mmi().

+#define SUBPIX_VAR16XN(ROWS)                                            \

+  uint32_t vpx_sub_pixel_variance16x##ROWS##_mmi(                       \

+      const uint8_t *a, int a_stride, int xoffset, int yoffset,         \

+      const uint8_t *b, int b_stride, uint32_t *sse) {                  \

+    uint8_t filtered[16 * ROWS];                                        \

+    const int loop_count = (ROWS - 2) / 2;                              \

+                                                                        \

+    var_filter_block2d_bil_16x(a, a_stride, xoffset, yoffset, filtered, \

+                               loop_count);                             \

+    return vpx_variance16x##ROWS##_mmi(filtered, 16, b, b_stride, sse); \

+  }

+SUBPIX_VAR16XN(16)

+SUBPIX_VAR16XN(8)

+// Two-pass bilinear filter for an 8-pixel-wide block.

+//

+// a / a_stride     : source pixels and the byte step between source rows.

+// xoffset, yoffset : indices into bilinear_filters[] selecting the two-tap

+//                    filters for the first and the second pass.

+// temp2            : output buffer; the write pointer advances 8 bytes per

+//                    output row.

+// counter          : number of loop iterations. Two output rows are produced

+//                    before the loop and two per iteration, so the block has

+//                    2 + 2 * counter rows (callers pass (H - 2) / 2). The

+//                    counter is decremented before it is tested, so it must

+//                    be at least 1.

+//

+// The VAR_FILTER_BLOCK2D_BIL_*_8_{A,B} macros are defined outside this chunk;

+// the call pattern alternates the A and B first-pass macros for successive

+// source rows and pairs each with the opposite second-pass macro.

+//

+// NOTE(review): the pshufh instructions write to filter_x0 / filter_x1 /

+// filter_y0 / filter_y1, which are declared as input-only operands - confirm

+// this is safe with the compilers in use.

+static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride,

+                                             int xoffset, int yoffset,

+                                             uint8_t *temp2, int counter) {

+  uint8_t *temp2_ptr = temp2;

+  mips_reg l_counter = counter;

+  double ftmp[15];

+  mips_reg tmp[2];

+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };

+  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };

+  const uint8_t *filter_x = bilinear_filters[xoffset];

+  const uint8_t *filter_y = bilinear_filters[yoffset];

+  __asm__ volatile (

+    // ftmp0 = 0 (also the pshufh selector); ftmp14 = 7.

+    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"

+    MMI_LI(%[tmp0], 0x07)

+    MMI_MTC1(%[tmp0], %[ftmp14])

+    // Replicate halfword 0 of each filter tap into all four halfword lanes.

+    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"

+    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"

+    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"

+    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"

+    // fdata3: fdata3[0] ~ fdata3[7]

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A

+    // fdata3 +a_stride*1: fdata3[0] ~ fdata3[7]

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B

+    // temp2: temp2[0] ~ temp2[7]

+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A

+    // fdata3 +a_stride*2: fdata3[0] ~ fdata3[7]

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A

+    // temp2+8*1: temp2[0] ~ temp2[7]

+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)

+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B

+    "1:                                                         \n\t"

+    // Odd source row: refresh the B side, emit an output row with pass A.

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B

+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)

+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A

+    // Even source row: refresh the A side, emit an output row with pass B.

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A

+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)

+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B

+    "addiu      %[counter], %[counter],     -0x01               \n\t"

+    "bnez       %[counter], 1b                                  \n\t"

+    : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),

+      [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),

+      [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),

+      [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),

+      [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),

+      [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),

+      [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr),

+      [counter]"+&r"(l_counter)

+    : [filter_x0] "f"((uint64_t)filter_x[0]),

+      [filter_x1] "f"((uint64_t)filter_x[1]),

+      [filter_y0] "f"((uint64_t)filter_y[0]),

+      [filter_y1] "f"((uint64_t)filter_y[1]),

+      [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),

+      [mask] "f"(mask)

+    : "memory"

+  );

+}

+// Generates vpx_sub_pixel_variance8x<ROWS>_mmi(). The generated function

+// filters the source into an 8 x ROWS 8-bit block with

+// var_filter_block2d_bil_8x() (loop count (ROWS - 2) / 2) and returns the

+// variance of that block against b via vpx_variance8x<ROWS>_mmi().

+#define SUBPIX_VAR8XN(ROWS)                                            \

+  uint32_t vpx_sub_pixel_variance8x##ROWS##_mmi(                       \

+      const uint8_t *a, int a_stride, int xoffset, int yoffset,        \

+      const uint8_t *b, int b_stride, uint32_t *sse) {                 \

+    uint8_t filtered[8 * ROWS];                                        \

+    const int loop_count = (ROWS - 2) / 2;                             \

+                                                                       \

+    var_filter_block2d_bil_8x(a, a_stride, xoffset, yoffset, filtered, \

+                              loop_count);                             \

+    return vpx_variance8x##ROWS##_mmi(filtered, 8, b, b_stride, sse);  \

+  }

+SUBPIX_VAR8XN(16)

+SUBPIX_VAR8XN(8)

+SUBPIX_VAR8XN(4)

+// Two-pass bilinear filter for a 4-pixel-wide block.

+//

+// a / a_stride     : source pixels and the byte step between source rows.

+// xoffset, yoffset : indices into bilinear_filters[] selecting the two-tap

+//                    filters for the first and the second pass.

+// temp2            : output buffer; the write pointer advances 4 bytes per

+//                    output row.

+// counter          : number of loop iterations. Two output rows are produced

+//                    before the loop and two per iteration, so the block has

+//                    2 + 2 * counter rows (callers pass (H - 2) / 2). The

+//                    counter is decremented before it is tested, so it must

+//                    be at least 1.

+//

+// The VAR_FILTER_BLOCK2D_BIL_*_4_{A,B} macros are defined outside this chunk;

+// the call pattern alternates the A and B first-pass macros for successive

+// source rows and pairs each with the opposite second-pass macro.

+//

+// NOTE(review): the pshufh instructions write to filter_x0 / filter_x1 /

+// filter_y0 / filter_y1, which are declared as input-only operands - confirm

+// this is safe with the compilers in use.

+static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride,

+                                             int xoffset, int yoffset,

+                                             uint8_t *temp2, int counter) {

+  uint8_t *temp2_ptr = temp2;

+  mips_reg l_counter = counter;

+  double ftmp[7];

+  mips_reg tmp[2];

+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };

+  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };

+  const uint8_t *filter_x = bilinear_filters[xoffset];

+  const uint8_t *filter_y = bilinear_filters[yoffset];

+  __asm__ volatile (

+    // ftmp0 = 0 (also the pshufh selector); ftmp6 = 7.

+    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"

+    MMI_LI(%[tmp0], 0x07)

+    MMI_MTC1(%[tmp0], %[ftmp6])

+    // Replicate halfword 0 of each filter tap into all four halfword lanes.

+    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"

+    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"

+    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"

+    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"

+    // fdata3: fdata3[0] ~ fdata3[3]

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A

+    // fdata3 +a_stride*1: fdata3[0] ~ fdata3[3]

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B

+    // temp2: temp2[0] ~ temp2[7]

+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A

+    // fdata3 +a_stride*2: fdata3[0] ~ fdata3[3]

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A

+    // temp2+4*1: temp2[0] ~ temp2[7]

+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)

+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B

+    "1:                                                         \n\t"

+    // Odd source row: refresh the B side, emit an output row with pass A.

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B

+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)

+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A

+    // Even source row: refresh the A side, emit an output row with pass B.

+    MMI_ADDU(%[a], %[a], %[a_stride])

+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A

+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)

+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B

+    "addiu      %[counter], %[counter],     -0x01               \n\t"

+    "bnez       %[counter], 1b                                  \n\t"

+    : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),

+      [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),

+      [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [a] "+&r"(a),

+      [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter)

+    : [filter_x0] "f"((uint64_t)filter_x[0]),

+      [filter_x1] "f"((uint64_t)filter_x[1]),

+      [filter_y0] "f"((uint64_t)filter_y[0]),

+      [filter_y1] "f"((uint64_t)filter_y[1]),

+      [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),

+      [mask] "f"(mask)

+    : "memory"

+  );

+}

+// Generates vpx_sub_pixel_variance4x<ROWS>_mmi(). The generated function

+// filters the source into a 4 x ROWS 8-bit block with

+// var_filter_block2d_bil_4x() (loop count (ROWS - 2) / 2) and returns the

+// variance of that block against b via vpx_variance4x<ROWS>_mmi().

+#define SUBPIX_VAR4XN(ROWS)                                            \

+  uint32_t vpx_sub_pixel_variance4x##ROWS##_mmi(                       \

+      const uint8_t *a, int a_stride, int xoffset, int yoffset,        \

+      const uint8_t *b, int b_stride, uint32_t *sse) {                 \

+    uint8_t filtered[4 * ROWS];                                        \

+    const int loop_count = (ROWS - 2) / 2;                             \

+                                                                       \

+    var_filter_block2d_bil_4x(a, a_stride, xoffset, yoffset, filtered, \

+                              loop_count);                             \

+    return vpx_variance4x##ROWS##_mmi(filtered, 4, b, b_stride, sse);  \

+  }

+SUBPIX_VAR4XN(8)

+SUBPIX_VAR4XN(4)

+// Generates vpx_sub_pixel_avg_variance<COLS>x<ROWS>_mmi(). The generated

+// function runs the two bilinear filter passes (xoffset filter first, then

+// yoffset filter) to build a COLS x ROWS 8-bit block, combines that block

+// with second_pred through vpx_comp_avg_pred_c(), and returns the variance of

+// the combined block against b via vpx_variance<COLS>x<ROWS>_mmi().

+#define SUBPIX_AVG_VAR(COLS, ROWS)                                           \

+  uint32_t vpx_sub_pixel_avg_variance##COLS##x##ROWS##_mmi(                  \

+      const uint8_t *a, int a_stride, int xoffset, int yoffset,              \

+      const uint8_t *b, int b_stride, uint32_t *sse,                         \

+      const uint8_t *second_pred) {                                          \

+    uint16_t first_pass_out[(ROWS + 1) * COLS];                              \

+    uint8_t filtered[ROWS * COLS];                                           \

+    DECLARE_ALIGNED(16, uint8_t, averaged[ROWS * COLS]);                     \

+    const uint8_t *const x_filter = bilinear_filters[xoffset];               \

+    const uint8_t *const y_filter = bilinear_filters[yoffset];               \

+                                                                             \

+    var_filter_block2d_bil_first_pass(a, first_pass_out, a_stride, 1,        \

+                                      ROWS + 1, COLS, x_filter);             \

+    var_filter_block2d_bil_second_pass(first_pass_out, filtered, COLS, COLS, \

+                                       ROWS, COLS, y_filter);                \

+                                                                             \

+    vpx_comp_avg_pred_c(averaged, second_pred, COLS, ROWS, filtered, COLS);  \

+                                                                             \

+    return vpx_variance##COLS##x##ROWS##_mmi(averaged, COLS, b, b_stride,    \

+                                             sse);                           \

+  }

+SUBPIX_AVG_VAR(64, 64)

+SUBPIX_AVG_VAR(64, 32)

+SUBPIX_AVG_VAR(32, 64)

+SUBPIX_AVG_VAR(32, 32)

+SUBPIX_AVG_VAR(32, 16)

+SUBPIX_AVG_VAR(16, 32)

+SUBPIX_AVG_VAR(16, 16)

+SUBPIX_AVG_VAR(16, 8)

+SUBPIX_AVG_VAR(8, 16)

+SUBPIX_AVG_VAR(8, 8)

+SUBPIX_AVG_VAR(8, 4)

+SUBPIX_AVG_VAR(4, 8)

+SUBPIX_AVG_VAR(4, 4)

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -1053,43 +1053,43 @@

 # Variance

 add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;

+  specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi/;

 add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance64x32 sse2 avx2 neon msa/;

+  specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi/;

 add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance32x64 sse2 neon msa/;

+  specialize qw/vpx_variance32x64 sse2 neon msa mmi/;

 add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance32x32 sse2 avx2 neon msa/;

+  specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi/;

 add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance32x16 sse2 avx2 neon msa/;

+  specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi/;

 add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance16x32 sse2 neon msa/;

+  specialize qw/vpx_variance16x32 sse2 neon msa mmi/;

 add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance16x16 sse2 avx2 neon msa/;

+  specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi/;

 add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance16x8 sse2 neon msa/;

+  specialize qw/vpx_variance16x8 sse2 neon msa mmi/;

 add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance8x16 sse2 neon msa/;

+  specialize qw/vpx_variance8x16 sse2 neon msa mmi/;

 add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance8x8 sse2 neon msa/;

+  specialize qw/vpx_variance8x8 sse2 neon msa mmi/;

 add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance8x4 sse2 neon msa/;

+  specialize qw/vpx_variance8x4 sse2 neon msa mmi/;

 add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance4x8 sse2 neon msa/;

+  specialize qw/vpx_variance4x8 sse2 neon msa mmi/;

 add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance4x4 sse2 neon msa/;

+  specialize qw/vpx_variance4x4 sse2 neon msa mmi/;

 # Specialty Variance

@@ -1125,82 +1125,82 @@

 # Subpixel Variance

 add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";

-  specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";

-  specialize qw/vpx_sub_pixel_variance64x32 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_variance64x32 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";

-  specialize qw/vpx_sub_pixel_variance32x64 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_variance32x64 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";

-  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";

-  specialize qw/vpx_sub_pixel_variance32x16 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_variance32x16 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";

-  specialize qw/vpx_sub_pixel_variance16x32 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_variance16x32 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";

-  specialize qw/vpx_sub_pixel_variance16x16 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";

-  specialize qw/vpx_sub_pixel_variance16x8 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_variance16x8 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";

-  specialize qw/vpx_sub_pixel_variance8x16 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_variance8x16 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";

-  specialize qw/vpx_sub_pixel_variance8x8 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";

-  specialize qw/vpx_sub_pixel_variance8x4 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_variance8x4 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";

-  specialize qw/vpx_sub_pixel_variance4x8 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_variance4x8 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";

-  specialize qw/vpx_sub_pixel_variance4x4 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_variance4x4 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";

-  specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";

-  specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";

-  specialize qw/vpx_sub_pixel_avg_variance32x64 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_avg_variance32x64 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";

-  specialize qw/vpx_sub_pixel_avg_variance32x32 neon avx2 msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_avg_variance32x32 neon avx2 msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";

-  specialize qw/vpx_sub_pixel_avg_variance32x16 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_avg_variance32x16 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";

-  specialize qw/vpx_sub_pixel_avg_variance16x32 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_avg_variance16x32 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";

-  specialize qw/vpx_sub_pixel_avg_variance16x16 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_avg_variance16x16 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";

-  specialize qw/vpx_sub_pixel_avg_variance16x8 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_avg_variance16x8 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";

-  specialize qw/vpx_sub_pixel_avg_variance8x16 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_avg_variance8x16 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";

-  specialize qw/vpx_sub_pixel_avg_variance8x8 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_avg_variance8x8 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";

-  specialize qw/vpx_sub_pixel_avg_variance8x4 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_avg_variance8x4 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";

-  specialize qw/vpx_sub_pixel_avg_variance4x8 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_avg_variance4x8 neon msa mmi sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";

-  specialize qw/vpx_sub_pixel_avg_variance4x4 neon msa sse2 ssse3/;

+  specialize qw/vpx_sub_pixel_avg_variance4x4 neon msa mmi sse2 ssse3/;

 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

   add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

--- a/vpx_ports/asmdefs_mmi.h

+++ b/vpx_ports/asmdefs_mmi.h

@@ -40,10 +40,10 @@

   "dsll        " #reg1 ",       " #reg2 ",       " #shift "        \n\t"

 #define MMI_MTC1(reg, fp) \

-  "dmtc1       " #reg "         " #fp "                          \n\t"

+  "dmtc1       " #reg ",        " #fp "                            \n\t"

 #define MMI_LI(reg, immediate) \

-  "dli         " #reg "         " #immediate "                   \n\t"

+  "dli         " #reg ",        " #immediate "                     \n\t"

 #else

 #define mips_reg int32_t

@@ -69,10 +69,10 @@

   "sll         " #reg1 ",       " #reg2 ",       " #shift "        \n\t"

 #define MMI_MTC1(reg, fp) \

-  "mtc1        " #reg "         " #fp "                          \n\t"

+  "mtc1        " #reg ",        " #fp "                            \n\t"

 #define MMI_LI(reg, immediate) \

-  "li          " #reg "         " #immediate "                   \n\t"

+  "li          " #reg ",        " #immediate "                     \n\t"

 #endif /* HAVE_MIPS64 */

--

⑨