ref: 5b7882139c2f64d4ff830e1665e04a4d72f9e484
parent: a1cee8dc919df1980d802e1a9bce1259ec34cba8
author: jinbo <jinbo-hf@loongson.cn>
date: Wed Jul 1 04:56:25 EDT 2020
vp8,vpx_dsp:[loongson] fix bugs reported by clang 1. Adjust variable type to match clang compiler. Clang is more strict on the type of asm operands, float or double type variable should use constraint 'f', integer variable should use constraint 'r'. 2. Fix problem of using r-value in output operands. clang reports error: 'invalid use of a cast in a inline asm context requiring an l-value: remove the cast or build with -fheinous-gnu-extensions'. Change-Id: Iae9e08f55f249059066c391534013e320812463e
--- a/vp8/common/mips/mmi/idctllm_mmi.c
+++ b/vp8/common/mips/mmi/idctllm_mmi.c
@@ -41,14 +41,18 @@
int pred_stride, unsigned char *dst_ptr,
int dst_stride) {
double ftmp[12];
- uint32_t tmp[0];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_04) = { 0x0004000400040004ULL };
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = { 0x4e7b4e7b4e7b4e7bULL };
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = { 0x22a322a322a322a3ULL };
+ uint64_t tmp[1];
+ double ff_ph_04, ff_ph_4e7b, ff_ph_22a3;
__asm__ volatile (
+ "dli %[tmp0], 0x0004000400040004 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_04] \n\t"
+ "dli %[tmp0], 0x4e7b4e7b4e7b4e7b \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_4e7b] \n\t"
+ "dli %[tmp0], 0x22a322a322a322a3 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_22a3] \n\t"
MMI_LI(%[tmp0], 0x02)
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
@@ -186,9 +190,10 @@
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
[ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
[ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]),
- [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr)
- : [ip]"r"(input), [ff_ph_22a3]"f"(ff_ph_22a3),
- [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_04]"f"(ff_ph_04),
+ [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr),
+ [ff_ph_4e7b]"=&f"(ff_ph_4e7b), [ff_ph_04]"=&f"(ff_ph_04),
+ [ff_ph_22a3]"=&f"(ff_ph_22a3)
+ : [ip]"r"(input),
[pred_stride]"r"((mips_reg)pred_stride),
[dst_stride]"r"((mips_reg)dst_stride)
: "memory"
@@ -198,12 +203,13 @@
void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
int pred_stride, unsigned char *dst_ptr,
int dst_stride) {
- int a1 = ((input_dc + 4) >> 3);
- double ftmp[5];
+ int a0 = ((input_dc + 4) >> 3);
+ double a1, ftmp[5];
int low32;
__asm__ volatile (
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dmtc1 %[a0], %[a1] \n\t"
"pshufh %[a1], %[a1], %[ftmp0] \n\t"
"ulw %[low32], 0x00(%[pred_ptr]) \n\t"
"mtc1 %[low32], %[ftmp1] \n\t"
@@ -244,9 +250,9 @@
"gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
[ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32),
- [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr)
+ [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr), [a1]"=&f"(a1)
: [dst_stride]"r"((mips_reg)dst_stride),
- [pred_stride]"r"((mips_reg)pred_stride), [a1]"f"(a1)
+ [pred_stride]"r"((mips_reg)pred_stride), [a0]"r"(a0)
: "memory"
);
}
@@ -254,14 +260,15 @@
void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) {
int i;
int16_t output[16];
- double ftmp[12];
- uint32_t tmp[1];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_03) = { 0x0003000300030003ULL };
+ double ff_ph_03, ftmp[12];
+ uint64_t tmp[1];
__asm__ volatile (
+ "dli %[tmp0], 0x0003000300030003 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_03] \n\t"
MMI_LI(%[tmp0], 0x03)
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
"gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
"gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t"
@@ -317,8 +324,8 @@
[ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
[ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
- [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0])
- : [ip]"r"(input), [op]"r"(output), [ff_ph_03]"f"(ff_ph_03)
+ [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [ff_ph_03]"=&f"(ff_ph_03)
+ : [ip]"r"(input), [op]"r"(output)
: "memory"
);
--- a/vp8/common/mips/mmi/loopfilter_filters_mmi.c
+++ b/vp8/common/mips/mmi/loopfilter_filters_mmi.c
@@ -13,28 +13,25 @@
#include "vp8/common/onyxc_int.h"
#include "vpx_ports/asmdefs_mmi.h"
-DECLARE_ALIGNED(8, static const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
- ff_ph_003f) = { 0x003f003f003f003fULL };
-DECLARE_ALIGNED(8, static const uint64_t,
- ff_ph_0900) = { 0x0900090009000900ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
- ff_ph_1200) = { 0x1200120012001200ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
- ff_ph_1b00) = { 0x1b001b001b001b00ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_fe) = { 0xfefefefefefefefeULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_80) = { 0x8080808080808080ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_04) = { 0x0404040404040404ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_03) = { 0x0303030303030303ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_01) = { 0x0101010101010101ULL };
-
void vp8_loop_filter_horizontal_edge_mmi(
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) {
- uint32_t tmp[1];
+ uint64_t tmp[1];
mips_reg addr[2];
double ftmp[12];
+ double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
"1: \n\t"
"gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t"
"gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t"
@@ -91,9 +88,9 @@
"pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
"pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
- "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp10] \n\t"
+ "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
"psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"gsldlc1 %[ftmp10], 0x07(%[blimit]) \n\t"
@@ -134,8 +131,8 @@
"punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
"punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t"
- "li %[tmp0], 0x0b \n\t"
- "mtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
"psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
"packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t"
@@ -149,8 +146,8 @@
"packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
"paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
"psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
"psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
"packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
@@ -188,17 +185,18 @@
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
- [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_fe]"=&f"(ff_pb_fe),
+ [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_04]"=&f"(ff_pb_04),
+ [ff_pb_03]"=&f"(ff_pb_03)
: [limit]"r"(limit), [blimit]"r"(blimit),
[thresh]"r"(thresh),
[src_pixel_step]"r"((mips_reg)src_pixel_step),
[src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
- [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
- [ff_ph_01]"f"(ff_ph_01), [ff_pb_fe]"f"(ff_pb_fe),
- [ff_pb_80]"f"(ff_pb_80), [ff_pb_04]"f"(ff_pb_04),
- [ff_pb_03]"f"(ff_pb_03)
+ [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2))
: "memory"
);
+ /* clang-format on */
}
void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
@@ -206,11 +204,23 @@
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh, int count) {
- uint32_t tmp[1];
+ uint64_t tmp[1];
mips_reg addr[2];
double ftmp[13];
+ double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80;
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
@@ -315,8 +325,8 @@
/* abs (p1-q1) */
"pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
"pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp1] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp1] \n\t"
"psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t"
"paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
@@ -354,8 +364,8 @@
"paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t"
"paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t"
- "li %[tmp0], 0x0b \n\t"
- "mtc1 %[tmp0], %[ftmp7] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp7] \n\t"
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
"punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
@@ -379,8 +389,8 @@
"paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t"
"paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp7] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp7] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
"psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
"packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t"
@@ -450,15 +460,16 @@
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
- [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_03]"=&f"(ff_pb_03),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_fe]"=&f"(ff_pb_fe)
: [limit]"r"(limit), [blimit]"r"(blimit),
[thresh]"r"(thresh),
- [src_pixel_step]"r"((mips_reg)src_pixel_step),
- [ff_ph_01]"f"(ff_ph_01), [ff_pb_03]"f"(ff_pb_03),
- [ff_pb_04]"f"(ff_pb_04), [ff_pb_80]"f"(ff_pb_80),
- [ff_pb_fe]"f"(ff_pb_fe)
+ [src_pixel_step]"r"((mips_reg)src_pixel_step)
: "memory"
);
+ /* clang-format on */
}
/* clang-format off */
@@ -484,10 +495,29 @@
void vp8_mbloop_filter_horizontal_edge_mmi(
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) {
- uint32_t tmp[1];
+ uint64_t tmp[1];
double ftmp[13];
+ double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900,
+ ff_ph_1200, ff_ph_1b00;
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
+ "dli %[tmp0], 0x003f003f003f003f \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_003f] \n\t"
+ "dli %[tmp0], 0x0900090009000900 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_0900] \n\t"
+ "dli %[tmp0], 0x1200120012001200 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_1200] \n\t"
+ "dli %[tmp0], 0x1b001b001b001b00 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_1b00] \n\t"
MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
"1: \n\t"
@@ -550,8 +580,8 @@
"paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
"pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
"pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
"paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
"psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t"
@@ -584,8 +614,8 @@
"pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t"
"pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
- "li %[tmp0], 0x0b \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t"
VP8_MBLOOP_HPSRAB
"paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
@@ -593,8 +623,8 @@
VP8_MBLOOP_HPSRAB
"psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
- "li %[tmp0], 0x07 \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x07 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00])
@@ -649,18 +679,20 @@
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
- [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_03]"=&f"(ff_pb_03),
+ [ff_ph_0900]"=&f"(ff_ph_0900), [ff_ph_1b00]"=&f"(ff_ph_1b00),
+ [ff_ph_1200]"=&f"(ff_ph_1200), [ff_ph_003f]"=&f"(ff_ph_003f)
: [limit]"r"(limit), [blimit]"r"(blimit),
[thresh]"r"(thresh),
- [src_pixel_step]"r"((mips_reg)src_pixel_step),
- [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80),
- [ff_pb_04]"f"(ff_pb_04), [ff_pb_03]"f"(ff_pb_03),
- [ff_ph_0900]"f"(ff_ph_0900), [ff_ph_1b00]"f"(ff_ph_1b00),
- [ff_ph_1200]"f"(ff_ph_1200), [ff_ph_003f]"f"(ff_ph_003f)
+ [src_pixel_step]"r"((mips_reg)src_pixel_step)
: "memory"
);
+ /* clang-format on */
}
+/* clang-format off */
#define VP8_MBLOOP_VPSRAB_ADDH \
"pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
"pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \
@@ -673,15 +705,30 @@
"psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \
"psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \
"packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t"
+/* clang-format on */
void vp8_mbloop_filter_vertical_edge_mmi(
unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
const unsigned char *limit, const unsigned char *thresh, int count) {
mips_reg tmp[1];
- DECLARE_ALIGNED(8, const uint64_t, srct[1]);
+ DECLARE_ALIGNED(8, const uint64_t, srct[2]);
double ftmp[14];
+ double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0x003f003f003f003f \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_003f] \n\t"
+ "dli %[tmp0], 0x0900090009000900 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_0900] \n\t"
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
"1: \n\t"
@@ -783,8 +830,8 @@
/* abs (p1-q1) / 2 */
"pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
"pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp8] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
"psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
"paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t"
"psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t"
@@ -824,8 +871,8 @@
"pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t"
"paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t"
- "li %[tmp0], 0x0b \n\t"
- "mtc1 %[tmp0], %[ftmp12] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
"punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
"punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
"psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
@@ -842,8 +889,8 @@
/* ftmp6: ps0 */
"paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
- "li %[tmp0], 0x07 \n\t"
- "mtc1 %[tmp0], %[ftmp12] \n\t"
+ "dli %[tmp0], 0x07 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
VP8_MBLOOP_VPSRAB_ADDH
"paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t"
"paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t"
@@ -948,17 +995,19 @@
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
[tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr),
- [count]"+&r"(count)
+ [count]"+&r"(count),
+ [ff_ph_003f]"=&f"(ff_ph_003f), [ff_ph_0900]"=&f"(ff_ph_0900),
+ [ff_pb_03]"=&f"(ff_pb_03), [ff_pb_04]"=&f"(ff_pb_04),
+ [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_fe]"=&f"(ff_pb_fe)
: [limit]"r"(limit), [blimit]"r"(blimit),
[srct]"r"(srct), [thresh]"r"(thresh),
- [src_pixel_step]"r"((mips_reg)src_pixel_step),
- [ff_ph_003f]"f"(ff_ph_003f), [ff_ph_0900]"f"(ff_ph_0900),
- [ff_pb_03]"f"(ff_pb_03), [ff_pb_04]"f"(ff_pb_04),
- [ff_pb_80]"f"(ff_pb_80), [ff_pb_fe]"f"(ff_pb_fe)
+ [src_pixel_step]"r"((mips_reg)src_pixel_step)
: "memory"
);
+ /* clang-format on */
}
+/* clang-format off */
#define VP8_SIMPLE_HPSRAB \
"psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \
"psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \
@@ -966,23 +1015,38 @@
"psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \
"psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \
"por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+/* clang-format on */
void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
int src_pixel_step,
const unsigned char *blimit) {
- uint32_t tmp[1], count = 2;
+ uint64_t tmp[1], count = 2;
mips_reg addr[2];
double ftmp[12];
+ double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
+ /* clang-format off */
__asm__ volatile (
- "li %[tmp0], 0x08 \n\t"
- "mtc1 %[tmp0], %[ftmp8] \n\t"
- "li %[tmp0], 0x03 \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
- "li %[tmp0], 0x0b \n\t"
- "mtc1 %[tmp0], %[ftmp10] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
+ "dli %[tmp0], 0x08 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
+ "dli %[tmp0], 0x03 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0101010101010101 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_01] \n\t"
"1: \n\t"
"gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t"
@@ -996,7 +1060,7 @@
"gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
"gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
"pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t"
- "and %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t"
+ "pand %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t"
"psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
@@ -1020,7 +1084,7 @@
"paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
"paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
- "and %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+ "pand %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
"paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t"
VP8_SIMPLE_HPSRAB
@@ -1048,30 +1112,43 @@
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
- [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01)
: [blimit]"r"(blimit),
[src_pixel_step]"r"((mips_reg)src_pixel_step),
- [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
- [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80),
- [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01)
+ [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1))
: "memory"
);
+ /* clang-format on */
}
void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
int src_pixel_step,
const unsigned char *blimit) {
- uint32_t tmp[1], count = 2;
+ uint64_t tmp[1], count = 2;
mips_reg addr[2];
- DECLARE_ALIGNED(8, const uint64_t, srct[1]);
- double ftmp[12];
+ DECLARE_ALIGNED(8, const uint64_t, srct[2]);
+ double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
+ /* clang-format off */
__asm__ volatile (
- "li %[tmp0], 0x08 \n\t"
- "mtc1 %[tmp0], %[ftmp8] \n\t"
- "li %[tmp0], 0x20 \n\t"
- "mtc1 %[tmp0], %[ftmp10] \n\t"
-
+ "dli %[tmp0], 0x08 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
+ "dli %[tmp0], 0x20 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0x08 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
+ "dli %[tmp0], 0x20 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0101010101010101 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_01] \n\t"
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4])
MMI_SUBU(%[src_ptr], %[src_ptr], 0x02)
@@ -1118,8 +1195,8 @@
"punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t"
"punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
- "li %[tmp0], 0x01 \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
"pand %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t"
"psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
@@ -1149,14 +1226,14 @@
"pand %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
"paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t"
- "li %[tmp0], 0x03 \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x03 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
"psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
- "li %[tmp0], 0x0b \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t"
"psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
@@ -1164,14 +1241,14 @@
"pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
"psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t"
- "li %[tmp0], 0x03 \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x03 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t"
"psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
"psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
- "li %[tmp0], 0x0b \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
"psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
"por %[ftmp0], %[ftmp0], %[ftmp5] \n\t"
@@ -1235,16 +1312,17 @@
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
- [src_ptr]"+&r"(src_ptr), [count]"+&r"(count)
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01)
: [blimit]"r"(blimit), [srct]"r"(srct),
[src_pixel_step]"r"((mips_reg)src_pixel_step),
[src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
[src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
- [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)),
- [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80),
- [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01)
+ [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3))
: "memory"
);
+ /* clang-format on */
}
/* Horizontal MB filtering */
--- a/vp8/common/mips/mmi/sixtap_filter_mmi.c
+++ b/vp8/common/mips/mmi/sixtap_filter_mmi.c
@@ -70,9 +70,8 @@
unsigned int output_height,
unsigned int output_width,
const int16_t *vp8_filter) {
- uint32_t tmp[1];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-
+ uint64_t tmp[1];
+ double ff_ph_40;
#if _MIPS_SIM == _ABIO32
register double fzero asm("$f0");
register double ftmp0 asm("$f2");
@@ -103,7 +102,10 @@
register double ftmp11 asm("$f12");
#endif // _MIPS_SIM == _ABIO32
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0x0040004000400040 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_40] \n\t"
"ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t"
"ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t"
"ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t"
@@ -111,10 +113,10 @@
"ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t"
"ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t"
"pxor %[fzero], %[fzero], %[fzero] \n\t"
- "li %[tmp0], 0x07 \n\t"
- "mtc1 %[tmp0], %[ftmp7] \n\t"
- "li %[tmp0], 0x08 \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dli %[tmp0], 0x07 \n\t"
+ "dmtc1 %[tmp0], %[ftmp7] \n\t"
+ "dli %[tmp0], 0x08 \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
"1: \n\t"
"gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t"
@@ -166,12 +168,12 @@
[ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10),
[ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]),
[output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height),
- [src_ptr]"+&r"(src_ptr)
+ [src_ptr]"+&r"(src_ptr), [ff_ph_40]"=&f"(ff_ph_40)
: [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
- [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width),
- [ff_ph_40]"f"(ff_ph_40)
+ [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width)
: "memory"
);
+ /* clang-format on */
}
/* Horizontal filter: pixel_step is always W */
@@ -178,9 +180,10 @@
static INLINE void vp8_filter_block1dc_v6_mmi(
uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) {
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
- uint32_t tmp[1];
+ double ff_ph_40;
+ uint64_t tmp[1];
mips_reg addr[1];
+
#if _MIPS_SIM == _ABIO32
register double fzero asm("$f0");
register double ftmp0 asm("$f2");
@@ -215,7 +218,10 @@
register double ftmp13 asm("$f14");
#endif // _MIPS_SIM == _ABIO32
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0x0040004000400040 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_40] \n\t"
"ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t"
"ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t"
"ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t"
@@ -223,8 +229,8 @@
"ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t"
"ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t"
"pxor %[fzero], %[fzero], %[fzero] \n\t"
- "li %[tmp0], 0x07 \n\t"
- "mtc1 %[tmp0], %[ftmp13] \n\t"
+ "dli %[tmp0], 0x07 \n\t"
+ "dmtc1 %[tmp0], %[ftmp13] \n\t"
/* In order to make full use of memory load delay slot,
* Operation of memory loading and calculating has been rearranged.
@@ -285,15 +291,16 @@
[ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12),
[ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr),
- [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height)
+ [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height),
+ [ff_ph_40]"=&f"(ff_ph_40)
: [pixels_per_line]"r"((mips_reg)pixels_per_line),
[pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)),
[pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)),
[vp8_filter]"r"(vp8_filter),
- [output_pitch]"r"((mips_reg)output_pitch),
- [ff_ph_40]"f"(ff_ph_40)
+ [output_pitch]"r"((mips_reg)output_pitch)
: "memory"
);
+ /* clang-format on */
}
/* When xoffset == 0, vp8_filter= {0,0,128,0,0,0},
@@ -313,6 +320,7 @@
register double ftmp1 asm("$f2");
#endif // _MIPS_SIM == _ABIO32
+ /* clang-format off */
__asm__ volatile (
"pxor %[fzero], %[fzero], %[fzero] \n\t"
@@ -335,6 +343,7 @@
[output_width]"r"(output_width)
: "memory"
);
+ /* clang-format on */
}
static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
@@ -350,6 +359,7 @@
register double ftmp1 asm("$f2");
#endif // _MIPS_SIM == _ABIO32
+ /* clang-format on */
__asm__ volatile (
"pxor %[fzero], %[fzero], %[fzero] \n\t"
@@ -371,6 +381,7 @@
[output_pitch]"r"((mips_reg)output_pitch)
: "memory"
);
+ /* clang-format on */
}
#define sixtapNxM(n, m) \
--- a/vp8/encoder/mips/mmi/dct_mmi.c
+++ b/vp8/encoder/mips/mmi/dct_mmi.c
@@ -46,6 +46,7 @@
void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
uint64_t tmp[1];
int16_t *ip = input;
+ double ff_ph_op1, ff_ph_op3;
#if _MIPS_SIM == _ABIO32
register double ftmp0 asm("$f0");
@@ -83,13 +84,16 @@
DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL };
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL };
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0x14e808a914e808a9 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_op1] \n\t"
+ "dli %[tmp0], 0xeb1808a9eb1808a9 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_op3] \n\t"
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
@@ -129,7 +133,7 @@
// op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
MMI_LI(%[tmp0], 0x0c)
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
"ldc1 %[ftmp12], %[ff_pw_14500] \n\t"
"punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
"pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t"
@@ -169,7 +173,7 @@
"paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
MMI_LI(%[tmp0], 0x04)
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
"psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
@@ -211,9 +215,9 @@
[ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
[ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
[ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
- [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip)
+ [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip),
+ [ff_ph_op1] "=&f"(ff_ph_op1), [ff_ph_op3] "=&f"(ff_ph_op3)
: [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07),
- [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3),
[ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500),
[ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000),
[ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217),
@@ -220,6 +224,7 @@
[ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output)
: "memory"
);
+ /* clang-format on */
}
void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
@@ -228,17 +233,22 @@
}
void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
- double ftmp[13];
- uint32_t tmp[1];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
- DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL };
- DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL };
- DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL };
+ double ftmp[13], ff_ph_01, ff_pw_01, ff_pw_03, ff_pw_mask;
+ uint64_t tmp[1];
+ /* clang-format off */
__asm__ volatile (
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
+ "dli %[tmp0], 0x0000000100000001 \n\t"
+ "dmtc1 %[tmp0], %[ff_pw_01] \n\t"
+ "dli %[tmp0], 0x0000000300000003 \n\t"
+ "dmtc1 %[tmp0], %[ff_pw_03] \n\t"
+ "dli %[tmp0], 0x0001000000010000 \n\t"
+ "dmtc1 %[tmp0], %[ff_pw_mask] \n\t"
MMI_LI(%[tmp0], 0x02)
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
"gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
@@ -337,7 +347,7 @@
"psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t"
MMI_LI(%[tmp0], 0x03)
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
"pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t"
"pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
@@ -393,7 +403,7 @@
"packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
MMI_LI(%[tmp0], 0x72)
- "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
"pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
"pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
"pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
@@ -413,13 +423,12 @@
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
- [ftmp12]"=&f"(ftmp[12]),
- [tmp0]"=&r"(tmp[0]),
- [ip]"+&r"(input)
- : [op]"r"(output),
- [ff_pw_01]"f"(ff_pw_01), [pitch]"r"((mips_reg)pitch),
- [ff_pw_03]"f"(ff_pw_03), [ff_pw_mask]"f"(ff_pw_mask),
- [ff_ph_01]"f"(ff_ph_01)
+ [ftmp12]"=&f"(ftmp[12]), [ff_pw_mask]"=&f"(ff_pw_mask),
+ [tmp0]"=&r"(tmp[0]), [ff_pw_01]"=&f"(ff_pw_01),
+ [ip]"+&r"(input), [ff_pw_03]"=&f"(ff_pw_03),
+ [ff_ph_01]"=&f"(ff_ph_01)
+ : [op]"r"(output), [pitch]"r"((mips_reg)pitch)
: "memory"
);
+ /* clang-format on */
}
--- a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
+++ b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
@@ -42,16 +42,17 @@
double ftmp[13];
uint64_t tmp[1];
- DECLARE_ALIGNED(8, const uint64_t, ones) = { 0xffffffffffffffffULL };
- int eob = 0;
+ int64_t eob = 0;
+ double ones;
__asm__ volatile(
// loop 0 ~ 7
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pcmpeqh %[ones], %[ones], %[ones] \n\t"
"gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t"
"gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t"
- "li %[tmp0], 0x0f \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x0f \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"gsldlc1 %[ftmp2], 0x0f(%[coeff_ptr]) \n\t"
"gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t"
@@ -165,18 +166,18 @@
"gssdlc1 %[ftmp6], 0x1f(%[dqcoeff_ptr]) \n\t"
"gssdrc1 %[ftmp6], 0x18(%[dqcoeff_ptr]) \n\t"
- "li %[tmp0], 0x10 \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x10 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
"psrlw %[ftmp11], %[ftmp10], %[ftmp9] \n\t"
"pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
- "li %[tmp0], 0xaa \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0xaa \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"pshufh %[ftmp11], %[ftmp10], %[ftmp9] \n\t"
"pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
- "li %[tmp0], 0xffff \n\t"
- "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0xffff \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
"pand %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
"gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t"
"gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t"
@@ -184,7 +185,8 @@
[ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
[ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
[ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
- [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+ [tmp0] "=&r"(tmp[0]), [ones] "=&f"(ones)
: [coeff_ptr] "r"((mips_reg)coeff_ptr),
[qcoeff_ptr] "r"((mips_reg)qcoeff_ptr),
[dequant_ptr] "r"((mips_reg)dequant_ptr),
@@ -191,8 +193,7 @@
[round_ptr] "r"((mips_reg)round_ptr),
[quant_ptr] "r"((mips_reg)quant_ptr),
[dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr),
- [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob),
- [ones] "f"(ones)
+ [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob)
: "memory");
*d->eob = eob;
--- a/vpx_dsp/mips/sad_mmi.c
+++ b/vpx_dsp/mips/sad_mmi.c
@@ -364,6 +364,7 @@
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
@@ -383,6 +384,7 @@
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -405,7 +407,9 @@
unsigned int sad;
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
@@ -424,11 +428,12 @@
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
[src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -450,6 +455,7 @@
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
@@ -469,6 +475,7 @@
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -493,7 +500,9 @@
unsigned int sad;
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
@@ -512,11 +521,12 @@
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
[src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -539,6 +549,7 @@
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
@@ -558,6 +569,7 @@
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -586,7 +598,9 @@
unsigned int sad;
double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
"1: \n\t"
@@ -605,11 +619,12 @@
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
[src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -632,6 +647,7 @@
double ftmp1, ftmp2, ftmp3;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
"1: \n\t"
@@ -651,6 +667,7 @@
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -679,7 +696,9 @@
unsigned int sad;
double ftmp1, ftmp2, ftmp3;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
"1: \n\t"
@@ -697,11 +716,12 @@
"mfc1 %[sad], %[ftmp3] \n\t"
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -724,6 +744,7 @@
double ftmp1, ftmp2, ftmp3;
mips_reg l_counter = counter;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
"1: \n\t"
@@ -743,6 +764,7 @@
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
@@ -767,7 +789,9 @@
unsigned int sad;
double ftmp1, ftmp2, ftmp3;
mips_reg l_counter = counter;
+ mips_reg l_second_pred = (mips_reg)second_pred;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
"1: \n\t"
@@ -785,11 +809,12 @@
"mfc1 %[sad], %[ftmp3] \n\t"
: [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
[counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
- [second_pred]"+&r"((mips_reg)second_pred),
+ [second_pred]"+&r"(l_second_pred),
[sad]"=&r"(sad)
: [src_stride]"r"((mips_reg)src_stride),
[ref_stride]"r"((mips_reg)ref_stride)
);
+ /* clang-format on */
return sad;
}
--- a/vpx_dsp/mips/variance_mmi.c
+++ b/vpx_dsp/mips/variance_mmi.c
@@ -414,6 +414,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
@@ -496,6 +497,7 @@
[high]"r"(&high), [sse]"r"(sse)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (64 * high));
}
@@ -519,6 +521,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
@@ -577,6 +580,7 @@
[sse]"r"(sse)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / 2048);
}
@@ -590,6 +594,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
@@ -653,6 +658,7 @@
[high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (32 * high));
}
@@ -676,6 +682,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
@@ -729,6 +736,7 @@
[high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (16 * high));
}
@@ -753,6 +761,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
@@ -801,6 +810,7 @@
[high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (8 * high));
}
@@ -825,6 +835,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp10] \n\t"
@@ -872,6 +883,7 @@
[high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
: "memory"
);
+ /* clang-format on */
return *sse - (((int64_t)sum * sum) / (4 * high));
}
@@ -894,6 +906,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
@@ -925,6 +938,7 @@
[high]"r"(&high), [sse]"r"(sse)
: "memory"
);
+ /* clang-format on */
return *sse;
}
@@ -947,6 +961,7 @@
*sse = 0;
+ /* clang-format off */
__asm__ volatile (
"li %[tmp0], 0x20 \n\t"
"mtc1 %[tmp0], %[ftmp11] \n\t"
@@ -978,6 +993,7 @@
[high]"r"(&high), [sse]"r"(sse)
: "memory"
);
+ /* clang-format on */
return *sse;
}
@@ -1021,22 +1037,39 @@
uint8_t *temp2_ptr = temp2;
mips_reg l_counter = counter;
double ftmp[15];
+ double ff_ph_40, mask;
+ double filter_x0, filter_x1, filter_y0, filter_y1;
mips_reg tmp[2];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
- DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+ uint64_t x0, x1, y0, y1, all;
const uint8_t *filter_x = bilinear_filters[x_offset];
const uint8_t *filter_y = bilinear_filters[y_offset];
+ x0 = (uint64_t)filter_x[0];
+ x1 = (uint64_t)filter_x[1];
+ y0 = (uint64_t)filter_y[0];
+ y1 = (uint64_t)filter_y[1];
+ all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_MTC1(%[all], %[ftmp14])
+ "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
+ "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x10)
+ MMI_MTC1(%[tmp0], %[mask])
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t"
MMI_LI(%[tmp0], 0x07)
MMI_MTC1(%[tmp0], %[ftmp14])
- "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
- "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
- "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
- "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
-
+ MMI_LI(%[tmp0], 0x0040004000400040)
+ MMI_MTC1(%[tmp0], %[ff_ph_40])
+ MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+ MMI_MTC1(%[tmp0], %[mask])
// fdata3: fdata3[0] ~ fdata3[15]
VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
@@ -1072,15 +1105,13 @@
[ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
[ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
[tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
- [counter]"+&r"(l_counter)
- : [filter_x0] "f"((uint64_t)filter_x[0]),
- [filter_x1] "f"((uint64_t)filter_x[1]),
- [filter_y0] "f"((uint64_t)filter_y[0]),
- [filter_y1] "f"((uint64_t)filter_y[1]),
- [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
- [mask] "f"(mask)
+ [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+ [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+ [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+ : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
: "memory"
);
+ /* clang-format on */
}
#define SUBPIX_VAR16XN(H) \
@@ -1105,19 +1136,38 @@
mips_reg l_counter = counter;
double ftmp[15];
mips_reg tmp[2];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
- DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+ double ff_ph_40, mask;
+ uint64_t x0, x1, y0, y1, all;
+ double filter_x0, filter_x1, filter_y0, filter_y1;
const uint8_t *filter_x = bilinear_filters[x_offset];
const uint8_t *filter_y = bilinear_filters[y_offset];
+ x0 = (uint64_t)filter_x[0];
+ x1 = (uint64_t)filter_x[1];
+ y0 = (uint64_t)filter_y[0];
+ y1 = (uint64_t)filter_y[1];
+ all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_MTC1(%[all], %[ftmp14])
+ "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
+ "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x10)
+ MMI_MTC1(%[tmp0], %[mask])
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t"
+ "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t"
+ "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
MMI_LI(%[tmp0], 0x07)
MMI_MTC1(%[tmp0], %[ftmp14])
- "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
- "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
- "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
- "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x0040004000400040)
+ MMI_MTC1(%[tmp0], %[ff_ph_40])
+ MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+ MMI_MTC1(%[tmp0], %[mask])
// fdata3: fdata3[0] ~ fdata3[7]
VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
@@ -1154,15 +1204,13 @@
[ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
[ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
[tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
- [counter]"+&r"(l_counter)
- : [filter_x0] "f"((uint64_t)filter_x[0]),
- [filter_x1] "f"((uint64_t)filter_x[1]),
- [filter_y0] "f"((uint64_t)filter_y[0]),
- [filter_y1] "f"((uint64_t)filter_y[1]),
- [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
- [mask] "f"(mask)
+ [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+ [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+ [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+ : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
: "memory"
);
+ /* clang-format on */
}
#define SUBPIX_VAR8XN(H) \
@@ -1188,19 +1236,38 @@
mips_reg l_counter = counter;
double ftmp[7];
mips_reg tmp[2];
- DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
- DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+ double ff_ph_40, mask;
+ uint64_t x0, x1, y0, y1, all;
+ double filter_x0, filter_x1, filter_y0, filter_y1;
const uint8_t *filter_x = bilinear_filters[x_offset];
const uint8_t *filter_y = bilinear_filters[y_offset];
+ x0 = (uint64_t)filter_x[0];
+ x1 = (uint64_t)filter_x[1];
+ y0 = (uint64_t)filter_y[0];
+ y1 = (uint64_t)filter_y[1];
+ all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+ /* clang-format off */
__asm__ volatile (
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ MMI_MTC1(%[all], %[ftmp6])
+ "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "pshufh %[filter_x0], %[ftmp6], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x10)
+ MMI_MTC1(%[tmp0], %[mask])
+ "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
+ "pshufh %[filter_x1], %[ftmp6], %[ftmp0] \n\t"
+ "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
+ "pshufh %[filter_y0], %[ftmp6], %[ftmp0] \n\t"
+ "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t"
+ "pshufh %[filter_y1], %[ftmp6], %[ftmp0] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
MMI_LI(%[tmp0], 0x07)
MMI_MTC1(%[tmp0], %[ftmp6])
- "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
- "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
- "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
- "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
+ MMI_LI(%[tmp0], 0x0040004000400040)
+ MMI_MTC1(%[tmp0], %[ff_ph_40])
+ MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+ MMI_MTC1(%[tmp0], %[mask])
// fdata3: fdata3[0] ~ fdata3[3]
VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
@@ -1232,15 +1299,14 @@
: [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
[ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
[ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr),
- [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter)
- : [filter_x0] "f"((uint64_t)filter_x[0]),
- [filter_x1] "f"((uint64_t)filter_x[1]),
- [filter_y0] "f"((uint64_t)filter_y[0]),
- [filter_y1] "f"((uint64_t)filter_y[1]),
- [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
- [mask] "f"(mask)
+ [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter),
+ [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+ [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+ [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+ : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
: "memory"
);
+ /* clang-format on */
}
#define SUBPIX_VAR4XN(H) \