/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
#define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_

#include <msa.h>

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"

#if HAVE_MSA
/* load macros */
#define LOAD_UB(psrc) *((const v16u8 *)(psrc))
#define LOAD_SB(psrc) *((const v16i8 *)(psrc))
#define LOAD_UH(psrc) *((const v8u16 *)(psrc))
#define LOAD_SH(psrc) *((const v8i16 *)(psrc))
#define LOAD_UW(psrc) *((const v4u32 *)(psrc))
#define LOAD_SW(psrc) *((const v4i32 *)(psrc))
#define LOAD_UD(psrc) *((const v2u64 *)(psrc))
#define LOAD_SD(psrc) *((const v2i64 *)(psrc))
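
/* Usage sketch (hypothetical pointer names): each load macro is a plain
 * vector dereference, which the compiler can turn into a single MSA load
 * (e.g. ld.b / ld.h):
 *   const uint8_t *src;    // pixel source
 *   const int16_t *coeff;  // halfword source
 *   v16u8 pix  = LOAD_UB(src);    // 16 unsigned bytes
 *   v8i16 filt = LOAD_SH(coeff);  // 8 signed halfwords
 */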

/* store macros */
#define STORE_UB(vec, pdest) *((v16u8 *)(pdest)) = (vec)
#define STORE_SB(vec, pdest) *((v16i8 *)(pdest)) = (vec)
#define STORE_UH(vec, pdest) *((v8u16 *)(pdest)) = (vec)
#define STORE_SH(vec, pdest) *((v8i16 *)(pdest)) = (vec)
#define STORE_UW(vec, pdest) *((v4u32 *)(pdest)) = (vec)
#define STORE_SW(vec, pdest) *((v4i32 *)(pdest)) = (vec)
#define STORE_UD(vec, pdest) *((v2u64 *)(pdest)) = (vec)
#define STORE_SD(vec, pdest) *((v2i64 *)(pdest)) = (vec)
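
/* Usage sketch (hypothetical names): a 16-byte block copy is simply
 *   STORE_UB(LOAD_UB(src), dst);
 */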

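/* Scalar load/store macros. MIPS release 6 dropped the lwl/lwr-style
 * unaligned instructions and instead expects plain lw/ld/sw/sd to handle
 * misaligned addresses, so the r6 variants below use those directly;
 * the pre-r6 variants use the ulw/uld/usw assembler macros.
 */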
#if (__mips_isa_rev >= 6)
#define LOAD_WORD(psrc) ({                         \
  const uint8_t *src_m = (const uint8_t *)(psrc);  \
  uint32_t val_m;                                  \
                                                   \
  __asm__ __volatile__ (                           \
      "lw  %[val_m],  %[src_m]  \n\t"              \
                                                   \
      : [val_m] "=r" (val_m)                       \
      : [src_m] "m" (*src_m)                       \
  );                                               \
                                                   \
  val_m;                                           \
})

#if (__mips == 64)
#define LOAD_DWORD(psrc) ({                        \
  const uint8_t *src_m = (const uint8_t *)(psrc);  \
  uint64_t val_m = 0;                              \
                                                   \
  __asm__ __volatile__ (                           \
      "ld  %[val_m],  %[src_m]  \n\t"              \
                                                   \
      : [val_m] "=r" (val_m)                       \
      : [src_m] "m" (*src_m)                       \
  );                                               \
                                                   \
  val_m;                                           \
})
#else  // !(__mips == 64)
#define LOAD_DWORD(psrc) ({                                      \
  const uint8_t *src1_m = (const uint8_t *)(psrc);               \
  const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4;         \
  uint32_t val0_m, val1_m;                                       \
  uint64_t genval_m = 0;                                         \
                                                                 \
  __asm__ __volatile__ (                                         \
      "lw  %[val0_m],  %[src1_m]  \n\t"                          \
                                                                 \
      : [val0_m] "=r" (val0_m)                                   \
      : [src1_m] "m" (*src1_m)                                   \
  );                                                             \
                                                                 \
  __asm__ __volatile__ (                                         \
      "lw  %[val1_m],  %[src2_m]  \n\t"                          \
                                                                 \
      : [val1_m] "=r" (val1_m)                                   \
      : [src2_m] "m" (*src2_m)                                   \
  );                                                             \
                                                                 \
  genval_m = (uint64_t)(val1_m);                                 \
  genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000);  \
  genval_m = (uint64_t)(genval_m | (uint64_t)val0_m);            \
                                                                 \
  genval_m;                                                      \
})
#endif  // (__mips == 64)

#define STORE_WORD_WITH_OFFSET_1(pdst, val) {    \
  uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1;  \
  const uint32_t val_m = (val);                  \
                                                 \
  __asm__ __volatile__ (                         \
      "sw  %[val_m],  %[dst_ptr_m]  \n\t"        \
                                                 \
      : [dst_ptr_m] "=m" (*dst_ptr_m)            \
      : [val_m] "r" (val_m)                      \
  );                                             \
}

#define STORE_WORD(pdst, val) {            \
  uint8_t *dst_ptr_m = (uint8_t *)(pdst);  \
  const uint32_t val_m = (val);            \
                                           \
  __asm__ __volatile__ (                   \
      "sw  %[val_m],  %[dst_ptr_m]  \n\t"  \
                                           \
      : [dst_ptr_m] "=m" (*dst_ptr_m)      \
      : [val_m] "r" (val_m)                \
  );                                       \
}

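/* Note: 'sd' below is a 64-bit instruction, so this r6 variant of
 * STORE_DWORD assumes a MIPS64 target (the pre-r6 branch provides a
 * two-word fallback for 32-bit builds).
 */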
#define STORE_DWORD(pdst, val) {           \
  uint8_t *dst_ptr_m = (uint8_t *)(pdst);  \
  const uint64_t val_m = (val);            \
                                           \
  __asm__ __volatile__ (                   \
      "sd  %[val_m],  %[dst_ptr_m]  \n\t"  \
                                           \
      : [dst_ptr_m] "=m" (*dst_ptr_m)      \
      : [val_m] "r" (val_m)                \
  );                                       \
}
#else  // !(__mips_isa_rev >= 6)
#define LOAD_WORD(psrc) ({                         \
  const uint8_t *src_m = (const uint8_t *)(psrc);  \
  uint32_t val_m;                                  \
                                                   \
  __asm__ __volatile__ (                           \
      "ulw  %[val_m],  %[src_m]  \n\t"             \
                                                   \
      : [val_m] "=r" (val_m)                       \
      : [src_m] "m" (*src_m)                       \
  );                                               \
                                                   \
  val_m;                                           \
})

#if (__mips == 64)
#define LOAD_DWORD(psrc) ({                        \
  const uint8_t *src_m = (const uint8_t *)(psrc);  \
  uint64_t val_m = 0;                              \
                                                   \
  __asm__ __volatile__ (                           \
      "uld  %[val_m],  %[src_m]  \n\t"             \
                                                   \
      : [val_m] "=r" (val_m)                       \
      : [src_m] "m" (*src_m)                       \
  );                                               \
                                                   \
  val_m;                                           \
})
#else  // !(__mips == 64)
#define LOAD_DWORD(psrc) ({                                      \
  const uint8_t *src1_m = (const uint8_t *)(psrc);               \
  const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4;         \
  uint32_t val0_m, val1_m;                                       \
  uint64_t genval_m = 0;                                         \
                                                                 \
  __asm__ __volatile__ (                                         \
      "ulw  %[val0_m],  %[src1_m]  \n\t"                         \
                                                                 \
      : [val0_m] "=r" (val0_m)                                   \
      : [src1_m] "m" (*src1_m)                                   \
  );                                                             \
                                                                 \
  __asm__ __volatile__ (                                         \
      "ulw  %[val1_m],  %[src2_m]  \n\t"                         \
                                                                 \
      : [val1_m] "=r" (val1_m)                                   \
      : [src2_m] "m" (*src2_m)                                   \
  );                                                             \
                                                                 \
  genval_m = (uint64_t)(val1_m);                                 \
  genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000);  \
  genval_m = (uint64_t)(genval_m | (uint64_t)val0_m);            \
                                                                 \
  genval_m;                                                      \
})
#endif  // (__mips == 64)

#define STORE_WORD_WITH_OFFSET_1(pdst, val) {    \
  uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1;  \
  const uint32_t val_m = (val);                  \
                                                 \
  __asm__ __volatile__ (                         \
      "usw  %[val_m],  %[dst_ptr_m]  \n\t"       \
                                                 \
      : [dst_ptr_m] "=m" (*dst_ptr_m)            \
      : [val_m] "r" (val_m)                      \
  );                                             \
}

#define STORE_WORD(pdst, val) {             \
  uint8_t *dst_ptr_m = (uint8_t *)(pdst);   \
  const uint32_t val_m = (val);             \
                                            \
  __asm__ __volatile__ (                    \
      "usw  %[val_m],  %[dst_ptr_m]  \n\t"  \
                                            \
      : [dst_ptr_m] "=m" (*dst_ptr_m)       \
      : [val_m] "r" (val_m)                 \
  );                                        \
}

#define STORE_DWORD(pdst, val) {                            \
  uint8_t *dst1_m = (uint8_t *)(pdst);                      \
  uint8_t *dst2_m = ((uint8_t *)(pdst)) + 4;                \
  uint32_t val0_m, val1_m;                                  \
                                                            \
  val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);          \
  val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF);  \
                                                            \
  __asm__ __volatile__ (                                    \
      "usw  %[val0_m],  %[dst1_m]  \n\t"                    \
      "usw  %[val1_m],  %[dst2_m]  \n\t"                    \
                                                            \
      : [dst1_m] "=m" (*dst1_m), [dst2_m] "=m" (*dst2_m)    \
      : [val0_m] "r" (val0_m), [val1_m] "r" (val1_m)        \
  );                                                        \
}
#endif  // (__mips_isa_rev >= 6)
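
/* Usage sketch (hypothetical names): spill one 32-bit lane of a vector
 * to a possibly misaligned destination:
 *   uint32_t w = __msa_copy_u_w((v4i32)vec, 0);
 *   STORE_WORD(dst, w);
 */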

#define LOAD_2VECS_UB(psrc, stride,   \
                      val0, val1) {   \
  val0 = LOAD_UB(psrc + 0 * stride);  \
  val1 = LOAD_UB(psrc + 1 * stride);  \
}

#define LOAD_4VECS_UB(psrc, stride,              \
                      val0, val1, val2, val3) {  \
  val0 = LOAD_UB(psrc + 0 * stride);             \
  val1 = LOAD_UB(psrc + 1 * stride);             \
  val2 = LOAD_UB(psrc + 2 * stride);             \
  val3 = LOAD_UB(psrc + 3 * stride);             \
}

#define LOAD_4VECS_SB(psrc, stride,              \
                      val0, val1, val2, val3) {  \
  val0 = LOAD_SB(psrc + 0 * stride);             \
  val1 = LOAD_SB(psrc + 1 * stride);             \
  val2 = LOAD_SB(psrc + 2 * stride);             \
  val3 = LOAD_SB(psrc + 3 * stride);             \
}

#define LOAD_5VECS_UB(psrc, stride,                    \
                      out0, out1, out2, out3, out4) {  \
  LOAD_4VECS_UB((psrc), (stride),                      \
                (out0), (out1), (out2), (out3));       \
  out4 = LOAD_UB(psrc + 4 * stride);                   \
}

#define LOAD_5VECS_SB(psrc, stride,                    \
                      out0, out1, out2, out3, out4) {  \
  LOAD_4VECS_SB((psrc), (stride),                      \
                (out0), (out1), (out2), (out3));       \
  out4 = LOAD_SB(psrc + 4 * stride);                   \
}

#define LOAD_7VECS_SB(psrc, stride,            \
                      val0, val1, val2, val3,  \
                      val4, val5, val6) {      \
  val0 = LOAD_SB((psrc) + 0 * (stride));       \
  val1 = LOAD_SB((psrc) + 1 * (stride));       \
  val2 = LOAD_SB((psrc) + 2 * (stride));       \
  val3 = LOAD_SB((psrc) + 3 * (stride));       \
  val4 = LOAD_SB((psrc) + 4 * (stride));       \
  val5 = LOAD_SB((psrc) + 5 * (stride));       \
  val6 = LOAD_SB((psrc) + 6 * (stride));       \
}

#define LOAD_8VECS_UB(psrc, stride,               \
                      out0, out1, out2, out3,     \
                      out4, out5, out6, out7) {   \
  LOAD_4VECS_UB((psrc), (stride),                 \
                (out0), (out1), (out2), (out3));  \
  LOAD_4VECS_UB((psrc + 4 * stride), (stride),    \
                (out4), (out5), (out6), (out7));  \
}

#define LOAD_8VECS_SB(psrc, stride,               \
                      out0, out1, out2, out3,     \
                      out4, out5, out6, out7) {   \
  LOAD_4VECS_SB((psrc), (stride),                 \
                (out0), (out1), (out2), (out3));  \
  LOAD_4VECS_SB((psrc + 4 * stride), (stride),    \
                (out4), (out5), (out6), (out7));  \
}

#define LOAD_2VECS_SH(psrc, stride,       \
                      val0, val1) {       \
  val0 = LOAD_SH((psrc) + 0 * (stride));  \
  val1 = LOAD_SH((psrc) + 1 * (stride));  \
}

#define LOAD_4VECS_SH(psrc, stride,                          \
                      val0, val1, val2, val3) {              \
  LOAD_2VECS_SH((psrc), (stride), val0, val1);               \
  LOAD_2VECS_SH((psrc + 2 * stride), (stride), val2, val3);  \
}

#define LOAD_8VECS_SH(psrc, stride,              \
                      val0, val1, val2, val3,    \
                      val4, val5, val6, val7) {  \
  LOAD_4VECS_SH((psrc), (stride),                \
                val0, val1, val2, val3);         \
  LOAD_4VECS_SH((psrc + 4 * stride), (stride),   \
                val4, val5, val6, val7);         \
}

#define LOAD_16VECS_SH(psrc, stride,                  \
                       val0, val1, val2, val3,        \
                       val4, val5, val6, val7,        \
                       val8, val9, val10, val11,      \
                       val12, val13, val14, val15) {  \
  LOAD_8VECS_SH((psrc), (stride),                     \
                val0, val1, val2, val3,               \
                val4, val5, val6, val7);              \
  LOAD_8VECS_SH((psrc + 8 * (stride)), (stride),      \
                val8, val9, val10, val11,             \
                val12, val13, val14, val15);          \
}
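
/* Usage sketch (hypothetical names): the strided loaders gather one
 * vector per row, e.g. an 8-row block of pixels:
 *   v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
 *   LOAD_8VECS_UB(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
 */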

#define STORE_4VECS_UB(dst_out, pitch,         \
                       in0, in1, in2, in3) {   \
  STORE_UB((in0), (dst_out));                  \
  STORE_UB((in1), ((dst_out) + (pitch)));      \
  STORE_UB((in2), ((dst_out) + 2 * (pitch)));  \
  STORE_UB((in3), ((dst_out) + 3 * (pitch)));  \
}

#define STORE_8VECS_UB(dst_out, pitch_in,               \
                       in0, in1, in2, in3,              \
                       in4, in5, in6, in7) {            \
  STORE_4VECS_UB(dst_out, pitch_in,                     \
                 in0, in1, in2, in3);                   \
  STORE_4VECS_UB((dst_out + 4 * (pitch_in)), pitch_in,  \
                 in4, in5, in6, in7);                   \
}

#define VEC_INSERT_4W_UB(src, src0, src1, src2, src3) {  \
  src = (v16u8)__msa_insert_w((v4i32)(src), 0, (src0));  \
  src = (v16u8)__msa_insert_w((v4i32)(src), 1, (src1));  \
  src = (v16u8)__msa_insert_w((v4i32)(src), 2, (src2));  \
  src = (v16u8)__msa_insert_w((v4i32)(src), 3, (src3));  \
}
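
/* Usage sketch (hypothetical names): assemble four 4-byte rows into a
 * single vector, e.g. for a 4x4 block:
 *   v16u8 rows = { 0 };
 *   VEC_INSERT_4W_UB(rows,
 *                    LOAD_WORD(src + 0 * stride),
 *                    LOAD_WORD(src + 1 * stride),
 *                    LOAD_WORD(src + 2 * stride),
 *                    LOAD_WORD(src + 3 * stride));
 */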

#define VEC_INSERT_2DW_UB(src, src0, src1) {             \
  src = (v16u8)__msa_insert_d((v2i64)(src), 0, (src0));  \
  src = (v16u8)__msa_insert_d((v2i64)(src), 1, (src1));  \
}

#define STORE_4VECS_SH(ptr, stride,           \
                       in0, in1, in2, in3) {  \
  STORE_SH(in0, ((ptr) + 0 * stride));        \
  STORE_SH(in1, ((ptr) + 1 * stride));        \
  STORE_SH(in2, ((ptr) + 2 * stride));        \
  STORE_SH(in3, ((ptr) + 3 * stride));        \
}

#define STORE_8VECS_SH(ptr, stride,           \
                       in0, in1, in2, in3,    \
                       in4, in5, in6, in7) {  \
  STORE_SH(in0, ((ptr) + 0 * stride));        \
  STORE_SH(in1, ((ptr) + 1 * stride));        \
  STORE_SH(in2, ((ptr) + 2 * stride));        \
  STORE_SH(in3, ((ptr) + 3 * stride));        \
  STORE_SH(in4, ((ptr) + 4 * stride));        \
  STORE_SH(in5, ((ptr) + 5 * stride));        \
  STORE_SH(in6, ((ptr) + 6 * stride));        \
  STORE_SH(in7, ((ptr) + 7 * stride));        \
}

#define CLIP_UNSIGNED_CHAR_H(in) ({                   \
  v8i16 max_m = __msa_ldi_h(255);                     \
  v8i16 out_m;                                        \
                                                      \
  out_m = __msa_maxi_s_h((v8i16)(in), 0);             \
  out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m);  \
  out_m;                                              \
})
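
/* Example (hypothetical name): after adding a residual to predicted
 * pixels, each signed halfword lane is clamped back to the pixel range
 * [0, 255]:
 *   v8i16 recon = CLIP_UNSIGNED_CHAR_H(pred_plus_residual);
 */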

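/* Transposes an 8x4 block (the low four halfwords of in0..in7) into
 * out0..out3; out4..out7 are cleared to zero.
 */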
#define TRANSPOSE4X8_H(in0, in1, in2, in3,                   \
                       in4, in5, in6, in7,                   \
                       out0, out1, out2, out3,               \
                       out4, out5, out6, out7) {             \
  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                      \
  v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                      \
  v8i16 zero_m = { 0 };                                      \
                                                             \
  tmp0_n = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0));         \
  tmp1_n = __msa_ilvr_h((v8i16)(in3), (v8i16)(in2));         \
  tmp2_n = __msa_ilvr_h((v8i16)(in5), (v8i16)(in4));         \
  tmp3_n = __msa_ilvr_h((v8i16)(in7), (v8i16)(in6));         \
                                                             \
  ILV_W_LRLR_SH((tmp0_n), (tmp1_n), (tmp2_n), (tmp3_n),      \
                tmp2_m, tmp0_m, tmp3_m, tmp1_m);             \
                                                             \
  out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);  \
  out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);  \
  out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m);  \
  out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);  \
                                                             \
  out4 = zero_m;                                             \
  out5 = zero_m;                                             \
  out6 = zero_m;                                             \
  out7 = zero_m;                                             \
}

#define TRANSPOSE8X4_H(in0, in1, in2, in3,        \
                       out0, out1, out2, out3) {  \
  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;           \
                                                  \
  ILV_H_LRLR_SH((in0), (in1), (in2), (in3),       \
                tmp2_m, tmp0_m, tmp3_m, tmp1_m);  \
                                                  \
  ILV_W_LRLR_SH(tmp0_m, tmp1_m, tmp2_m, tmp3_m,   \
                out1, out0, out3, out2);          \
}

/* halfword 8x8 transpose macro */
#define TRANSPOSE8x8_H_SH(in0, in1, in2, in3,                 \
                          in4, in5, in6, in7,                 \
                          out0, out1, out2, out3,             \
                          out4, out5, out6, out7) {           \
  v8i16 s0_m, s1_m;                                           \
  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                       \
  v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                       \
                                                              \
  s0_m = __msa_ilvr_h((v8i16)(in6), (v8i16)(in4));            \
  s1_m = __msa_ilvr_h((v8i16)(in7), (v8i16)(in5));            \
  tmp0_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m);            \
  tmp1_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m);            \
                                                              \
  s0_m = __msa_ilvl_h((v8i16)(in6), (v8i16)(in4));            \
  s1_m = __msa_ilvl_h((v8i16)(in7), (v8i16)(in5));            \
  tmp2_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m);            \
  tmp3_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m);            \
                                                              \
  s0_m = __msa_ilvr_h((v8i16)(in2), (v8i16)(in0));            \
  s1_m = __msa_ilvr_h((v8i16)(in3), (v8i16)(in1));            \
  tmp4_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m);            \
  tmp5_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m);            \
                                                              \
  s0_m = __msa_ilvl_h((v8i16)(in2), (v8i16)(in0));            \
  s1_m = __msa_ilvl_h((v8i16)(in3), (v8i16)(in1));            \
  tmp6_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m);            \
  tmp7_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m);            \
                                                              \
  out0 = (v8i16)__msa_pckev_d((v2i64)tmp0_m, (v2i64)tmp4_m);  \
  out1 = (v8i16)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m);  \
  out2 = (v8i16)__msa_pckev_d((v2i64)tmp1_m, (v2i64)tmp5_m);  \
  out3 = (v8i16)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m);  \
  out4 = (v8i16)__msa_pckev_d((v2i64)tmp2_m, (v2i64)tmp6_m);  \
  out5 = (v8i16)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m);  \
  out6 = (v8i16)__msa_pckev_d((v2i64)tmp3_m, (v2i64)tmp7_m);  \
  out7 = (v8i16)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m);  \
}

/* interleave macros */
/* no in-place support: output operands must be distinct from the inputs */
#define ILV_B_LRLR_UB(in0, in1, in2, in3,                  \
                      out0, out1, out2, out3) {            \
  out0 = (v16u8)__msa_ilvl_b((v16i8)(in1), (v16i8)(in0));  \
  out1 = (v16u8)__msa_ilvr_b((v16i8)(in1), (v16i8)(in0));  \
  out2 = (v16u8)__msa_ilvl_b((v16i8)(in3), (v16i8)(in2));  \
  out3 = (v16u8)__msa_ilvr_b((v16i8)(in3), (v16i8)(in2));  \
}

#define ILV_H_LRLR_SH(in0, in1, in2, in3,           \
                      out0, out1, out2, out3) {     \
  out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0));  \
  out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0));  \
  out2 = __msa_ilvl_h((v8i16)(in3), (v8i16)(in2));  \
  out3 = __msa_ilvr_h((v8i16)(in3), (v8i16)(in2));  \
}

#define ILV_W_LRLR_SH(in0, in1, in2, in3,                  \
                      out0, out1, out2, out3) {            \
  out0 = (v8i16)__msa_ilvl_w((v4i32)(in1), (v4i32)(in0));  \
  out1 = (v8i16)__msa_ilvr_w((v4i32)(in1), (v4i32)(in0));  \
  out2 = (v8i16)__msa_ilvl_w((v4i32)(in3), (v4i32)(in2));  \
  out3 = (v8i16)__msa_ilvr_w((v4i32)(in3), (v4i32)(in2));  \
}

#define ILV_H_LR_SH(in0, in1, out0, out1) {         \
  out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0));  \
  out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0));  \
}
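
/* Lane-level view: with in0 = {a0..a7} and in1 = {b0..b7}, ILV_H_LR_SH
 * produces
 *   out0 (ilvl) = {a4, b4, a5, b5, a6, b6, a7, b7}
 *   out1 (ilvr) = {a0, b0, a1, b1, a2, b2, a3, b3}
 */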

#define ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l,            \
                        out0, out1) {                          \
  out0 = (v16u8)__msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r));  \
  out1 = (v16u8)__msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r));  \
}

#define ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l,     \
                        out0, out1) {                   \
  out0 = __msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r));  \
  out1 = __msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r));  \
}

#define ILVR_B_4VECS_UB(in0_r, in1_r, in2_r, in3_r,  \
                        in0_l, in1_l, in2_l, in3_l,  \
                        out0, out1, out2, out3) {    \
  ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l,        \
                  out0, out1);                       \
  ILVR_B_2VECS_UB(in2_r, in3_r, in2_l, in3_l,        \
                  out2, out3);                       \
}

#define ILVR_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r,  \
                        in0_l, in1_l, in2_l, in3_l,  \
                        out0, out1, out2, out3) {    \
  ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l,        \
                  out0, out1);                       \
  ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l,        \
                  out2, out3);                       \
}

#define ILVR_B_6VECS_SB(in0_r, in1_r, in2_r,   \
                        in3_r, in4_r, in5_r,   \
                        in0_l, in1_l, in2_l,   \
                        in3_l, in4_l, in5_l,   \
                        out0, out1, out2,      \
                        out3, out4, out5) {    \
  ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l,  \
                  out0, out1);                 \
  ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l,  \
                  out2, out3);                 \
  ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l,  \
                  out4, out5);                 \
}

#define ILVR_B_8VECS_SB(in0_r, in1_r, in2_r, in3_r,  \
                        in4_r, in5_r, in6_r, in7_r,  \
                        in0_l, in1_l, in2_l, in3_l,  \
                        in4_l, in5_l, in6_l, in7_l,  \
                        out0, out1, out2, out3,      \
                        out4, out5, out6, out7) {    \
  ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l,        \
                  out0, out1);                       \
  ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l,        \
                  out2, out3);                       \
  ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l,        \
                  out4, out5);                       \
  ILVR_B_2VECS_SB(in6_r, in7_r, in6_l, in7_l,        \
                  out6, out7);                       \
}

#define ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l,     \
                        out0, out1) {                   \
  out0 = __msa_ilvl_b((v16i8)(in0_l), (v16i8)(in0_r));  \
  out1 = __msa_ilvl_b((v16i8)(in1_l), (v16i8)(in1_r));  \
}

#define ILVL_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r,  \
                        in0_l, in1_l, in2_l, in3_l,  \
                        out0, out1, out2, out3) {    \
  ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l,        \
                  out0, out1);                       \
  ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l,        \
                  out2, out3);                       \
}

#define ILVL_B_6VECS_SB(in0_r, in1_r, in2_r,   \
                        in3_r, in4_r, in5_r,   \
                        in0_l, in1_l, in2_l,   \
                        in3_l, in4_l, in5_l,   \
                        out0, out1, out2,      \
                        out3, out4, out5) {    \
  ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l,  \
                  out0, out1);                 \
  ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l,  \
                  out2, out3);                 \
  ILVL_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l,  \
                  out4, out5);                 \
}

#define ILVR_D_2VECS_SB(out0, in0_l, in0_r,                    \
                        out1, in1_l, in1_r) {                  \
  out0 = (v16i8)__msa_ilvr_d((v2i64)(in0_l), (v2i64)(in0_r));  \
  out1 = (v16i8)__msa_ilvr_d((v2i64)(in1_l), (v2i64)(in1_r));  \
}

#define ILVR_D_3VECS_SB(out0, in0_l, in0_r,                    \
                        out1, in1_l, in1_r,                    \
                        out2, in2_l, in2_r) {                  \
  ILVR_D_2VECS_SB(out0, in0_l, in0_r,                          \
                  out1, in1_l, in1_r);                         \
  out2 = (v16i8)__msa_ilvr_d((v2i64)(in2_l), (v2i64)(in2_r));  \
}

#define ILVR_D_4VECS_SB(out0, in0_l, in0_r,    \
                        out1, in1_l, in1_r,    \
                        out2, in2_l, in2_r,    \
                        out3, in3_l, in3_r) {  \
  ILVR_D_2VECS_SB(out0, in0_l, in0_r,          \
                  out1, in1_l, in1_r);         \
  ILVR_D_2VECS_SB(out2, in2_l, in2_r,          \
                  out3, in3_l, in3_r);         \
}

#define DOTP_S_W_4VECS_SW(m0, c0, m1, c1,            \
                          m2, c2, m3, c3,            \
                          out0, out1, out2, out3) {  \
  out0 = __msa_dotp_s_w((v8i16)(m0), (v8i16)(c0));   \
  out1 = __msa_dotp_s_w((v8i16)(m1), (v8i16)(c1));   \
  out2 = __msa_dotp_s_w((v8i16)(m2), (v8i16)(c2));   \
  out3 = __msa_dotp_s_w((v8i16)(m3), (v8i16)(c3));   \
}

#define SPLATI_H_4VECS_SH(coeff, val0, val1, val2, val3,  \
                          out0, out1, out2, out3) {       \
  out0 = __msa_splati_h((v8i16)(coeff), (val0));          \
  out1 = __msa_splati_h((v8i16)(coeff), (val1));          \
  out2 = __msa_splati_h((v8i16)(coeff), (val2));          \
  out3 = __msa_splati_h((v8i16)(coeff), (val3));          \
}
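
/* Usage sketch (hypothetical names): broadcast four filter taps from a
 * coefficient vector, one tap per output vector:
 *   v8i16 filt = LOAD_SH(filter);
 *   v8i16 f0, f1, f2, f3;
 *   SPLATI_H_4VECS_SH(filt, 0, 1, 2, 3, f0, f1, f2, f3);
 */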

#define PCKEV_H_2VECS_SH(in0_l, in0_r, in1_l, in1_r,     \
                         out0, out1) {                   \
  out0 = __msa_pckev_h((v8i16)(in0_l), (v8i16)(in0_r));  \
  out1 = __msa_pckev_h((v8i16)(in1_l), (v8i16)(in1_r));  \
}

#define PCKEV_H_4VECS_SH(in0_l, in0_r, in1_l, in1_r,     \
                         in2_l, in2_r, in3_l, in3_r,     \
                         out0, out1, out2, out3) {       \
  out0 = __msa_pckev_h((v8i16)(in0_l), (v8i16)(in0_r));  \
  out1 = __msa_pckev_h((v8i16)(in1_l), (v8i16)(in1_r));  \
  out2 = __msa_pckev_h((v8i16)(in2_l), (v8i16)(in2_r));  \
  out3 = __msa_pckev_h((v8i16)(in3_l), (v8i16)(in3_r));  \
}

#define XORI_B_2VECS_UB(val0, val1,               \
                        out0, out1, xor_val) {    \
  out0 = __msa_xori_b((v16u8)(val0), (xor_val));  \
  out1 = __msa_xori_b((v16u8)(val1), (xor_val));  \
}

#define XORI_B_2VECS_SB(val0, val1,                      \
                        out0, out1, xor_val) {           \
  out0 = (v16i8)__msa_xori_b((v16u8)(val0), (xor_val));  \
  out1 = (v16i8)__msa_xori_b((v16u8)(val1), (xor_val));  \
}

#define XORI_B_3VECS_SB(val0, val1, val2,                \
                        out0, out1, out2, xor_val) {     \
  XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val);      \
  out2 = (v16i8)__msa_xori_b((v16u8)(val2), (xor_val));  \
}

#define XORI_B_4VECS_UB(val0, val1, val2, val3,      \
                        out0, out1, out2, out3,      \
                        xor_val) {                   \
  XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val);  \
  XORI_B_2VECS_UB(val2, val3, out2, out3, xor_val);  \
}

#define XORI_B_4VECS_SB(val0, val1, val2, val3,      \
                        out0, out1, out2, out3,      \
                        xor_val) {                   \
  XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val);  \
  XORI_B_2VECS_SB(val2, val3, out2, out3, xor_val);  \
}

#define XORI_B_7VECS_SB(val0, val1, val2, val3,      \
                        val4, val5, val6,            \
                        out0, out1, out2, out3,      \
                        out4, out5, out6,            \
                        xor_val) {                   \
  XORI_B_4VECS_SB(val0, val1, val2, val3,            \
                  out0, out1, out2, out3, xor_val);  \
  XORI_B_3VECS_SB(val4, val5, val6,                  \
                  out4, out5, out6, xor_val);        \
}

#define SRARI_H_4VECS_UH(val0, val1, val2, val3,                  \
                         out0, out1, out2, out3,                  \
                         shift_right_val) {                       \
  out0 = (v8u16)__msa_srari_h((v8i16)(val0), (shift_right_val));  \
  out1 = (v8u16)__msa_srari_h((v8i16)(val1), (shift_right_val));  \
  out2 = (v8u16)__msa_srari_h((v8i16)(val2), (shift_right_val));  \
  out3 = (v8u16)__msa_srari_h((v8i16)(val3), (shift_right_val));  \
}

#define SRARI_H_4VECS_SH(val0, val1, val2, val3,           \
                         out0, out1, out2, out3,           \
                         shift_right_val) {                \
  out0 = __msa_srari_h((v8i16)(val0), (shift_right_val));  \
  out1 = __msa_srari_h((v8i16)(val1), (shift_right_val));  \
  out2 = __msa_srari_h((v8i16)(val2), (shift_right_val));  \
  out3 = __msa_srari_h((v8i16)(val3), (shift_right_val));  \
}

#define SRARI_W_4VECS_SW(val0, val1, val2, val3,           \
                         out0, out1, out2, out3,           \
                         shift_right_val) {                \
  out0 = __msa_srari_w((v4i32)(val0), (shift_right_val));  \
  out1 = __msa_srari_w((v4i32)(val1), (shift_right_val));  \
  out2 = __msa_srari_w((v4i32)(val2), (shift_right_val));  \
  out3 = __msa_srari_w((v4i32)(val3), (shift_right_val));  \
}

#define SRARI_SATURATE_UNSIGNED_H(input, right_shift_val, sat_val) ({  \
  v8u16 out_m;                                                         \
                                                                       \
  out_m = (v8u16)__msa_srari_h((v8i16)(input), (right_shift_val));     \
  out_m = __msa_sat_u_h(out_m, (sat_val));                             \
  out_m;                                                               \
})

#define SRARI_SATURATE_SIGNED_H(input, right_shift_val, sat_val) ({  \
  v8i16 out_m;                                                       \
                                                                     \
  out_m = __msa_srari_h((v8i16)(input), (right_shift_val));          \
  out_m = __msa_sat_s_h(out_m, (sat_val));                           \
  out_m;                                                             \
})
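
/* The SRARI_SATURATE helpers apply a rounding right shift,
 * (x + (1 << (n - 1))) >> n, then saturate. Sketch (hypothetical name):
 * round a filter sum by 7 bits and clamp to 8-bit pixels (sat_val of 7
 * keeps 8 bits):
 *   v8u16 out = SRARI_SATURATE_UNSIGNED_H(sum, 7, 7);
 */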

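/* The PCKEV_B_*XORI128* store helpers below pack the even-indexed bytes
 * of two vectors and XOR the result with 128: the filter math operates
 * on pixels biased into the signed byte range by an earlier xori of 128,
 * and the final xori of 128 flips them back to unsigned [0, 255].
 */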
#define PCKEV_2B_XORI128_STORE_4_BYTES_4(in1, in2,        \
                                         pdst, stride) {  \
  uint32_t out0_m, out1_m, out2_m, out3_m;                \
  v16i8 tmp0_m;                                           \
  uint8_t *dst_m = (uint8_t *)(pdst);                     \
                                                          \
  tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1));     \
  tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128);       \
                                                          \
  out0_m = __msa_copy_u_w((v4i32)tmp0_m, 0);              \
  out1_m = __msa_copy_u_w((v4i32)tmp0_m, 1);              \
  out2_m = __msa_copy_u_w((v4i32)tmp0_m, 2);              \
  out3_m = __msa_copy_u_w((v4i32)tmp0_m, 3);              \
                                                          \
  STORE_WORD(dst_m, out0_m);                              \
  dst_m += stride;                                        \
  STORE_WORD(dst_m, out1_m);                              \
  dst_m += stride;                                        \
  STORE_WORD(dst_m, out2_m);                              \
  dst_m += stride;                                        \
  STORE_WORD(dst_m, out3_m);                              \
}

#define PCKEV_B_4_XORI128_STORE_8_BYTES_4(in1, in2,        \
                                          in3, in4,        \
                                          pdst, stride) {  \
  uint64_t out0_m, out1_m, out2_m, out3_m;                 \
  v16i8 tmp0_m, tmp1_m;                                    \
  uint8_t *dst_m = (uint8_t *)(pdst);                      \
                                                           \
  tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1));      \
  tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3));      \
                                                           \
  tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128);        \
  tmp1_m = (v16i8)__msa_xori_b((v16u8)tmp1_m, 128);        \
                                                           \
  out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0);               \
  out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1);               \
  out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0);               \
  out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1);               \
                                                           \
  STORE_DWORD(dst_m, out0_m);                              \
  dst_m += stride;                                         \
  STORE_DWORD(dst_m, out1_m);                              \
  dst_m += stride;                                         \
  STORE_DWORD(dst_m, out2_m);                              \
  dst_m += stride;                                         \
  STORE_DWORD(dst_m, out3_m);                              \
}

/* Only for signed vecs */
#define PCKEV_B_XORI128_STORE_VEC(in1, in2, pdest) {  \
  v16i8 tmp_m;                                        \
                                                      \
  tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2));  \
  tmp_m = (v16i8)__msa_xori_b((v16u8)tmp_m, 128);     \
  STORE_SB(tmp_m, (pdest));                           \
}

/* Only for signed vecs */
#define PCKEV_B_4_XORI128_AVG_STORE_8_BYTES_4(in1, dst0,       \
                                              in2, dst1,       \
                                              in3, dst2,       \
                                              in4, dst3,       \
                                              pdst, stride) {  \
  uint64_t out0_m, out1_m, out2_m, out3_m;                     \
  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                        \
  uint8_t *dst_m = (uint8_t *)(pdst);                          \
                                                               \
  tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1));   \
  tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3));   \
                                                               \
  tmp2_m = (v16u8)__msa_ilvr_d((v2i64)(dst1), (v2i64)(dst0));  \
  tmp3_m = (v16u8)__msa_ilvr_d((v2i64)(dst3), (v2i64)(dst2));  \
                                                               \
  tmp0_m = __msa_xori_b(tmp0_m, 128);                          \
  tmp1_m = __msa_xori_b(tmp1_m, 128);                          \
                                                               \
  tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m);                     \
  tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m);                     \
                                                               \
  out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0);                   \
  out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1);                   \
  out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0);                   \
  out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1);                   \
                                                               \
  STORE_DWORD(dst_m, out0_m);                                  \
  dst_m += stride;                                             \
  STORE_DWORD(dst_m, out1_m);                                  \
  dst_m += stride;                                             \
  STORE_DWORD(dst_m, out2_m);                                  \
  dst_m += stride;                                             \
  STORE_DWORD(dst_m, out3_m);                                  \
}

/* Only for signed vecs */
#define PCKEV_B_XORI128_AVG_STORE_VEC(in1, in2, dst, pdest) {  \
  v16u8 tmp_m;                                                 \
                                                               \
  tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2));    \
  tmp_m = __msa_xori_b(tmp_m, 128);                            \
  tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst));                 \
  STORE_UB(tmp_m, (pdest));                                    \
}

#define PCKEV_B_STORE_8_BYTES_4(in1, in2, in3, in4,    \
                                pdst, stride) {        \
  uint64_t out0_m, out1_m, out2_m, out3_m;             \
  v16i8 tmp0_m, tmp1_m;                                \
  uint8_t *dst_m = (uint8_t *)(pdst);                  \
                                                       \
  tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1));  \
  tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3));  \
                                                       \
  out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0);           \
  out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1);           \
  out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0);           \
  out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1);           \
                                                       \
  STORE_DWORD(dst_m, out0_m);                          \
  dst_m += stride;                                     \
  STORE_DWORD(dst_m, out1_m);                          \
  dst_m += stride;                                     \
  STORE_DWORD(dst_m, out2_m);                          \
  dst_m += stride;                                     \
  STORE_DWORD(dst_m, out3_m);                          \
}

/* Only for unsigned vecs */
#define PCKEV_B_STORE_VEC(in1, in2, pdest) {          \
  v16i8 tmp_m;                                        \
                                                      \
  tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2));  \
  STORE_SB(tmp_m, (pdest));                           \
}

#define PCKEV_B_AVG_STORE_8_BYTES_4(in1, dst0, in2, dst1,       \
                                    in3, dst2, in4, dst3,       \
                                    pdst, stride) {             \
  uint64_t out0_m, out1_m, out2_m, out3_m;                      \
  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                         \
  uint8_t *dst_m = (uint8_t *)(pdst);                           \
                                                                \
  tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1));    \
  tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3));    \
                                                                \
  tmp2_m = (v16u8)__msa_pckev_d((v2i64)(dst1), (v2i64)(dst0));  \
  tmp3_m = (v16u8)__msa_pckev_d((v2i64)(dst3), (v2i64)(dst2));  \
                                                                \
  tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m);                      \
  tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m);                      \
                                                                \
  out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0);                    \
  out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1);                    \
  out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0);                    \
  out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1);                    \
                                                                \
  STORE_DWORD(dst_m, out0_m);                                   \
  dst_m += stride;                                              \
  STORE_DWORD(dst_m, out1_m);                                   \
  dst_m += stride;                                              \
  STORE_DWORD(dst_m, out2_m);                                   \
  dst_m += stride;                                              \
  STORE_DWORD(dst_m, out3_m);                                   \
}

#define PCKEV_B_AVG_STORE_VEC(in1, in2, dst, pdest) {        \
  v16u8 tmp_m;                                               \
                                                             \
  tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2));  \
  tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst));               \
  STORE_UB(tmp_m, (pdest));                                  \
}

/* Generic: usable with both vector types and GP (scalar) operands */
#define BUTTERFLY_4(in0, in1, in2, in3,        \
                    out0, out1, out2, out3) {  \
  out0 = (in0) + (in3);                        \
  out1 = (in1) + (in2);                        \
                                               \
  out2 = (in1) - (in2);                        \
  out3 = (in0) - (in3);                        \
}
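
/* Example: BUTTERFLY_4(a, b, c, d, s0, s1, d1, d0) produces the
 * sum/difference pairs of a 4-point (i)DCT stage:
 *   s0 = a + d, s1 = b + c, d1 = b - c, d0 = a - d
 */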

/* Generic: usable with both vector types and GP (scalar) operands */
#define BUTTERFLY_8(in0, in1, in2, in3,        \
                    in4, in5, in6, in7,        \
                    out0, out1, out2, out3,    \
                    out4, out5, out6, out7) {  \
  out0 = (in0) + (in7);                        \
  out1 = (in1) + (in6);                        \
  out2 = (in2) + (in5);                        \
  out3 = (in3) + (in4);                        \
                                               \
  out4 = (in3) - (in4);                        \
  out5 = (in2) - (in5);                        \
  out6 = (in1) - (in6);                        \
  out7 = (in0) - (in7);                        \
}
#endif  /* HAVE_MSA */
#endif  /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */