shithub: libvpx

--- a/vp9/common/mips/msa/vp9_macros_msa.h

+++ b/vp9/common/mips/msa/vp9_macros_msa.h

@@ -440,6 +440,17 @@

 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)

+/* Description : Store vectors of word elements with stride

+   Arguments   : Inputs  - in0, in1  (vectors to store)

+                         - pdst      (destination pointer to store to)

+                         - stride    (offset added to 'pdst' for 'in1')

+   Details     : Store 4 word elements from 'in0' to (pdst)

+                 Store 4 word elements from 'in1' to (pdst + stride)

+                 'stride' is parenthesized in the expansion so that an

+                 expression argument is added to 'pdst' as a whole

+*/

+#define ST_SW2(in0, in1, pdst, stride) {  \

+  ST_SW(in0, (pdst));                     \

+  ST_SW(in1, (pdst) + (stride));          \

+}

 /* Description : Store as 2x4 byte block to destination memory from input vector

    Arguments   : Inputs  - in, stidx, pdst, stride

                  Return Type - unsigned byte

@@ -781,6 +792,39 @@

 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

+/* Description : Dot product & addition of halfword vector elements

+   Arguments   : Inputs  - mult0, mult1

+                           cnst0, cnst1

+                 Outputs - out0, out1 (also read as the accumulators)

+                 Return Type - as per RTYPE

+   Details     : Signed halfword elements from 'mult0' are multiplied with

+                 signed halfword elements from 'cnst0' producing a result

+                 twice the size of input i.e. signed word.

+                 The products of each adjacent even-odd element pair are

+                 added to the corresponding word element of 'out0'

+                 'mult1' and 'cnst1' are accumulated into 'out1' the same way

+*/

+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {         \

+  out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0);  \

+  out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1);  \

+}

+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

+/* Description : Dot product of word vector elements with themselves,

+                 accumulated into double word vector elements

+   Arguments   : Inputs  - mult0, mult1

+                 Outputs - out0, out1 (also read as the accumulators)

+                 Return Type - as per RTYPE

+   Details     : Each signed word element from 'mult0' is multiplied with

+                 itself producing an intermediate result twice the size of

+                 input i.e. signed double word

+                 The products of each adjacent even-odd element pair are

+                 added to the corresponding double word element of 'out0'

+                 'mult1' is squared and accumulated into 'out1' the same way

+*/

+#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) {                       \

+  out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0);  \

+  out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1);  \

+}

+#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)

 /* Description : Minimum values between unsigned elements of

                  either vector are copied to the output vector

    Arguments   : Inputs  - in0, in1, min_vec

@@ -861,6 +905,34 @@

   HADD_UB2(RTYPE, in2, in3, out2, out3);                               \

 #define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)

+/* Description : Horizontal subtraction of unsigned byte vector elements

+   Arguments   : Inputs  - in0, in1

+                 Outputs - out0, out1

+                 Return Type - as per RTYPE

+   Details     : Within each adjacent pair of unsigned byte elements of

+                 'in0', the even element is subtracted from the odd element

+                 (MSA HSUB_U.H: odd - even) and the halfword result is

+                 written to 'out0'

+                 'in1' is processed the same way into 'out1'

+*/

+#define HSUB_UB2(RTYPE, in0, in1, out0, out1) {          \

+  out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0);  \

+  out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1);  \

+}

+#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

+/* Description : Horizontal subtraction of signed halfword vector elements

+                 (the signed intrinsic is used despite the 'UH' in the name)

+   Arguments   : Inputs  - in0, in1

+                 Outputs - out0, out1

+                 Return Type - as per RTYPE

+   Details     : Within each adjacent pair of signed halfword elements of

+                 'in0', the even element is subtracted from the odd element

+                 (MSA HSUB_S.W: odd - even) and the word result is written

+                 to 'out0'

+                 'in1' is processed the same way into 'out1'

+*/

+#define HSUB_UH2(RTYPE, in0, in1, out0, out1) {          \

+  out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0);  \

+  out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1);  \

+}

+#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)

 /* Description : Insert specified word elements from input vectors to 1

                  destination vector

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -948,7 +948,7 @@

   specialize qw/vp9_fdct8x8_quant/;

 } else {

   add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";

-  specialize qw/vp9_block_error avx2/, "$sse2_x86inc";

+  specialize qw/vp9_block_error avx2 msa/, "$sse2_x86inc";

   add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";

   specialize qw/vp9_block_error_fp sse2/;

--- /dev/null

+++ b/vp9/encoder/mips/msa/vp9_error_msa.c

@@ -1,0 +1,114 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vp9_rtcd.h"

+#include "vp9/common/mips/msa/vp9_macros_msa.h"

+/* Generates block_error_<BSize>size_msa(), which walks BSize 16-bit values

+ * from 'coeff_ptr' and 'dq_coeff_ptr', 16 per step (two loads at offsets 0

+ * and 8 from each pointer).

+ * The generated function returns the accumulated squares of the pairwise

+ * differences between the two inputs and stores the accumulated squares of

+ * the 'coeff_ptr' values in '*ssz'.

+ * BSize must be a multiple of 16 and at least 16: the first 16 elements are

+ * handled before the loop, which then runs (BSize >> 4) - 1 times using an

+ * unsigned counter.

+ */

+#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize)                                   \

+static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr,     \

+                                             const int16_t *dq_coeff_ptr,  \

+                                             int64_t *ssz) {               \

+  int64_t err = 0;                                                         \

+  uint32_t loop_cnt;                                                       \

+  v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h;                             \

+  v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w;                              \

+  v2i64 sq_coeff_r, sq_coeff_l;                                            \

+  v2i64 err0, err_dup0, err1, err_dup1;                                    \

+  /* Elements 0..7: DOTP initializes both accumulators. */                 \

+  coeff = LD_SH(coeff_ptr);                                                \

+  dq_coeff = LD_SH(dq_coeff_ptr);                                          \

+  UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                                \

+  ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                      \

+  HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                       \

+  DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w,                  \

+              sq_coeff_r, sq_coeff_l);                                     \

+  DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1);                 \

+  /* Elements 8..15: DPADD accumulates onto the running sums. */           \

+  coeff = LD_SH(coeff_ptr + 8);                                            \

+  dq_coeff = LD_SH(dq_coeff_ptr + 8);                                      \

+  UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                                \

+  ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                      \

+  HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                       \

+  DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);              \

+  DPADD_SD2_SD(diff_r, diff_l, err0, err1);                                \

+                                                                           \

+  coeff_ptr += 16;                                                         \

+  dq_coeff_ptr += 16;                                                      \

+  /* Each iteration handles 16 further elements the same way. */           \

+  for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) {                       \

+    coeff = LD_SH(coeff_ptr);                                              \

+    dq_coeff = LD_SH(dq_coeff_ptr);                                        \

+    UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                              \

+    ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                    \

+    HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                     \

+    DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);            \

+    DPADD_SD2_SD(diff_r, diff_l, err0, err1);                              \

+                                                                           \

+    coeff = LD_SH(coeff_ptr + 8);                                          \

+    dq_coeff = LD_SH(dq_coeff_ptr + 8);                                    \

+    UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                              \

+    ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                    \

+    HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                     \

+    DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);            \

+    DPADD_SD2_SD(diff_r, diff_l, err0, err1);                              \

+                                                                           \

+    coeff_ptr += 16;                                                       \

+    dq_coeff_ptr += 16;                                                    \

+  }                                                                        \

+  /* Add the two doubleword lanes of each coeff^2 sum to form *ssz. */     \

+  err_dup0 = __msa_splati_d(sq_coeff_r, 1);                                \

+  err_dup1 = __msa_splati_d(sq_coeff_l, 1);                                \

+  sq_coeff_r += err_dup0;                                                  \

+  sq_coeff_l += err_dup1;                                                  \

+  *ssz = __msa_copy_s_d(sq_coeff_r, 0);                                    \

+  *ssz += __msa_copy_s_d(sq_coeff_l, 0);                                   \

+  /* Same lane reduction for the squared-difference sums. */               \

+  err_dup0 = __msa_splati_d(err0, 1);                                      \

+  err_dup1 = __msa_splati_d(err1, 1);                                      \

+  err0 += err_dup0;                                                        \

+  err1 += err_dup1;                                                        \

+  err = __msa_copy_s_d(err0, 0);                                           \

+  err += __msa_copy_s_d(err1, 0);                                          \

+                                                                           \

+  return err;                                                              \

+}

+/* Instantiate one block-error routine per supported block size.

+ * No trailing ';': each expansion is already a complete function definition,

+ * and an extra ';' at file scope is not permitted by ISO C.

+ */

+BLOCK_ERROR_BLOCKSIZE_MSA(16)

+BLOCK_ERROR_BLOCKSIZE_MSA(64)

+BLOCK_ERROR_BLOCKSIZE_MSA(256)

+BLOCK_ERROR_BLOCKSIZE_MSA(1024)

+/* MSA entry point for vp9_block_error.

+ * Dispatches to the fixed-size routine matching 'blk_size' (16, 64, 256 or

+ * 1024) and hands every other size to vp9_block_error_c(). The value of the

+ * selected routine is returned; that routine also receives 'ssz'.

+ * NOTE(review): the inputs are reinterpreted as int16_t, which presumably

+ * relies on tran_low_t being int16_t in this configuration - confirm.

+ */

+int64_t vp9_block_error_msa(const tran_low_t *coeff_ptr,

+                            const tran_low_t *dq_coeff_ptr,

+                            intptr_t blk_size, int64_t *ssz) {

+  const int16_t *const src = (const int16_t *)coeff_ptr;

+  const int16_t *const dq = (const int16_t *)dq_coeff_ptr;

+  switch (blk_size) {

+    case 16:

+      return block_error_16size_msa(src, dq, ssz);

+    case 64:

+      return block_error_64size_msa(src, dq, ssz);

+    case 256:

+      return block_error_256size_msa(src, dq, ssz);

+    case 1024:

+      return block_error_1024size_msa(src, dq, ssz);

+    default:

+      /* No specialized kernel for this size: use the C version. */

+      return vp9_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz);

+  }

+}

--- a/vp9/vp9cx.mk

+++ b/vp9/vp9cx.mk

@@ -152,11 +152,12 @@

 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c

 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c

+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c

+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c

 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c

 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c

 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c

 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c

 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h

-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c

 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))

--

⑨