ref: 8ee9b855a0cb9930dd859e391552b40a1ca6c35d
parent: f16ea6a6eb3b0e06b9e3e586ef15be0d2b0e870c
parent: e45c1f55b4e8d10a3fe66a986749c849c72fae58
author: Kaustubh Raste <kaustubh.raste@imgtec.com>
date: Thu Mar 23 03:44:16 EDT 2017
Merge "Fix mips msa fwd xform mismatch"
--- a/vpx_dsp/mips/fwd_dct32x32_msa.c
+++ b/vpx_dsp/mips/fwd_dct32x32_msa.c
@@ -927,21 +927,21 @@
 }
 
 void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
-  int sum = LD_HADD(input, stride);
-  sum += LD_HADD(input + 8, stride);
-  sum += LD_HADD(input + 16, stride);
-  sum += LD_HADD(input + 24, stride);
-  sum += LD_HADD(input + 32 * 8, stride);
-  sum += LD_HADD(input + 32 * 8 + 8, stride);
-  sum += LD_HADD(input + 32 * 8 + 16, stride);
-  sum += LD_HADD(input + 32 * 8 + 24, stride);
-  sum += LD_HADD(input + 32 * 16, stride);
-  sum += LD_HADD(input + 32 * 16 + 8, stride);
-  sum += LD_HADD(input + 32 * 16 + 16, stride);
-  sum += LD_HADD(input + 32 * 16 + 24, stride);
-  sum += LD_HADD(input + 32 * 24, stride);
-  sum += LD_HADD(input + 32 * 24 + 8, stride);
-  sum += LD_HADD(input + 32 * 24 + 16, stride);
-  sum += LD_HADD(input + 32 * 24 + 24, stride);
+  int sum, i;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v4i32 vec_w = { 0 };
+
+  for (i = 0; i < 16; ++i) {
+    LD_SH4(input, 8, in0, in1, in2, in3);
+    input += stride;
+    LD_SH4(input, 8, in4, in5, in6, in7);
+    input += stride;
+    ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
+    ADD2(in0, in2, in4, in6, in0, in4);
+    vec_w += __msa_hadd_s_w(in0, in0);
+    vec_w += __msa_hadd_s_w(in4, in4);
+  }
+
+  sum = HADD_SW_S32(vec_w);
   out[0] = (int16_t)(sum >> 3);
 }
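For clarity, a scalar reference sketch of what the rewritten vpx_fdct32x32_1_msa computes (the reference function name is illustrative, not from the patch): sum every residual of the 32x32 block at 32-bit precision, then apply the same >> 3 scaling.

  #include <stdint.h>

  /* Scalar sketch of the DC-only 32x32 path above (illustration only;
   * the shipped code is the MSA version in this patch). */
  static void fdct32x32_1_ref(const int16_t *input, int16_t *out,
                              int32_t stride) {
    int32_t sum = 0;
    int r, c;

    for (r = 0; r < 32; ++r) {
      for (c = 0; c < 32; ++c) sum += input[r * stride + c];
    }
    out[0] = (int16_t)(sum >> 3);
  }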
--- a/vpx_dsp/mips/fwd_txfm_msa.c
+++ b/vpx_dsp/mips/fwd_txfm_msa.c
@@ -216,7 +216,15 @@
 }
 
 void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
-  out[0] = LD_HADD(input, stride);
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v4i32 vec_w;
+
+  LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
+  ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
+  ADD2(in0, in2, in4, in6, in0, in4);
+  vec_w = __msa_hadd_s_w(in0, in0);
+  vec_w += __msa_hadd_s_w(in4, in4);
+  out[0] = HADD_SW_S32(vec_w);
   out[1] = 0;
 }
 
@@ -237,9 +245,25 @@
 }
 
 void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
-  int sum = LD_HADD(input, stride);
-  sum += LD_HADD(input + 8, stride);
-  sum += LD_HADD(input + 16 * 8, stride);
-  sum += LD_HADD(input + 16 * 8 + 8, stride);
+  int sum, i;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v4i32 vec_w = { 0 };
+
+  for (i = 0; i < 4; ++i) {
+    LD_SH2(input, 8, in0, in1);
+    input += stride;
+    LD_SH2(input, 8, in2, in3);
+    input += stride;
+    LD_SH2(input, 8, in4, in5);
+    input += stride;
+    LD_SH2(input, 8, in6, in7);
+    input += stride;
+    ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
+    ADD2(in0, in2, in4, in6, in0, in4);
+    vec_w += __msa_hadd_s_w(in0, in0);
+    vec_w += __msa_hadd_s_w(in4, in4);
+  }
+
+  sum = HADD_SW_S32(vec_w);
   out[0] = (int16_t)(sum >> 1);
 }
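Likewise, scalar sketches of the 8x8 and 16x16 DC-only paths above (reference names illustrative, not from the patch): the whole block is summed at 32-bit precision, stored unscaled for 8x8 and halved for 16x16.

  #include <stdint.h>

  static void fdct8x8_1_ref(const int16_t *input, int16_t *out,
                            int32_t stride) {
    int32_t sum = 0;
    int r, c;

    for (r = 0; r < 8; ++r) {
      for (c = 0; c < 8; ++c) sum += input[r * stride + c];
    }
    out[0] = (int16_t)sum; /* DC term, no scaling for 8x8 */
    out[1] = 0;
  }

  static void fdct16x16_1_ref(const int16_t *input, int16_t *out,
                              int32_t stride) {
    int32_t sum = 0;
    int r, c;

    for (r = 0; r < 16; ++r) {
      for (c = 0; c < 16; ++c) sum += input[r * stride + c];
    }
    out[0] = (int16_t)(sum >> 1);
  }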
--- a/vpx_dsp/mips/fwd_txfm_msa.h
+++ b/vpx_dsp/mips/fwd_txfm_msa.h
@@ -14,22 +14,6 @@
#include "vpx_dsp/mips/txfm_macros_msa.h"
#include "vpx_dsp/txfm_common.h"
-#define LD_HADD(psrc, stride) \
- ({ \
- v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \
- v4i32 vec_w_m; \
- \
- LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \
- ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \
- LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \
- ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \
- in0_m, in4_m); \
- in0_m += in4_m; \
- \
- vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \
- HADD_SW_S32(vec_w_m); \
- })
-
#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
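For reference, __msa_hadd_s_w(v, v) sign-extends adjacent int16 lane pairs and adds them into four int32 lanes, and HADD_SW_S32 (from macros_msa.h) reduces a v4i32 to a single int32. A plain-C sketch of that reduction as it is used above (semantics assumed from context, for illustration only):

  #include <stdint.h>

  /* Widen-and-pair-add eight int16 lanes, then reduce four int32 lanes
   * to one sum, mirroring __msa_hadd_s_w followed by HADD_SW_S32. */
  static int32_t hadd_pairs_then_reduce(const int16_t lanes[8]) {
    int32_t w[4];
    int32_t sum = 0;
    int i;

    for (i = 0; i < 4; ++i) /* like __msa_hadd_s_w(v, v) */
      w[i] = (int32_t)lanes[2 * i] + lanes[2 * i + 1];
    for (i = 0; i < 4; ++i) /* like HADD_SW_S32(vec_w) */
      sum += w[i];
    return sum;
  }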