ref: 407fad235629e93bc31635761f21d58155cebd8a
parent: c1553f859f1a8378b32919541e85f958285c2360
author: Kaustubh Raste <kaustubh.raste@imgtec.com>
date: Fri Jan 27 06:11:42 EST 2017
Add mips msa vpx Integer projection row/col functions average improvement ~4x-5x Change-Id: I17c41383250282b39f5ecae0197ef1df7de20801
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -393,6 +393,20 @@
make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa),
make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa)));
+// Instantiates IntProRowTest under the "MSA" prefix, pairing
+// vpx_int_pro_row_msa with vpx_int_pro_row_c in every tuple.
+// NOTE(review): the leading 16 / 32 / 64 is presumably the block height
+// handed to both functions -- confirm against the IntProRowTest fixture.
+INSTANTIATE_TEST_CASE_P(
+ MSA, IntProRowTest,
+ ::testing::Values(make_tuple(16, &vpx_int_pro_row_msa, &vpx_int_pro_row_c),
+ make_tuple(32, &vpx_int_pro_row_msa, &vpx_int_pro_row_c),
+ make_tuple(64, &vpx_int_pro_row_msa,
+ &vpx_int_pro_row_c)));
+
+// Instantiates IntProColTest under the "MSA" prefix, pairing
+// vpx_int_pro_col_msa with vpx_int_pro_col_c in every tuple.
+// NOTE(review): the leading 16 / 32 / 64 is presumably the width handed to
+// both functions -- confirm against the IntProColTest fixture.
+INSTANTIATE_TEST_CASE_P(
+ MSA, IntProColTest,
+ ::testing::Values(make_tuple(16, &vpx_int_pro_col_msa, &vpx_int_pro_col_c),
+ make_tuple(32, &vpx_int_pro_col_msa, &vpx_int_pro_col_c),
+ make_tuple(64, &vpx_int_pro_col_msa,
+ &vpx_int_pro_col_c)));
+
INSTANTIATE_TEST_CASE_P(MSA, SatdTest,
::testing::Values(make_tuple(16, &vpx_satd_msa),
make_tuple(64, &vpx_satd_msa),
--- a/vpx_dsp/mips/avg_msa.c
+++ b/vpx_dsp/mips/avg_msa.c
@@ -389,3 +389,175 @@
return satd;
}
+
+/* Vertical projection of a 16-pixel-wide block: hbuf[c] receives the sum of
+ * column c over 'height' rows, scaled down by height / 2.
+ *
+ * hbuf       - output; all 16 entries are overwritten.
+ * ref        - first byte of the block; 16 bytes are read from each row.
+ * ref_stride - distance in bytes between the starts of consecutive rows.
+ * height     - number of rows to accumulate. 16, 32 and 64 use the MSA path
+ *              (right shift by 3, 4 and 5 respectively); every other value
+ *              uses the scalar loop, which divides by height >> 1.
+ */
+void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref,
+                         const int ref_stride, const int height) {
+  int i;
+  int shift = 0; /* stays 0 when 'height' has no vector path */
+  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  v8i16 hbuf_r = { 0 };
+  v8i16 hbuf_l = { 0 };
+  v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l;
+  v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l;
+
+  /* shift == log2(height / 2) for the heights handled with vectors. */
+  switch (height) {
+    case 16: shift = 3; break;
+    case 32: shift = 4; break;
+    case 64: shift = 5; break;
+    default: break;
+  }
+
+  if (shift) {
+    /* Eight rows are consumed per iteration, so height / 8 iterations cover
+     * the block. The worst-case column sum is 64 * 255 = 16320, which fits
+     * in the 16-bit accumulator lanes. */
+    for (i = height >> 3; i--;) {
+      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+      ref += 8 * ref_stride;
+
+      /* Split every 16-byte row into two vectors of eight 16-bit lanes
+       * (the _r / _l halves) so they can be added to the accumulators. */
+      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+
+      /* Each ADD4 folds two rows into the hbuf_r / hbuf_l accumulators. */
+      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+    }
+
+    SRA_2V(hbuf_r, hbuf_l, shift);
+    /* Writes the two accumulators to hbuf with an offset of 8 elements. */
+    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
+  } else {
+    /* Scalar fallback for heights without a vector path.
+     * NOTE(review): norm_factor is 0 when height < 2, which would divide by
+     * zero below -- callers are assumed never to pass such a height. */
+    const int norm_factor = height >> 1;
+    int cnt;
+
+    for (cnt = 0; cnt < 16; cnt++) {
+      hbuf[cnt] = 0;
+    }
+
+    for (i = 0; i < height; ++i) {
+      for (cnt = 0; cnt < 16; cnt++) {
+        hbuf[cnt] += ref[cnt];
+      }
+
+      ref += ref_stride;
+    }
+
+    for (cnt = 0; cnt < 16; cnt++) {
+      hbuf[cnt] /= norm_factor;
+    }
+  }
+}
+
+/* Horizontal projection: returns the sum of 'width' consecutive bytes
+ * starting at 'ref', truncated to int16_t.
+ *
+ * Widths 16, 32 and 64 are summed with MSA: each 16-byte vector is first
+ * reduced to eight 16-bit sums of adjacent byte pairs, those partial sums are
+ * accumulated, and a single final reduction produces the total. Any other
+ * width is summed one byte at a time.
+ */
+int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) {
+  v16u8 src0, src1, src2, src3;
+  v8u16 pair_sums;
+
+  switch (width) {
+    case 16:
+      src0 = LD_UB(ref);
+      pair_sums = __msa_hadd_u_h(src0, src0);
+      break;
+
+    case 32:
+      LD_UB2(ref, 16, src0, src1);
+      pair_sums = __msa_hadd_u_h(src0, src0);
+      pair_sums += __msa_hadd_u_h(src1, src1);
+      break;
+
+    case 64:
+      LD_UB4(ref, 16, src0, src1, src2, src3);
+      pair_sums = __msa_hadd_u_h(src0, src0);
+      pair_sums += __msa_hadd_u_h(src1, src1);
+      pair_sums += __msa_hadd_u_h(src2, src2);
+      pair_sums += __msa_hadd_u_h(src3, src3);
+      break;
+
+    default: {
+      /* Scalar path for every width without a vector implementation. */
+      int16_t total = 0;
+      int pos;
+
+      for (pos = 0; pos < width; ++pos) {
+        total += ref[pos];
+      }
+
+      return total;
+    }
+  }
+
+  /* Collapse the eight 16-bit partial sums into the final scalar. */
+  return (int16_t)HADD_UH_U32(pair_sums);
+}
--- a/vpx_dsp/mips/macros_msa.h
+++ b/vpx_dsp/mips/macros_msa.h
@@ -1049,6 +1049,7 @@
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+/* Forwards to INSERT_D2 with v8i16 as its first (type) argument. */
+#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
/* Description : Interleave even byte elements from vectors
Arguments : Inputs - in0, in1, in2, in3
@@ -1559,6 +1560,12 @@
Details : Each element of vector 'in0' is right shifted by 'shift' and
the result is written in-place. 'shift' is a GP variable.
*/
+/* Two-vector form: right-shifts every element of 'in0' and 'in1' by 'shift'
+   and stores the results back in place. The arguments are parenthesised in
+   the expansion so that an expression passed as 'shift' (e.g. 'a & b') is
+   evaluated as a whole before the shift is applied. */
+#define SRA_2V(in0, in1, shift) \
+  {                             \
+    in0 = (in0) >> (shift);     \
+    in1 = (in1) >> (shift);     \
+  }
+
#define SRA_4V(in0, in1, in2, in3, shift) \
{ \
in0 = in0 >> shift; \
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -895,10 +895,10 @@
specialize qw/vpx_satd sse2 neon msa/;
add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
- specialize qw/vpx_int_pro_row sse2 neon/;
+ specialize qw/vpx_int_pro_row sse2 neon msa/;
add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width";
- specialize qw/vpx_int_pro_col sse2 neon/;
+ specialize qw/vpx_int_pro_col sse2 neon msa/;
add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
specialize qw/vpx_vector_var neon sse2/;