ref: b7310e2affd82ce75eee4385fca36275d048f457
parent: 8b5eddf709b5ecd09c2cec98c5418a2e3b0cfe14
author: Kaustubh Raste <kaustubh.raste@imgtec.com>
date: Mon Oct 10 12:15:06 EDT 2016
Optimize sad_64width_x4d_msa function

Reduced HADD_UH_U32 macro calls.

Change-Id: Ie089b9a443de516646b46e8f72156aa826ca8cfa
--- a/vpx_dsp/mips/macros_msa.h
+++ b/vpx_dsp/mips/macros_msa.h
@@ -909,25 +909,40 @@
sum_m; \
})
-/* Description : Horizontal addition of 8 unsigned halfword elements
- Arguments : Inputs - in (unsigned halfword vector)
- Outputs - sum_m (u32 sum)
- Return Type - unsigned word
- Details : 8 unsigned halfword elements of input vector are added
- together and the resulting integer sum is returned
+/* Description : Horizontal addition of 4 unsigned word elements
+ Arguments : Input - in (unsigned word vector)
+ Output - sum_m (u32 sum)
+ Return Type - unsigned word (GP)
+ Details : 4 unsigned word elements of 'in' vector are added together and
+ the resulting integer sum is returned
*/
-#define HADD_UH_U32(in) \
+#define HADD_UW_U32(in) \
({ \
- v4u32 res_m; \
v2u64 res0_m, res1_m; \
uint32_t sum_m; \
\
- res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
- res0_m = __msa_hadd_u_d(res_m, res_m); \
+ res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \
res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
- res0_m = res0_m + res1_m; \
+ res0_m += res1_m; \
sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
sum_m; \
+ })
+
+/* Description : Horizontal addition of 8 unsigned halfword elements
+ Arguments : Input - in (unsigned halfword vector)
+ Output - sum_m (u32 sum)
+ Return Type - unsigned word
+ Details : 8 unsigned halfword elements of 'in' vector are added
+ together and the resulting integer sum is returned
+*/
+#define HADD_UH_U32(in) \
+ ({ \
+ v4u32 res_m; \
+ uint32_t sum_m; \
+ \
+ res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
+ sum_m = HADD_UW_U32(res_m); \
+ sum_m; \
})
/* Description : Horizontal addition of unsigned byte vector elements
--- a/vpx_dsp/mips/sad_msa.c
+++ b/vpx_dsp/mips/sad_msa.c
@@ -1030,6 +1030,7 @@
v8u16 sad2_1 = { 0 };
v8u16 sad3_0 = { 0 };
v8u16 sad3_1 = { 0 };
+ v4u32 sad;
ref0_ptr = aref_ptr[0];
ref1_ptr = aref_ptr[1];
@@ -1061,14 +1062,21 @@
sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
}
- sad_array[0] = HADD_UH_U32(sad0_0);
- sad_array[0] += HADD_UH_U32(sad0_1);
- sad_array[1] = HADD_UH_U32(sad1_0);
- sad_array[1] += HADD_UH_U32(sad1_1);
- sad_array[2] = HADD_UH_U32(sad2_0);
- sad_array[2] += HADD_UH_U32(sad2_1);
- sad_array[3] = HADD_UH_U32(sad3_0);
- sad_array[3] += HADD_UH_U32(sad3_1);
+ sad = __msa_hadd_u_w(sad0_0, sad0_0);
+ sad += __msa_hadd_u_w(sad0_1, sad0_1);
+ sad_array[0] = HADD_UW_U32(sad);
+
+ sad = __msa_hadd_u_w(sad1_0, sad1_0);
+ sad += __msa_hadd_u_w(sad1_1, sad1_1);
+ sad_array[1] = HADD_UW_U32(sad);
+
+ sad = __msa_hadd_u_w(sad2_0, sad2_0);
+ sad += __msa_hadd_u_w(sad2_1, sad2_1);
+ sad_array[2] = HADD_UW_U32(sad);
+
+ sad = __msa_hadd_u_w(sad3_0, sad3_0);
+ sad += __msa_hadd_u_w(sad3_1, sad3_1);
+ sad_array[3] = HADD_UW_U32(sad);
}
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,