ref: 17ffb22d4daf4c4645b4c79086d8c27a2768775e
parent: 2c45cd174a9582909ee2a7ba9cdb3feb917840cf
author: Scott LaVarnway <slavarnway@google.com>
date: Thu Jul 19 23:50:49 EDT 2018
VPX: Improve HBD vpx_hadamard_16x16_sse2() ~12% improvement. Change-Id: Ieca4d870a4c1c5ea2c689e27fc4550fcbab9f867
--- a/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -264,8 +264,9 @@
}
}
-void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride,
- tran_low_t *coeff) {
+static INLINE void hadamard_8x8_sse2(int16_t const *src_diff,
+ ptrdiff_t src_stride, tran_low_t *_coeff,
+ int is_final) {
__m128i src[8];
src[0] = _mm_load_si128((const __m128i *)src_diff);
src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
@@ -279,37 +280,72 @@
hadamard_col8_sse2(src, 0);
hadamard_col8_sse2(src, 1);
- store_tran_low(src[0], coeff);
- coeff += 8;
- store_tran_low(src[1], coeff);
- coeff += 8;
- store_tran_low(src[2], coeff);
- coeff += 8;
- store_tran_low(src[3], coeff);
- coeff += 8;
- store_tran_low(src[4], coeff);
- coeff += 8;
- store_tran_low(src[5], coeff);
- coeff += 8;
- store_tran_low(src[6], coeff);
- coeff += 8;
- store_tran_low(src[7], coeff);
+ if (is_final) {
+ store_tran_low(src[0], _coeff);
+ _coeff += 8;
+ store_tran_low(src[1], _coeff);
+ _coeff += 8;
+ store_tran_low(src[2], _coeff);
+ _coeff += 8;
+ store_tran_low(src[3], _coeff);
+ _coeff += 8;
+ store_tran_low(src[4], _coeff);
+ _coeff += 8;
+ store_tran_low(src[5], _coeff);
+ _coeff += 8;
+ store_tran_low(src[6], _coeff);
+ _coeff += 8;
+ store_tran_low(src[7], _coeff);
+ } else {
+ int16_t *coeff = (int16_t *)_coeff;
+ _mm_store_si128((__m128i *)coeff, src[0]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[1]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[2]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[3]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[4]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[5]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[6]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[7]);
+ }
}
+void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
+}
+
void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+ int16_t *t_coeff = temp_coeff;
+#else
+ int16_t *t_coeff = coeff;
+#endif
int idx;
for (idx = 0; idx < 4; ++idx) {
int16_t const *src_ptr =
src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
- vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+ hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
+ 0);
}
for (idx = 0; idx < 64; idx += 8) {
- __m128i coeff0 = load_tran_low(coeff);
- __m128i coeff1 = load_tran_low(coeff + 64);
- __m128i coeff2 = load_tran_low(coeff + 128);
- __m128i coeff3 = load_tran_low(coeff + 192);
+ __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
__m128i b0 = _mm_add_epi16(coeff0, coeff1);
__m128i b1 = _mm_sub_epi16(coeff0, coeff1);
@@ -332,6 +368,7 @@
store_tran_low(coeff3, coeff + 192);
coeff += 8;
+ t_coeff += 8;
}
}