ref: 2d36522991a313585a2e1895500d95ff522fc373
parent: 28801f91c4c030da55d483840691582440f8f8f4
author: Linfeng Zhang <linfengz@google.com>
date: Mon May 7 09:38:04 EDT 2018
Update vpx_sum_squares_2d_i16_sse2()

Change-Id: I5a2ca2ed246277cf6b1ef2ffac34ce5c40aa0158
--- a/vpx_dsp/sum_squares.c
+++ b/vpx_dsp/sum_squares.c
@@ -10,8 +10,7 @@
#include "./vpx_dsp_rtcd.h"
-uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride,
- int size) {
+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size) {
int r, c;
uint64_t ss = 0;
@@ -20,7 +19,7 @@
const int16_t v = src[c];
ss += v * v;
}
- src += src_stride;
+ src += stride;
}
return ss;
--- a/vpx_dsp/x86/mem_sse2.h
+++ b/vpx_dsp/x86/mem_sse2.h
@@ -15,6 +15,11 @@
#include "./vpx_config.h"
+static INLINE __m128i loadh_epi64(const __m128i s, const void *const src) {
+ return _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
+}
+
static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
__m128i *const d) {
d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
--- a/vpx_dsp/x86/sum_squares_sse2.c
+++ b/vpx_dsp/x86/sum_squares_sse2.c
@@ -10,120 +10,96 @@
#include <assert.h>
#include <emmintrin.h>
-#include <stdio.h>
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/mem_sse2.h"
-static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
- int stride) {
- const __m128i v_val_0_w =
- _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
- const __m128i v_val_1_w =
- _mm_loadl_epi64((const __m128i *)(src + 1 * stride));
- const __m128i v_val_2_w =
- _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
- const __m128i v_val_3_w =
- _mm_loadl_epi64((const __m128i *)(src + 3 * stride));
+uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {
+ // Over 75% of all calls are with size == 4.
+ if (size == 4) {
+ __m128i s[2], sq[2], ss;
- const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
- const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
- const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
- const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
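+ // Two 4-sample rows per register: rows 0/1 in s[0], rows 2/3 in s[1].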
+ s[0] = _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
+ s[0] = loadh_epi64(s[0], src + 1 * stride);
+ s[1] = _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
+ s[1] = loadh_epi64(s[1], src + 3 * stride);
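+ // Square each 16-bit sample and add adjacent pairs into 32-bit lanes.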
+ sq[0] = _mm_madd_epi16(s[0], s[0]);
+ sq[1] = _mm_madd_epi16(s[1], s[1]);
+ sq[0] = _mm_add_epi32(sq[0], sq[1]);
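+ // Reduce the four 32-bit partial sums to a single total in the low lane.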
+ ss = _mm_add_epi32(sq[0], _mm_srli_si128(sq[0], 8));
+ ss = _mm_add_epi32(ss, _mm_srli_epi64(ss, 32));
- const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
- const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
- const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
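+ // Only the low 32 bits are returned; the 4x4 total is assumed to fit.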
+ return (uint64_t)_mm_cvtsi128_si32(ss);
+ } else {
+ // Generic case
+ int r = size;
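+ // Keeps the low 32 bits of each 64-bit lane, zero-extending them to 64 bits.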
+ const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+ __m128i v_acc_q = _mm_setzero_si128();
- const __m128i v_sum_d =
- _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
+ assert(size % 8 == 0);
- return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
-}
+ do {
+ int c = 0;
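+ // 32-bit accumulator for this 8-row strip; folded into v_acc_q below.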
+ __m128i v_acc_d = _mm_setzero_si128();
-// TODO(jingning): Evaluate the performance impact here.
-#ifdef __GNUC__
-// This prevents GCC/Clang from inlining this function into
-// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack
-// maintenance instructions in the common case of 4x4.
-__attribute__((noinline))
-#endif
-static uint64_t
-vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) {
- int r, c;
- const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
- __m128i v_acc_q = _mm_setzero_si128();
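+ // Walk the strip 8 columns at a time; each iteration handles an 8x8 tile.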
+ do {
+ const int16_t *const b = src + c;
+ const __m128i v_val_0_w =
+ _mm_load_si128((const __m128i *)(b + 0 * stride));
+ const __m128i v_val_1_w =
+ _mm_load_si128((const __m128i *)(b + 1 * stride));
+ const __m128i v_val_2_w =
+ _mm_load_si128((const __m128i *)(b + 2 * stride));
+ const __m128i v_val_3_w =
+ _mm_load_si128((const __m128i *)(b + 3 * stride));
+ const __m128i v_val_4_w =
+ _mm_load_si128((const __m128i *)(b + 4 * stride));
+ const __m128i v_val_5_w =
+ _mm_load_si128((const __m128i *)(b + 5 * stride));
+ const __m128i v_val_6_w =
+ _mm_load_si128((const __m128i *)(b + 6 * stride));
+ const __m128i v_val_7_w =
+ _mm_load_si128((const __m128i *)(b + 7 * stride));
- for (r = 0; r < size; r += 8) {
- __m128i v_acc_d = _mm_setzero_si128();
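+ // Square and pairwise-add each row: four 32-bit partial sums per register.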
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+ const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+ const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+ const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
- for (c = 0; c < size; c += 8) {
- const int16_t *b = src + c;
- const __m128i v_val_0_w =
- _mm_load_si128((const __m128i *)(b + 0 * stride));
- const __m128i v_val_1_w =
- _mm_load_si128((const __m128i *)(b + 1 * stride));
- const __m128i v_val_2_w =
- _mm_load_si128((const __m128i *)(b + 2 * stride));
- const __m128i v_val_3_w =
- _mm_load_si128((const __m128i *)(b + 3 * stride));
- const __m128i v_val_4_w =
- _mm_load_si128((const __m128i *)(b + 4 * stride));
- const __m128i v_val_5_w =
- _mm_load_si128((const __m128i *)(b + 5 * stride));
- const __m128i v_val_6_w =
- _mm_load_si128((const __m128i *)(b + 6 * stride));
- const __m128i v_val_7_w =
- _mm_load_si128((const __m128i *)(b + 7 * stride));
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+ const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
- const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
- const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
- const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
- const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
- const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
- const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
- const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
- const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+ const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
- const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
- const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
- const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
- const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
+ c += 8;
+ } while (c < size);
- const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
- const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
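+ // Widen to 64 bits: the mask keeps 32-bit lanes 0/2, the shift brings down 1/3.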
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
- v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
- v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
- }
+ src += 8 * stride;
+ r -= 8;
+ } while (r);
- v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
- v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
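+ // Fold the two 64-bit halves into the low lane.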
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
- src += 8 * stride;
- }
-
- v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
-
#if ARCH_X86_64
- return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
+ return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
#else
- {
- uint64_t tmp;
- _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
- return tmp;
- }
+ {
+ uint64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
+ return tmp;
+ }
#endif
-}
-
-uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {
- // 4 elements per row only requires half an XMM register, so this
- // must be a special case, but also note that over 75% of all calls
- // are with size == 4, so it is also the common case.
- if (size == 4) {
- return vpx_sum_squares_2d_i16_4x4_sse2(src, stride);
- } else {
- // Generic case
- assert(size % 8 == 0);
- return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size);
}
}