ref: a76b6b232cb612c786f80823d371e1518ce0da64
parent: f749905d0ad0c5dba4f85d8e003a4a317b9a98ec
author: Linfeng Zhang <linfengz@google.com>
date: Fri Jun 23 12:04:27 EDT 2017
Update load_input_data() in x86. Split it into load_input_data4() and load_input_data8(). Use the pack with signed saturation instruction for high bitdepth. Change-Id: Icda3e0129a6fdb4a51d1cafbdc652ae3a65f4e06
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -18,8 +18,8 @@
__m128i in[2];
const __m128i eight = _mm_set1_epi16(8);
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8);
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8);
switch (tx_type) {
case 0: // DCT_DCT
@@ -57,14 +57,14 @@
const __m128i final_rounding = _mm_set1_epi16(1 << 4);
// load input data
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8 * 1);
- in[2] = load_input_data(input + 8 * 2);
- in[3] = load_input_data(input + 8 * 3);
- in[4] = load_input_data(input + 8 * 4);
- in[5] = load_input_data(input + 8 * 5);
- in[6] = load_input_data(input + 8 * 6);
- in[7] = load_input_data(input + 8 * 7);
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8 * 1);
+ in[2] = load_input_data8(input + 8 * 2);
+ in[3] = load_input_data8(input + 8 * 3);
+ in[4] = load_input_data8(input + 8 * 4);
+ in[5] = load_input_data8(input + 8 * 5);
+ in[6] = load_input_data8(input + 8 * 6);
+ in[7] = load_input_data8(input + 8 * 7);
switch (tx_type) {
case 0: // DCT_DCT
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -27,8 +27,8 @@
__m128i in[2];
// Rows
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8);
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8);
idct4_sse2(in);
// Columns
@@ -491,10 +491,10 @@
const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
__m128i in[8], step1[8], step2[8], tmp[4];
- in[0] = load_input_data(input + 0 * 8);
- in[1] = load_input_data(input + 1 * 8);
- in[2] = load_input_data(input + 2 * 8);
- in[3] = load_input_data(input + 3 * 8);
+ in[0] = load_input_data4(input + 0 * 8);
+ in[1] = load_input_data4(input + 1 * 8);
+ in[2] = load_input_data4(input + 2 * 8);
+ in[3] = load_input_data4(input + 3 * 8);
transpose_16bit_4x4(in, in);
// in[0]: 00 10 20 30 01 11 21 31
@@ -721,14 +721,14 @@
static INLINE void idct16_load8x8(const tran_low_t *const input,
__m128i *const in) {
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8 * 2);
- in[2] = load_input_data(input + 8 * 4);
- in[3] = load_input_data(input + 8 * 6);
- in[4] = load_input_data(input + 8 * 8);
- in[5] = load_input_data(input + 8 * 10);
- in[6] = load_input_data(input + 8 * 12);
- in[7] = load_input_data(input + 8 * 14);
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8 * 2);
+ in[2] = load_input_data8(input + 8 * 4);
+ in[3] = load_input_data8(input + 8 * 6);
+ in[4] = load_input_data8(input + 8 * 8);
+ in[5] = load_input_data8(input + 8 * 10);
+ in[6] = load_input_data8(input + 8 * 12);
+ in[7] = load_input_data8(input + 8 * 14);
}
void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
@@ -1258,10 +1258,10 @@
int i;
// First 1-D inverse DCT
// Load input data.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8 * 2);
- in[2] = load_input_data(input + 8 * 4);
- in[3] = load_input_data(input + 8 * 6);
+ in[0] = load_input_data4(input + 0 * 16);
+ in[1] = load_input_data4(input + 1 * 16);
+ in[2] = load_input_data4(input + 2 * 16);
+ in[3] = load_input_data4(input + 3 * 16);
transpose_16bit_4x4(in, in);
@@ -1651,14 +1651,14 @@
int i;
// Load input data. Only need to load the top left 8x8 block.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 32);
- in[2] = load_input_data(input + 64);
- in[3] = load_input_data(input + 96);
- in[4] = load_input_data(input + 128);
- in[5] = load_input_data(input + 160);
- in[6] = load_input_data(input + 192);
- in[7] = load_input_data(input + 224);
+ in[0] = load_input_data8(input + 0 * 32);
+ in[1] = load_input_data8(input + 1 * 32);
+ in[2] = load_input_data8(input + 2 * 32);
+ in[3] = load_input_data8(input + 3 * 32);
+ in[4] = load_input_data8(input + 4 * 32);
+ in[5] = load_input_data8(input + 5 * 32);
+ in[6] = load_input_data8(input + 6 * 32);
+ in[7] = load_input_data8(input + 7 * 32);
transpose_16bit_8x8(in, in);
IDCT32_34
@@ -2008,10 +2008,10 @@
static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
int i;
for (i = 0; i < 8; ++i) {
- in[i] = load_input_data(input);
- in[i + 8] = load_input_data(input + 8);
- in[i + 16] = load_input_data(input + 16);
- in[i + 24] = load_input_data(input + 24);
+ in[i] = load_input_data8(input);
+ in[i + 8] = load_input_data8(input + 8);
+ in[i + 16] = load_input_data8(input + 16);
+ in[i + 24] = load_input_data8(input + 24);
input += 32;
}
}
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -76,24 +76,23 @@
return _mm_packs_epi32(t0, t1);
}
-// Function to allow 8 bit optimisations to be used when profile 0 is used with
+// Functions to allow 8 bit optimisations to be used when profile 0 is used with
// highbitdepth enabled
-static INLINE __m128i load_input_data(const tran_low_t *data) {
+static INLINE __m128i load_input_data4(const tran_low_t *data) {
#if CONFIG_VP9_HIGHBITDEPTH
- // in0: 0 X 1 X 2 X 3 X
- // in1: 4 X 5 X 6 X 7 X
- // t0: 0 4 X X 1 5 X X
- // t1: 2 6 X X 3 7 X X
- // t2: 0 2 4 6 X X X X
- // t3: 1 3 5 7 X X X X
- // rtn: 0 1 2 3 4 5 6 7
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i in = _mm_load_si128((const __m128i *)data);
+ return _mm_packs_epi32(in, zero);
+#else
+ return _mm_loadl_epi64((const __m128i *)data);
+#endif
+}
+
+static INLINE __m128i load_input_data8(const tran_low_t *data) {
+#if CONFIG_VP9_HIGHBITDEPTH
const __m128i in0 = _mm_load_si128((const __m128i *)data);
const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4));
- const __m128i t0 = _mm_unpacklo_epi16(in0, in1);
- const __m128i t1 = _mm_unpackhi_epi16(in0, in1);
- const __m128i t2 = _mm_unpacklo_epi16(t0, t1);
- const __m128i t3 = _mm_unpackhi_epi16(t0, t1);
- return _mm_unpacklo_epi16(t2, t3);
+ return _mm_packs_epi32(in0, in1);
#else
return _mm_load_si128((const __m128i *)data);
#endif
@@ -101,35 +100,35 @@
static INLINE void load_buffer_8x8(const tran_low_t *const input,
__m128i *const in) {
- in[0] = load_input_data(input + 0 * 8);
- in[1] = load_input_data(input + 1 * 8);
- in[2] = load_input_data(input + 2 * 8);
- in[3] = load_input_data(input + 3 * 8);
- in[4] = load_input_data(input + 4 * 8);
- in[5] = load_input_data(input + 5 * 8);
- in[6] = load_input_data(input + 6 * 8);
- in[7] = load_input_data(input + 7 * 8);
+ in[0] = load_input_data8(input + 0 * 8);
+ in[1] = load_input_data8(input + 1 * 8);
+ in[2] = load_input_data8(input + 2 * 8);
+ in[3] = load_input_data8(input + 3 * 8);
+ in[4] = load_input_data8(input + 4 * 8);
+ in[5] = load_input_data8(input + 5 * 8);
+ in[6] = load_input_data8(input + 6 * 8);
+ in[7] = load_input_data8(input + 7 * 8);
}
static INLINE void load_buffer_8x16(const tran_low_t *const input,
__m128i *const in) {
- in[0] = load_input_data(input + 0 * 16);
- in[1] = load_input_data(input + 1 * 16);
- in[2] = load_input_data(input + 2 * 16);
- in[3] = load_input_data(input + 3 * 16);
- in[4] = load_input_data(input + 4 * 16);
- in[5] = load_input_data(input + 5 * 16);
- in[6] = load_input_data(input + 6 * 16);
- in[7] = load_input_data(input + 7 * 16);
+ in[0] = load_input_data8(input + 0 * 16);
+ in[1] = load_input_data8(input + 1 * 16);
+ in[2] = load_input_data8(input + 2 * 16);
+ in[3] = load_input_data8(input + 3 * 16);
+ in[4] = load_input_data8(input + 4 * 16);
+ in[5] = load_input_data8(input + 5 * 16);
+ in[6] = load_input_data8(input + 6 * 16);
+ in[7] = load_input_data8(input + 7 * 16);
- in[8] = load_input_data(input + 8 * 16);
- in[9] = load_input_data(input + 9 * 16);
- in[10] = load_input_data(input + 10 * 16);
- in[11] = load_input_data(input + 11 * 16);
- in[12] = load_input_data(input + 12 * 16);
- in[13] = load_input_data(input + 13 * 16);
- in[14] = load_input_data(input + 14 * 16);
- in[15] = load_input_data(input + 15 * 16);
+ in[8] = load_input_data8(input + 8 * 16);
+ in[9] = load_input_data8(input + 9 * 16);
+ in[10] = load_input_data8(input + 10 * 16);
+ in[11] = load_input_data8(input + 11 * 16);
+ in[12] = load_input_data8(input + 12 * 16);
+ in[13] = load_input_data8(input + 13 * 16);
+ in[14] = load_input_data8(input + 14 * 16);
+ in[15] = load_input_data8(input + 15 * 16);
}
static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) {
--- a/vpx_dsp/x86/inv_txfm_ssse3.c
+++ b/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -36,10 +36,10 @@
__m128i tmp[4];
// Rows. Load 4-row input data.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8 * 1);
- in[2] = load_input_data(input + 8 * 2);
- in[3] = load_input_data(input + 8 * 3);
+ in[0] = load_input_data4(input + 0 * 8);
+ in[1] = load_input_data4(input + 1 * 8);
+ in[2] = load_input_data4(input + 2 * 8);
+ in[3] = load_input_data4(input + 3 * 8);
// 4x4 Transpose
transpose_16bit_4x4(in, in);
@@ -342,14 +342,14 @@
int i;
// Load input data. Only need to load the top left 8x8 block.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 32);
- in[2] = load_input_data(input + 64);
- in[3] = load_input_data(input + 96);
- in[4] = load_input_data(input + 128);
- in[5] = load_input_data(input + 160);
- in[6] = load_input_data(input + 192);
- in[7] = load_input_data(input + 224);
+ in[0] = load_input_data8(input + 0 * 32);
+ in[1] = load_input_data8(input + 1 * 32);
+ in[2] = load_input_data8(input + 2 * 32);
+ in[3] = load_input_data8(input + 3 * 32);
+ in[4] = load_input_data8(input + 4 * 32);
+ in[5] = load_input_data8(input + 5 * 32);
+ in[6] = load_input_data8(input + 6 * 32);
+ in[7] = load_input_data8(input + 7 * 32);
transpose_16bit_8x8(in, in);
idct32_34_first_half(in, stp1);
@@ -383,8 +383,8 @@
__m128i *in1) {
int i;
for (i = 0; i < 16; i++) {
- in0[i] = load_input_data(input);
- in1[i] = load_input_data(input + 8);
+ in0[i] = load_input_data8(input);
+ in1[i] = load_input_data8(input + 8);
input += 32;
}
}