ref: 0c613312446c1bb867fad1a093bb099ce29dcb14
parent: cbb83ba4aa99b40b0b4a2a407bfd6d0d8be87d1f
author: Linfeng Zhang <linfengz@google.com>
date: Fri Aug 4 11:10:12 EDT 2017
Update high bitdepth 16x16 idct x86 code. Prepare for high bitdepth 16x16 idct sse4.1 code. This change only moves and renames functions. BUG=webm:1412 Change-Id: Ie056fe4494b1f299491968beadcef990e2ab714a
--- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -16,43 +16,6 @@
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
-static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
- const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
- const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
- return _mm_packs_epi32(t0, t1);
-}
-
-static INLINE void highbd_write_buffer_8x1(uint16_t *dest, const __m128i in,
- const int bd) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- __m128i out;
-
- out = _mm_adds_epi16(in, final_rounding);
- out = _mm_srai_epi16(out, 6);
- recon_and_store_8_kernel(out, &dest, 0, bd);
-}
-
-static INLINE void recon_and_store_4_kernel(const __m128i in,
- uint16_t *const dest,
- const int bd) {
- __m128i d;
-
- d = _mm_loadl_epi64((const __m128i *)dest);
- d = add_clamp(d, in, bd);
- _mm_storel_epi64((__m128i *)dest, d);
-}
-
-static INLINE void highbd_write_buffer_4x1(uint16_t *const dest,
- const __m128i in, const int bd) {
- const __m128i final_rounding = _mm_set1_epi32(1 << 5);
- __m128i out;
-
- out = _mm_add_epi32(in, final_rounding);
- out = _mm_srai_epi32(out, 6);
- out = _mm_packs_epi32(out, out);
- recon_and_store_4_kernel(out, dest, bd);
-}
-
static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
__m128i *const out) {
__m128i temp1[2], temp2, sign[2];
@@ -107,26 +70,6 @@
out[15] = in[15];
}
-static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
- __m128i *const out) {
- out[0] = _mm_add_epi32(in[0], in[15]);
- out[1] = _mm_add_epi32(in[1], in[14]);
- out[2] = _mm_add_epi32(in[2], in[13]);
- out[3] = _mm_add_epi32(in[3], in[12]);
- out[4] = _mm_add_epi32(in[4], in[11]);
- out[5] = _mm_add_epi32(in[5], in[10]);
- out[6] = _mm_add_epi32(in[6], in[9]);
- out[7] = _mm_add_epi32(in[7], in[8]);
- out[8] = _mm_sub_epi32(in[7], in[8]);
- out[9] = _mm_sub_epi32(in[6], in[9]);
- out[10] = _mm_sub_epi32(in[5], in[10]);
- out[11] = _mm_sub_epi32(in[4], in[11]);
- out[12] = _mm_sub_epi32(in[3], in[12]);
- out[13] = _mm_sub_epi32(in[2], in[13]);
- out[14] = _mm_sub_epi32(in[1], in[14]);
- out[15] = _mm_sub_epi32(in[0], in[15]);
-}
-
static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
__m128i step1[16], step2[16];
__m128i temp1[4], temp2, sign[2];
@@ -314,14 +257,14 @@
input += 128;
}
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < 16; i += 8) {
int j;
- transpose_16bit_8x8(l + i * 8, out);
- transpose_16bit_8x8(r + i * 8, out + 8);
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
idct16_8col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_8x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
}
dest += 8;
}
@@ -354,32 +297,32 @@
input += 4 * 16;
}
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < 16; i += 4) {
int j;
- out[0] = all[0][4 * i + 0];
- out[1] = all[1][4 * i + 0];
- out[2] = all[0][4 * i + 1];
- out[3] = all[1][4 * i + 1];
- out[4] = all[0][4 * i + 2];
- out[5] = all[1][4 * i + 2];
- out[6] = all[0][4 * i + 3];
- out[7] = all[1][4 * i + 3];
+ out[0] = all[0][i + 0];
+ out[1] = all[1][i + 0];
+ out[2] = all[0][i + 1];
+ out[3] = all[1][i + 1];
+ out[4] = all[0][i + 2];
+ out[5] = all[1][i + 2];
+ out[6] = all[0][i + 3];
+ out[7] = all[1][i + 3];
transpose_32bit_8x4(out, out);
- out[8] = all[2][4 * i + 0];
- out[9] = all[3][4 * i + 0];
- out[10] = all[2][4 * i + 1];
- out[11] = all[3][4 * i + 1];
- out[12] = all[2][4 * i + 2];
- out[13] = all[3][4 * i + 2];
- out[14] = all[2][4 * i + 3];
- out[15] = all[3][4 * i + 3];
+ out[8] = all[2][i + 0];
+ out[9] = all[3][i + 0];
+ out[10] = all[2][i + 1];
+ out[11] = all[3][i + 1];
+ out[12] = all[2][i + 2];
+ out[13] = all[3][i + 2];
+ out[14] = all[2][i + 3];
+ out[15] = all[3][i + 3];
transpose_32bit_8x4(out + 8, out + 8);
highbd_idct16_4col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
}
dest += 4;
}
@@ -414,9 +357,9 @@
in[15] = _mm_setzero_si128();
idct16_8col(in);
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < 16; i += 8) {
int j;
- transpose_16bit_8x8(in + i * 8, out);
+ transpose_16bit_8x8(in + i, out);
out[8] = _mm_setzero_si128();
out[9] = _mm_setzero_si128();
out[10] = _mm_setzero_si128();
@@ -428,7 +371,7 @@
idct16_8col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_8x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
}
dest += 8;
}
@@ -450,21 +393,21 @@
input += 4 * 16;
}
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < 16; i += 4) {
int j;
- out[0] = all[0][4 * i + 0];
- out[1] = all[1][4 * i + 0];
- out[2] = all[0][4 * i + 1];
- out[3] = all[1][4 * i + 1];
- out[4] = all[0][4 * i + 2];
- out[5] = all[1][4 * i + 2];
- out[6] = all[0][4 * i + 3];
- out[7] = all[1][4 * i + 3];
+ out[0] = all[0][i + 0];
+ out[1] = all[1][i + 0];
+ out[2] = all[0][i + 1];
+ out[3] = all[1][i + 1];
+ out[4] = all[0][i + 2];
+ out[5] = all[1][i + 2];
+ out[6] = all[0][i + 3];
+ out[7] = all[1][i + 3];
transpose_32bit_8x4(out, out);
highbd_idct16x16_38_4col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
}
dest += 4;
}
@@ -486,12 +429,12 @@
idct16x16_10_pass1(in, l);
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < 16; i += 8) {
int j;
- idct16x16_10_pass2(l + 8 * i, in);
+ idct16x16_10_pass2(l + i, in);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_8x1(dest + j * stride, in[j], bd);
+ highbd_write_buffer_8(dest + j * stride, in[j], bd);
}
dest += 8;
}
@@ -509,13 +452,13 @@
input += 4 * 16;
}
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < 16; i += 4) {
int j;
- transpose_32bit_4x4(&all[0][4 * i], out);
+ transpose_32bit_4x4(&all[0][i], out);
highbd_idct16x16_10_4col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
}
dest += 4;
}
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -143,7 +143,7 @@
io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
}
- recon_and_store_4(io, dest, stride, bd);
+ recon_and_store_4x4(io, dest, stride, bd);
}
void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
@@ -65,5 +65,5 @@
io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
}
- recon_and_store_4(io, dest, stride, bd);
+ recon_and_store_4x4(io, dest, stride, bd);
}
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -176,7 +176,7 @@
highbd_idct8x8_final_round(io);
}
- recon_and_store_8(io, dest, stride, bd);
+ recon_and_store_8x8(io, dest, stride, bd);
}
void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
@@ -219,7 +219,7 @@
highbd_idct8x8_final_round(io);
}
- recon_and_store_8(io, dest, stride, bd);
+ recon_and_store_8x8(io, dest, stride, bd);
}
void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest,
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
@@ -178,7 +178,7 @@
highbd_idct8x8_final_round(io);
}
- recon_and_store_8(io, dest, stride, bd);
+ recon_and_store_8x8(io, dest, stride, bd);
}
void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest,
@@ -221,5 +221,5 @@
highbd_idct8x8_final_round(io);
}
- recon_and_store_8(io, dest, stride, bd);
+ recon_and_store_8x8(io, dest, stride, bd);
}
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -184,6 +184,27 @@
io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
}
+
+static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[15]);
+ out[1] = _mm_add_epi32(in[1], in[14]);
+ out[2] = _mm_add_epi32(in[2], in[13]);
+ out[3] = _mm_add_epi32(in[3], in[12]);
+ out[4] = _mm_add_epi32(in[4], in[11]);
+ out[5] = _mm_add_epi32(in[5], in[10]);
+ out[6] = _mm_add_epi32(in[6], in[9]);
+ out[7] = _mm_add_epi32(in[7], in[8]);
+ out[8] = _mm_sub_epi32(in[7], in[8]);
+ out[9] = _mm_sub_epi32(in[6], in[9]);
+ out[10] = _mm_sub_epi32(in[5], in[10]);
+ out[11] = _mm_sub_epi32(in[4], in[11]);
+ out[12] = _mm_sub_epi32(in[3], in[12]);
+ out[13] = _mm_sub_epi32(in[2], in[13]);
+ out[14] = _mm_sub_epi32(in[1], in[14]);
+ out[15] = _mm_sub_epi32(in[0], in[15]);
+}
+
static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
const int bd) {
const __m128i zero = _mm_set1_epi16(0);
@@ -221,11 +242,19 @@
}
}
-static INLINE void recon_and_store_4_dual(const __m128i in,
- uint16_t *const dest,
- const int stride, const int bd) {
+static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest,
+ const int bd) {
__m128i d;
+ d = _mm_loadl_epi64((const __m128i *)dest);
+ d = add_clamp(d, in, bd);
+ _mm_storel_epi64((__m128i *)dest, d);
+}
+
+static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest,
+ const int stride, const int bd) {
+ __m128i d;
+
d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
d = _mm_castps_si128(
_mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride)));
@@ -234,16 +263,15 @@
_mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d));
}
-static INLINE void recon_and_store_4(const __m128i *const in, uint16_t *dest,
- const int stride, const int bd) {
- recon_and_store_4_dual(in[0], dest, stride, bd);
+static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest,
+ const int stride, const int bd) {
+ recon_and_store_4x2(in[0], dest, stride, bd);
dest += 2 * stride;
- recon_and_store_4_dual(in[1], dest, stride, bd);
+ recon_and_store_4x2(in[1], dest, stride, bd);
}
-static INLINE void recon_and_store_8_kernel(const __m128i in,
- uint16_t **const dest,
- const int stride, const int bd) {
+static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest,
+ const int stride, const int bd) {
__m128i d;
d = _mm_load_si128((const __m128i *)(*dest));
@@ -252,16 +280,43 @@
*dest += stride;
}
-static INLINE void recon_and_store_8(const __m128i *const in, uint16_t *dest,
- const int stride, const int bd) {
- recon_and_store_8_kernel(in[0], &dest, stride, bd);
- recon_and_store_8_kernel(in[1], &dest, stride, bd);
- recon_and_store_8_kernel(in[2], &dest, stride, bd);
- recon_and_store_8_kernel(in[3], &dest, stride, bd);
- recon_and_store_8_kernel(in[4], &dest, stride, bd);
- recon_and_store_8_kernel(in[5], &dest, stride, bd);
- recon_and_store_8_kernel(in[6], &dest, stride, bd);
- recon_and_store_8_kernel(in[7], &dest, stride, bd);
+static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest,
+ const int stride, const int bd) {
+ recon_and_store_8(in[0], &dest, stride, bd);
+ recon_and_store_8(in[1], &dest, stride, bd);
+ recon_and_store_8(in[2], &dest, stride, bd);
+ recon_and_store_8(in[3], &dest, stride, bd);
+ recon_and_store_8(in[4], &dest, stride, bd);
+ recon_and_store_8(in[5], &dest, stride, bd);
+ recon_and_store_8(in[6], &dest, stride, bd);
+ recon_and_store_8(in[7], &dest, stride, bd);
+}
+
+static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
+ const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
+ const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
+ return _mm_packs_epi32(t0, t1);
+}
+
+static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in,
+ const int bd) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ __m128i out;
+
+ out = _mm_adds_epi16(in, final_rounding);
+ out = _mm_srai_epi16(out, 6);
+ recon_and_store_8(out, &dest, 0, bd);
+}
+
+static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in,
+ const int bd) {
+ const __m128i final_rounding = _mm_set1_epi32(1 << 5);
+ __m128i out;
+
+ out = _mm_add_epi32(in, final_rounding);
+ out = _mm_srai_epi32(out, 6);
+ out = _mm_packs_epi32(out, out);
+ recon_and_store_4(out, dest, bd);
}
#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_