ref: 39da7fb786eb05661db3ec866660772ae1583f2b
parent: 68805583e9544f14ebe72583bcc5b788f8da6c7f
author: Linfeng Zhang <linfengz@google.com>
date: Tue Aug 8 13:39:04 EDT 2017
Clean highbd idct x86 code with inline functions. Created inline functions highbd_butterfly_cospi16_sse2() and highbd_butterfly_cospi16_sse4_1(). BUG=webm:1412 Change-Id: Icbc53a73712b6207379872a5e88d0a4d09e2322a
--- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -18,18 +18,12 @@
static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
__m128i *const out) {
- __m128i temp1[2], temp2, sign[2];
// stage 5
out[0] = _mm_add_epi32(in[0], in[3]);
out[1] = _mm_add_epi32(in[1], in[2]);
out[2] = _mm_sub_epi32(in[1], in[2]);
out[3] = _mm_sub_epi32(in[0], in[3]);
- temp2 = _mm_sub_epi32(in[6], in[5]);
- abs_extend_64bit_sse2(temp2, temp1, sign);
- out[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
- temp2 = _mm_add_epi32(in[6], in[5]);
- abs_extend_64bit_sse2(temp2, temp1, sign);
- out[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+ highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]);
out[8] = _mm_add_epi32(in[8], in[11]);
out[9] = _mm_add_epi32(in[9], in[10]);
out[10] = _mm_sub_epi32(in[9], in[10]);
@@ -42,7 +36,6 @@
static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
__m128i *const out) {
- __m128i temp1[2], temp2, sign[2];
out[0] = _mm_add_epi32(in[0], in[7]);
out[1] = _mm_add_epi32(in[1], in[6]);
out[2] = _mm_add_epi32(in[2], in[5]);
@@ -53,19 +46,8 @@
out[7] = _mm_sub_epi32(in[0], in[7]);
out[8] = in[8];
out[9] = in[9];
- temp2 = _mm_sub_epi32(in[13], in[10]);
- abs_extend_64bit_sse2(temp2, temp1, sign);
- out[10] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
- temp2 = _mm_add_epi32(in[13], in[10]);
- abs_extend_64bit_sse2(temp2, temp1, sign);
- out[13] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
-
- temp2 = _mm_sub_epi32(in[12], in[11]);
- abs_extend_64bit_sse2(temp2, temp1, sign);
- out[11] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
- temp2 = _mm_add_epi32(in[12], in[11]);
- abs_extend_64bit_sse2(temp2, temp1, sign);
- out[12] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+ highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]);
+ highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]);
out[14] = in[14];
out[15] = in[15];
}
@@ -72,7 +54,6 @@
static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
__m128i step1[16], step2[16];
- __m128i temp1[4], temp2, sign[2];
// stage 2
highbd_butterfly_sse2(io[1], io[15], (int)cospi_30_64, (int)cospi_2_64,
@@ -99,12 +80,7 @@
step1[15] = _mm_add_epi32(step2[15], step2[14]);
// stage 4
- temp2 = _mm_add_epi32(io[0], io[8]);
- abs_extend_64bit_sse2(temp2, temp1, sign);
- step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
- temp2 = _mm_sub_epi32(io[0], io[8]);
- abs_extend_64bit_sse2(temp2, temp1, sign);
- step2[1] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+ highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]);
highbd_butterfly_sse2(io[4], io[12], (int)cospi_24_64, (int)cospi_8_64,
&step2[2], &step2[3]);
highbd_butterfly_sse2(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64,
--- a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
@@ -19,18 +19,12 @@
static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
__m128i *const out) {
- __m128i temp1[2], temp2;
// stage 5
out[0] = _mm_add_epi32(in[0], in[3]);
out[1] = _mm_add_epi32(in[1], in[2]);
out[2] = _mm_sub_epi32(in[1], in[2]);
out[3] = _mm_sub_epi32(in[0], in[3]);
- temp2 = _mm_sub_epi32(in[6], in[5]);
- extend_64bit(temp2, temp1);
- out[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
- temp2 = _mm_add_epi32(in[6], in[5]);
- extend_64bit(temp2, temp1);
- out[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+ highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]);
out[8] = _mm_add_epi32(in[8], in[11]);
out[9] = _mm_add_epi32(in[9], in[10]);
out[10] = _mm_sub_epi32(in[9], in[10]);
@@ -43,7 +37,6 @@
static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
__m128i *const out) {
- __m128i temp1[2], temp2;
out[0] = _mm_add_epi32(in[0], in[7]);
out[1] = _mm_add_epi32(in[1], in[6]);
out[2] = _mm_add_epi32(in[2], in[5]);
@@ -54,19 +47,8 @@
out[7] = _mm_sub_epi32(in[0], in[7]);
out[8] = in[8];
out[9] = in[9];
- temp2 = _mm_sub_epi32(in[13], in[10]);
- extend_64bit(temp2, temp1);
- out[10] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
- temp2 = _mm_add_epi32(in[13], in[10]);
- extend_64bit(temp2, temp1);
- out[13] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
-
- temp2 = _mm_sub_epi32(in[12], in[11]);
- extend_64bit(temp2, temp1);
- out[11] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
- temp2 = _mm_add_epi32(in[12], in[11]);
- extend_64bit(temp2, temp1);
- out[12] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+ highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]);
+ highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]);
out[14] = in[14];
out[15] = in[15];
}
@@ -73,7 +55,6 @@
static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
__m128i step1[16], step2[16];
- __m128i temp1[4], temp2;
// stage 2
highbd_butterfly_sse4_1(io[1], io[15], (int)cospi_30_64, (int)cospi_2_64,
@@ -92,26 +73,21 @@
&step1[5], &step1[6]);
step1[8] = _mm_add_epi32(step2[8], step2[9]);
step1[9] = _mm_sub_epi32(step2[8], step2[9]);
- step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
- step1[11] = _mm_add_epi32(step2[10], step2[11]);
- step1[12] = _mm_add_epi32(step2[13], step2[12]);
- step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
+ step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+ step1[11] = _mm_add_epi32(step2[11], step2[10]);
+ step1[12] = _mm_add_epi32(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi32(step2[12], step2[13]);
step1[14] = _mm_sub_epi32(step2[15], step2[14]);
step1[15] = _mm_add_epi32(step2[15], step2[14]);
// stage 4
- temp2 = _mm_add_epi32(io[0], io[8]);
- extend_64bit(temp2, temp1);
- step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
- temp2 = _mm_sub_epi32(io[0], io[8]);
- extend_64bit(temp2, temp1);
- step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+ highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]);
highbd_butterfly_sse4_1(io[4], io[12], (int)cospi_24_64, (int)cospi_8_64,
&step2[2], &step2[3]);
highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
(int)cospi_8_64, &step2[9], &step2[14]);
- highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
- (int)cospi_24_64, &step2[13], &step2[10]);
+ highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64,
+ -(int)cospi_24_64, &step2[13], &step2[10]);
step2[5] = _mm_sub_epi32(step1[4], step1[5]);
step1[4] = _mm_add_epi32(step1[4], step1[5]);
step2[6] = _mm_sub_epi32(step1[7], step1[6]);
@@ -147,10 +123,10 @@
&step1[5], &step1[6]);
step1[8] = _mm_add_epi32(step2[8], step2[9]);
step1[9] = _mm_sub_epi32(step2[8], step2[9]);
- step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
- step1[11] = _mm_add_epi32(step2[10], step2[11]);
- step1[12] = _mm_add_epi32(step2[13], step2[12]);
- step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13]
+ step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+ step1[11] = _mm_add_epi32(step2[11], step2[10]);
+ step1[12] = _mm_add_epi32(step2[12], step2[13]);
+ step1[13] = _mm_sub_epi32(step2[12], step2[13]);
step1[14] = _mm_sub_epi32(step2[15], step2[14]);
step1[15] = _mm_add_epi32(step2[15], step2[14]);
@@ -162,8 +138,8 @@
&step2[2], &step2[3]);
highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
(int)cospi_8_64, &step2[9], &step2[14]);
- highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
- (int)cospi_24_64, &step2[13], &step2[10]);
+ highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64,
+ -(int)cospi_24_64, &step2[13], &step2[10]);
step2[5] = _mm_sub_epi32(step1[4], step1[5]);
step1[4] = _mm_add_epi32(step1[4], step1[5]);
step2[6] = _mm_sub_epi32(step1[7], step1[6]);
@@ -193,12 +169,10 @@
&step1[4], &step1[7]);
step1[8] = step2[8];
step1[9] = step2[8];
- step1[10] =
- _mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10]
+ step1[10] = step2[11];
step1[11] = step2[11];
step1[12] = step2[12];
- step1[13] =
- _mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13]
+ step1[13] = step2[12];
step1[14] = step2[15];
step1[15] = step2[15];
@@ -210,8 +184,8 @@
step2[3] = _mm_setzero_si128();
highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
(int)cospi_8_64, &step2[9], &step2[14]);
- highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
- (int)cospi_24_64, &step2[13], &step2[10]);
+ highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64,
+ -(int)cospi_24_64, &step2[13], &step2[10]);
step2[5] = step1[4];
step2[6] = step1[7];
step2[8] = step1[8];
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -75,17 +75,12 @@
}
static INLINE void highbd_idct4_large_sse2(__m128i *const io) {
- __m128i temp[2], sign[2], step[4];
+ __m128i step[4];
transpose_32bit_4x4(io, io);
// stage 1
- temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2]
- abs_extend_64bit_sse2(temp[0], temp, sign);
- step[0] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64);
- temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2]
- abs_extend_64bit_sse2(temp[0], temp, sign);
- step[1] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64);
+ highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]);
highbd_butterfly_sse2(io[1], io[3], (int)cospi_24_64, (int)cospi_8_64,
&step[2], &step[3]);
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -16,7 +16,7 @@
#include "vpx_dsp/x86/transpose_sse2.h"
static void highbd_idct8x8_half1d(__m128i *const io) {
- __m128i temp1[4], temp2[4], sign[2], step1[8], step2[8];
+ __m128i step1[8], step2[8];
transpose_32bit_4x4x2(io, io);
@@ -31,12 +31,7 @@
&step1[5], &step1[6]);
// stage 2
- temp2[0] = _mm_add_epi32(step1[0], step1[2]);
- abs_extend_64bit_sse2(temp2[0], temp1, sign);
- step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
- temp2[0] = _mm_sub_epi32(step1[0], step1[2]);
- abs_extend_64bit_sse2(temp2[0], temp1, sign);
- step2[1] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+ highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]);
highbd_butterfly_sse2(step1[1], step1[3], (int)cospi_24_64, (int)cospi_8_64,
&step2[2], &step2[3]);
step2[4] = _mm_add_epi32(step1[4], step1[5]);
@@ -50,12 +45,7 @@
step1[2] = _mm_sub_epi32(step2[1], step2[2]);
step1[3] = _mm_sub_epi32(step2[0], step2[3]);
step1[4] = step2[4];
- temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
- abs_extend_64bit_sse2(temp2[0], temp1, sign);
- step1[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
- temp2[0] = _mm_add_epi32(step2[6], step2[5]);
- abs_extend_64bit_sse2(temp2[0], temp1, sign);
- step1[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+ highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
step1[7] = step2[7];
// stage 4
@@ -63,7 +53,7 @@
}
static void highbd_idct8x8_12_half1d(__m128i *const io) {
- __m128i temp1[4], temp2[4], sign[2], step1[8], step2[8];
+ __m128i temp1[4], sign[2], step1[8], step2[8];
transpose_32bit_4x4(io, io);
@@ -94,12 +84,7 @@
step1[2] = _mm_sub_epi32(step2[0], step2[2]);
step1[3] = _mm_sub_epi32(step2[0], step2[3]);
step1[4] = step2[4];
- temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
- abs_extend_64bit_sse2(temp2[0], temp1, sign);
- step1[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
- temp2[0] = _mm_add_epi32(step2[6], step2[5]);
- abs_extend_64bit_sse2(temp2[0], temp1, sign);
- step1[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+ highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
step1[7] = step2[7];
// stage 4
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
@@ -18,7 +18,7 @@
#include "vpx_dsp/x86/transpose_sse2.h"
static void highbd_idct8x8_half1d(__m128i *const io) {
- __m128i temp1[4], temp2[4], step1[8], step2[8];
+ __m128i step1[8], step2[8];
transpose_32bit_4x4x2(io, io);
@@ -33,12 +33,7 @@
&step1[5], &step1[6]);
// stage 2
- temp2[0] = _mm_add_epi32(step1[0], step1[2]);
- extend_64bit(temp2[0], temp1);
- step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
- temp2[0] = _mm_sub_epi32(step1[0], step1[2]);
- extend_64bit(temp2[0], temp1);
- step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+ highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]);
highbd_butterfly_sse4_1(step1[1], step1[3], (int)cospi_24_64, (int)cospi_8_64,
&step2[2], &step2[3]);
step2[4] = _mm_add_epi32(step1[4], step1[5]);
@@ -52,12 +47,7 @@
step1[2] = _mm_sub_epi32(step2[1], step2[2]);
step1[3] = _mm_sub_epi32(step2[0], step2[3]);
step1[4] = step2[4];
- temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
- extend_64bit(temp2[0], temp1);
- step1[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
- temp2[0] = _mm_add_epi32(step2[6], step2[5]);
- extend_64bit(temp2[0], temp1);
- step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+ highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
step1[7] = step2[7];
// stage 4
@@ -65,7 +55,7 @@
}
static void highbd_idct8x8_12_half1d(__m128i *const io) {
- __m128i temp1[4], temp2[4], step1[8], step2[8];
+ __m128i temp1[2], step1[8], step2[8];
transpose_32bit_4x4(io, io);
@@ -96,12 +86,7 @@
step1[2] = _mm_sub_epi32(step2[0], step2[2]);
step1[3] = _mm_sub_epi32(step2[0], step2[3]);
step1[4] = step2[4];
- temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
- extend_64bit(temp2[0], temp1);
- step1[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
- temp2[0] = _mm_add_epi32(step2[6], step2[5]);
- extend_64bit(temp2[0], temp1);
- step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+ highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
step1[7] = step2[7];
// stage 4
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -140,6 +140,20 @@
*out1 = pack_4(temp2[0], temp2[1]);
}
+static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0,
+ const __m128i in1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp1[2], temp2, sign[2];
+ // Sum -> *out0, difference -> *out1, both via cospi_16_64 multiply helper.
+ temp2 = _mm_add_epi32(in0, in1);  // in0 + in1 (32-bit lanes)
+ abs_extend_64bit_sse2(temp2, temp1, sign);  // presumably fills temp1/sign
+ *out0 = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+ temp2 = _mm_sub_epi32(in0, in1);  // in0 - in1 (32-bit lanes)
+ abs_extend_64bit_sse2(temp2, temp1, sign);  // temp1/sign reused for diff
+ *out1 = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+}
+
// Note: c0 and c1 must be non negative.
static INLINE void highbd_multiplication_sse2(const __m128i in, const int c0,
const int c1, __m128i *const out0,
--- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -59,6 +59,20 @@
*out1 = pack_4(temp2[0], temp2[1]);
}
+static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0,
+ const __m128i in1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp1[2], temp2;
+ // Sum -> *out0, difference -> *out1, both via cospi_16_64 multiply helper.
+ temp2 = _mm_add_epi32(in0, in1);  // in0 + in1 (32-bit lanes)
+ extend_64bit(temp2, temp1);  // presumably widens temp2 into temp1[0..1]
+ *out0 = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+ temp2 = _mm_sub_epi32(in0, in1);  // in0 - in1 (32-bit lanes)
+ extend_64bit(temp2, temp1);  // temp1 reused for the difference
+ *out1 = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+}
+
static INLINE void highbd_multiplication_sse4_1(const __m128i in, const int c0,
const int c1,
__m128i *const out0,