shithub: libvpx

Download patch

ref: 39da7fb786eb05661db3ec866660772ae1583f2b
parent: 68805583e9544f14ebe72583bcc5b788f8da6c7f
author: Linfeng Zhang <linfengz@google.com>
date: Tue Aug 8 13:39:04 EDT 2017

Clean highbd idct x86 code with inline functions

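Created inline functions highbd_butterfly_cospi16_sse2()
and highbd_butterfly_cospi16_sse4_1(), and replaced the open-coded
add/sub + cospi_16_64 multiplication sequences with calls to them.

In highbd_idct16x16_add_sse4.c, step1[10] and step1[13] are no longer
stored negated; the negated constants -cospi_8_64 and -cospi_24_64 are
passed to highbd_butterfly_sse4_1() instead.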

BUG=webm:1412

Change-Id: Icbc53a73712b6207379872a5e88d0a4d09e2322a

--- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -18,18 +18,12 @@
 
 static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
                                              __m128i *const out) {
-  __m128i temp1[2], temp2, sign[2];
   // stage 5
   out[0] = _mm_add_epi32(in[0], in[3]);
   out[1] = _mm_add_epi32(in[1], in[2]);
   out[2] = _mm_sub_epi32(in[1], in[2]);
   out[3] = _mm_sub_epi32(in[0], in[3]);
-  temp2 = _mm_sub_epi32(in[6], in[5]);
-  abs_extend_64bit_sse2(temp2, temp1, sign);
-  out[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
-  temp2 = _mm_add_epi32(in[6], in[5]);
-  abs_extend_64bit_sse2(temp2, temp1, sign);
-  out[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+  highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]);
   out[8] = _mm_add_epi32(in[8], in[11]);
   out[9] = _mm_add_epi32(in[9], in[10]);
   out[10] = _mm_sub_epi32(in[9], in[10]);
@@ -42,7 +36,6 @@
 
 static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
                                              __m128i *const out) {
-  __m128i temp1[2], temp2, sign[2];
   out[0] = _mm_add_epi32(in[0], in[7]);
   out[1] = _mm_add_epi32(in[1], in[6]);
   out[2] = _mm_add_epi32(in[2], in[5]);
@@ -53,19 +46,8 @@
   out[7] = _mm_sub_epi32(in[0], in[7]);
   out[8] = in[8];
   out[9] = in[9];
-  temp2 = _mm_sub_epi32(in[13], in[10]);
-  abs_extend_64bit_sse2(temp2, temp1, sign);
-  out[10] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
-  temp2 = _mm_add_epi32(in[13], in[10]);
-  abs_extend_64bit_sse2(temp2, temp1, sign);
-  out[13] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
-
-  temp2 = _mm_sub_epi32(in[12], in[11]);
-  abs_extend_64bit_sse2(temp2, temp1, sign);
-  out[11] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
-  temp2 = _mm_add_epi32(in[12], in[11]);
-  abs_extend_64bit_sse2(temp2, temp1, sign);
-  out[12] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+  highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]);
+  highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]);
   out[14] = in[14];
   out[15] = in[15];
 }
@@ -72,7 +54,6 @@
 
 static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
   __m128i step1[16], step2[16];
-  __m128i temp1[4], temp2, sign[2];
 
   // stage 2
   highbd_butterfly_sse2(io[1], io[15], (int)cospi_30_64, (int)cospi_2_64,
@@ -99,12 +80,7 @@
   step1[15] = _mm_add_epi32(step2[15], step2[14]);
 
   // stage 4
-  temp2 = _mm_add_epi32(io[0], io[8]);
-  abs_extend_64bit_sse2(temp2, temp1, sign);
-  step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
-  temp2 = _mm_sub_epi32(io[0], io[8]);
-  abs_extend_64bit_sse2(temp2, temp1, sign);
-  step2[1] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+  highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]);
   highbd_butterfly_sse2(io[4], io[12], (int)cospi_24_64, (int)cospi_8_64,
                         &step2[2], &step2[3]);
   highbd_butterfly_sse2(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64,
--- a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
@@ -19,18 +19,12 @@
 
 static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
                                              __m128i *const out) {
-  __m128i temp1[2], temp2;
   // stage 5
   out[0] = _mm_add_epi32(in[0], in[3]);
   out[1] = _mm_add_epi32(in[1], in[2]);
   out[2] = _mm_sub_epi32(in[1], in[2]);
   out[3] = _mm_sub_epi32(in[0], in[3]);
-  temp2 = _mm_sub_epi32(in[6], in[5]);
-  extend_64bit(temp2, temp1);
-  out[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
-  temp2 = _mm_add_epi32(in[6], in[5]);
-  extend_64bit(temp2, temp1);
-  out[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]);
   out[8] = _mm_add_epi32(in[8], in[11]);
   out[9] = _mm_add_epi32(in[9], in[10]);
   out[10] = _mm_sub_epi32(in[9], in[10]);
@@ -43,7 +37,6 @@
 
 static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
                                              __m128i *const out) {
-  __m128i temp1[2], temp2;
   out[0] = _mm_add_epi32(in[0], in[7]);
   out[1] = _mm_add_epi32(in[1], in[6]);
   out[2] = _mm_add_epi32(in[2], in[5]);
@@ -54,19 +47,8 @@
   out[7] = _mm_sub_epi32(in[0], in[7]);
   out[8] = in[8];
   out[9] = in[9];
-  temp2 = _mm_sub_epi32(in[13], in[10]);
-  extend_64bit(temp2, temp1);
-  out[10] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
-  temp2 = _mm_add_epi32(in[13], in[10]);
-  extend_64bit(temp2, temp1);
-  out[13] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
-
-  temp2 = _mm_sub_epi32(in[12], in[11]);
-  extend_64bit(temp2, temp1);
-  out[11] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
-  temp2 = _mm_add_epi32(in[12], in[11]);
-  extend_64bit(temp2, temp1);
-  out[12] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]);
+  highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]);
   out[14] = in[14];
   out[15] = in[15];
 }
@@ -73,7 +55,6 @@
 
 static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
   __m128i step1[16], step2[16];
-  __m128i temp1[4], temp2;
 
   // stage 2
   highbd_butterfly_sse4_1(io[1], io[15], (int)cospi_30_64, (int)cospi_2_64,
@@ -92,26 +73,21 @@
                           &step1[5], &step1[6]);
   step1[8] = _mm_add_epi32(step2[8], step2[9]);
   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
-  step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
-  step1[11] = _mm_add_epi32(step2[10], step2[11]);
-  step1[12] = _mm_add_epi32(step2[13], step2[12]);
-  step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
+  step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+  step1[11] = _mm_add_epi32(step2[11], step2[10]);
+  step1[12] = _mm_add_epi32(step2[12], step2[13]);
+  step1[13] = _mm_sub_epi32(step2[12], step2[13]);
   step1[14] = _mm_sub_epi32(step2[15], step2[14]);
   step1[15] = _mm_add_epi32(step2[15], step2[14]);
 
   // stage 4
-  temp2 = _mm_add_epi32(io[0], io[8]);
-  extend_64bit(temp2, temp1);
-  step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
-  temp2 = _mm_sub_epi32(io[0], io[8]);
-  extend_64bit(temp2, temp1);
-  step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]);
   highbd_butterfly_sse4_1(io[4], io[12], (int)cospi_24_64, (int)cospi_8_64,
                           &step2[2], &step2[3]);
   highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
                           (int)cospi_8_64, &step2[9], &step2[14]);
-  highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
-                          (int)cospi_24_64, &step2[13], &step2[10]);
+  highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64,
+                          -(int)cospi_24_64, &step2[13], &step2[10]);
   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
   step1[4] = _mm_add_epi32(step1[4], step1[5]);
   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
@@ -147,10 +123,10 @@
                                &step1[5], &step1[6]);
   step1[8] = _mm_add_epi32(step2[8], step2[9]);
   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
-  step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
-  step1[11] = _mm_add_epi32(step2[10], step2[11]);
-  step1[12] = _mm_add_epi32(step2[13], step2[12]);
-  step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
+  step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+  step1[11] = _mm_add_epi32(step2[11], step2[10]);
+  step1[12] = _mm_add_epi32(step2[12], step2[13]);
+  step1[13] = _mm_sub_epi32(step2[12], step2[13]);
   step1[14] = _mm_sub_epi32(step2[15], step2[14]);
   step1[15] = _mm_add_epi32(step2[15], step2[14]);
 
@@ -162,8 +138,8 @@
                                &step2[2], &step2[3]);
   highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
                           (int)cospi_8_64, &step2[9], &step2[14]);
-  highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
-                          (int)cospi_24_64, &step2[13], &step2[10]);
+  highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64,
+                          -(int)cospi_24_64, &step2[13], &step2[10]);
   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
   step1[4] = _mm_add_epi32(step1[4], step1[5]);
   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
@@ -193,12 +169,10 @@
                                &step1[4], &step1[7]);
   step1[8] = step2[8];
   step1[9] = step2[8];
-  step1[10] =
-      _mm_sub_epi32(_mm_setzero_si128(), step2[11]);  // step1[10] = -step1[10]
+  step1[10] = step2[11];
   step1[11] = step2[11];
   step1[12] = step2[12];
-  step1[13] =
-      _mm_sub_epi32(_mm_setzero_si128(), step2[12]);  // step1[13] = -step1[13]
+  step1[13] = step2[12];
   step1[14] = step2[15];
   step1[15] = step2[15];
 
@@ -210,8 +184,8 @@
   step2[3] = _mm_setzero_si128();
   highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
                           (int)cospi_8_64, &step2[9], &step2[14]);
-  highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
-                          (int)cospi_24_64, &step2[13], &step2[10]);
+  highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64,
+                          -(int)cospi_24_64, &step2[13], &step2[10]);
   step2[5] = step1[4];
   step2[6] = step1[7];
   step2[8] = step1[8];
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -75,17 +75,12 @@
 }
 
 static INLINE void highbd_idct4_large_sse2(__m128i *const io) {
-  __m128i temp[2], sign[2], step[4];
+  __m128i step[4];
 
   transpose_32bit_4x4(io, io);
 
   // stage 1
-  temp[0] = _mm_add_epi32(io[0], io[2]);  // input[0] + input[2]
-  abs_extend_64bit_sse2(temp[0], temp, sign);
-  step[0] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64);
-  temp[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
-  abs_extend_64bit_sse2(temp[0], temp, sign);
-  step[1] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64);
+  highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]);
   highbd_butterfly_sse2(io[1], io[3], (int)cospi_24_64, (int)cospi_8_64,
                         &step[2], &step[3]);
 
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -16,7 +16,7 @@
 #include "vpx_dsp/x86/transpose_sse2.h"
 
 static void highbd_idct8x8_half1d(__m128i *const io) {
-  __m128i temp1[4], temp2[4], sign[2], step1[8], step2[8];
+  __m128i step1[8], step2[8];
 
   transpose_32bit_4x4x2(io, io);
 
@@ -31,12 +31,7 @@
                         &step1[5], &step1[6]);
 
   // stage 2
-  temp2[0] = _mm_add_epi32(step1[0], step1[2]);
-  abs_extend_64bit_sse2(temp2[0], temp1, sign);
-  step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
-  temp2[0] = _mm_sub_epi32(step1[0], step1[2]);
-  abs_extend_64bit_sse2(temp2[0], temp1, sign);
-  step2[1] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+  highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]);
   highbd_butterfly_sse2(step1[1], step1[3], (int)cospi_24_64, (int)cospi_8_64,
                         &step2[2], &step2[3]);
   step2[4] = _mm_add_epi32(step1[4], step1[5]);
@@ -50,12 +45,7 @@
   step1[2] = _mm_sub_epi32(step2[1], step2[2]);
   step1[3] = _mm_sub_epi32(step2[0], step2[3]);
   step1[4] = step2[4];
-  temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
-  abs_extend_64bit_sse2(temp2[0], temp1, sign);
-  step1[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
-  temp2[0] = _mm_add_epi32(step2[6], step2[5]);
-  abs_extend_64bit_sse2(temp2[0], temp1, sign);
-  step1[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+  highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
   step1[7] = step2[7];
 
   // stage 4
@@ -63,7 +53,7 @@
 }
 
 static void highbd_idct8x8_12_half1d(__m128i *const io) {
-  __m128i temp1[4], temp2[4], sign[2], step1[8], step2[8];
+  __m128i temp1[4], sign[2], step1[8], step2[8];
 
   transpose_32bit_4x4(io, io);
 
@@ -94,12 +84,7 @@
   step1[2] = _mm_sub_epi32(step2[0], step2[2]);
   step1[3] = _mm_sub_epi32(step2[0], step2[3]);
   step1[4] = step2[4];
-  temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
-  abs_extend_64bit_sse2(temp2[0], temp1, sign);
-  step1[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
-  temp2[0] = _mm_add_epi32(step2[6], step2[5]);
-  abs_extend_64bit_sse2(temp2[0], temp1, sign);
-  step1[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+  highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
   step1[7] = step2[7];
 
   // stage 4
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
@@ -18,7 +18,7 @@
 #include "vpx_dsp/x86/transpose_sse2.h"
 
 static void highbd_idct8x8_half1d(__m128i *const io) {
-  __m128i temp1[4], temp2[4], step1[8], step2[8];
+  __m128i step1[8], step2[8];
 
   transpose_32bit_4x4x2(io, io);
 
@@ -33,12 +33,7 @@
                           &step1[5], &step1[6]);
 
   // stage 2
-  temp2[0] = _mm_add_epi32(step1[0], step1[2]);
-  extend_64bit(temp2[0], temp1);
-  step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
-  temp2[0] = _mm_sub_epi32(step1[0], step1[2]);
-  extend_64bit(temp2[0], temp1);
-  step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]);
   highbd_butterfly_sse4_1(step1[1], step1[3], (int)cospi_24_64, (int)cospi_8_64,
                           &step2[2], &step2[3]);
   step2[4] = _mm_add_epi32(step1[4], step1[5]);
@@ -52,12 +47,7 @@
   step1[2] = _mm_sub_epi32(step2[1], step2[2]);
   step1[3] = _mm_sub_epi32(step2[0], step2[3]);
   step1[4] = step2[4];
-  temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
-  extend_64bit(temp2[0], temp1);
-  step1[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
-  temp2[0] = _mm_add_epi32(step2[6], step2[5]);
-  extend_64bit(temp2[0], temp1);
-  step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
   step1[7] = step2[7];
 
   // stage 4
@@ -65,7 +55,7 @@
 }
 
 static void highbd_idct8x8_12_half1d(__m128i *const io) {
-  __m128i temp1[4], temp2[4], step1[8], step2[8];
+  __m128i temp1[2], step1[8], step2[8];
 
   transpose_32bit_4x4(io, io);
 
@@ -96,12 +86,7 @@
   step1[2] = _mm_sub_epi32(step2[0], step2[2]);
   step1[3] = _mm_sub_epi32(step2[0], step2[3]);
   step1[4] = step2[4];
-  temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
-  extend_64bit(temp2[0], temp1);
-  step1[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
-  temp2[0] = _mm_add_epi32(step2[6], step2[5]);
-  extend_64bit(temp2[0], temp1);
-  step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
   step1[7] = step2[7];
 
   // stage 4
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -140,6 +140,20 @@
   *out1 = pack_4(temp2[0], temp2[1]);
 }
 
+static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0,
+                                                 const __m128i in1,
+                                                 __m128i *const out0,
+                                                 __m128i *const out1) {
+  __m128i temp1[2], temp2, sign[2];  // scratch passed to the helpers below
+  // *out0 is built from (in0 + in1), *out1 from (in0 - in1); both use cospi_16_64.
+  temp2 = _mm_add_epi32(in0, in1);  // per-lane 32-bit sum
+  abs_extend_64bit_sse2(temp2, temp1, sign);  // fills temp1[] and sign[] from temp2; presumably |x| widened to 64 bits plus sign masks -- confirm in helper
+  *out0 = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);  // helper defined elsewhere; by its name: multiply, round, shift
+  temp2 = _mm_sub_epi32(in0, in1);  // per-lane 32-bit difference; temp2 is reused
+  abs_extend_64bit_sse2(temp2, temp1, sign);  // overwrites temp1[] and sign[] for the difference
+  *out1 = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);  // same constant as for *out0
+}
+
 // Note: c0 and c1 must be non negative.
 static INLINE void highbd_multiplication_sse2(const __m128i in, const int c0,
                                               const int c1, __m128i *const out0,
--- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -59,6 +59,20 @@
   *out1 = pack_4(temp2[0], temp2[1]);
 }
 
+static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0,
+                                                   const __m128i in1,
+                                                   __m128i *const out0,
+                                                   __m128i *const out1) {
+  __m128i temp1[2], temp2;  // scratch passed to the helpers below
+  // *out0 is built from (in0 + in1), *out1 from (in0 - in1); both use cospi_16_64.
+  temp2 = _mm_add_epi32(in0, in1);  // per-lane 32-bit sum
+  extend_64bit(temp2, temp1);  // fills temp1[] from temp2; presumably a widening to 64-bit lanes -- confirm in helper
+  *out0 = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);  // helper defined elsewhere; by its name: multiply, round, shift
+  temp2 = _mm_sub_epi32(in0, in1);  // per-lane 32-bit difference; temp2 is reused
+  extend_64bit(temp2, temp1);  // overwrites temp1[] for the difference
+  *out1 = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);  // same constant as for *out0
+}
+
 static INLINE void highbd_multiplication_sse4_1(const __m128i in, const int c0,
                                                 const int c1,
                                                 __m128i *const out0,