shithub: libvpx

Download patch

ref: d670678f26da66a0a903a41bc789b6f996eb6c33
parent: fa829e0e5aa3e43d14ecea5f327b6a2e27818cf8
author: Linfeng Zhang <linfengz@google.com>
date: Thu Aug 3 13:50:03 EDT 2017

Rename highbd_multiplication_and_add_xx() to highbd_butterfly_xx()

This applies to the x86 (SSE2 and SSE4.1) high-bitdepth IDCT code.

Change-Id: I5159499a73a5c1b680516f6ca9c3d84f00c35083

--- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -75,20 +75,20 @@
   __m128i temp1[4], temp2, sign[2];
 
   // stage 2
-  highbd_multiplication_and_add_sse2(io[1], io[15], (int)cospi_30_64,
-                                     (int)cospi_2_64, &step2[8], &step2[15]);
-  highbd_multiplication_and_add_sse2(io[9], io[7], (int)cospi_14_64,
-                                     (int)cospi_18_64, &step2[9], &step2[14]);
-  highbd_multiplication_and_add_sse2(io[5], io[11], (int)cospi_22_64,
-                                     (int)cospi_10_64, &step2[10], &step2[13]);
-  highbd_multiplication_and_add_sse2(io[13], io[3], (int)cospi_6_64,
-                                     (int)cospi_26_64, &step2[11], &step2[12]);
+  highbd_butterfly_sse2(io[1], io[15], (int)cospi_30_64, (int)cospi_2_64,
+                        &step2[8], &step2[15]);
+  highbd_butterfly_sse2(io[9], io[7], (int)cospi_14_64, (int)cospi_18_64,
+                        &step2[9], &step2[14]);
+  highbd_butterfly_sse2(io[5], io[11], (int)cospi_22_64, (int)cospi_10_64,
+                        &step2[10], &step2[13]);
+  highbd_butterfly_sse2(io[13], io[3], (int)cospi_6_64, (int)cospi_26_64,
+                        &step2[11], &step2[12]);
 
   // stage 3
-  highbd_multiplication_and_add_sse2(io[2], io[14], (int)cospi_28_64,
-                                     (int)cospi_4_64, &step1[4], &step1[7]);
-  highbd_multiplication_and_add_sse2(io[10], io[6], (int)cospi_12_64,
-                                     (int)cospi_20_64, &step1[5], &step1[6]);
+  highbd_butterfly_sse2(io[2], io[14], (int)cospi_28_64, (int)cospi_4_64,
+                        &step1[4], &step1[7]);
+  highbd_butterfly_sse2(io[10], io[6], (int)cospi_12_64, (int)cospi_20_64,
+                        &step1[5], &step1[6]);
   step1[8] = _mm_add_epi32(step2[8], step2[9]);
   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
   step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
@@ -105,12 +105,12 @@
   temp2 = _mm_sub_epi32(io[0], io[8]);
   abs_extend_64bit_sse2(temp2, temp1, sign);
   step2[1] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
-  highbd_multiplication_and_add_sse2(io[4], io[12], (int)cospi_24_64,
-                                     (int)cospi_8_64, &step2[2], &step2[3]);
-  highbd_multiplication_and_add_sse2(step1[14], step1[9], (int)cospi_24_64,
-                                     (int)cospi_8_64, &step2[9], &step2[14]);
-  highbd_multiplication_and_add_sse2(step1[10], step1[13], (int)cospi_8_64,
-                                     (int)cospi_24_64, &step2[13], &step2[10]);
+  highbd_butterfly_sse2(io[4], io[12], (int)cospi_24_64, (int)cospi_8_64,
+                        &step2[2], &step2[3]);
+  highbd_butterfly_sse2(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64,
+                        &step2[9], &step2[14]);
+  highbd_butterfly_sse2(step1[10], step1[13], (int)cospi_8_64, (int)cospi_24_64,
+                        &step2[13], &step2[10]);
   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
   step1[4] = _mm_add_epi32(step1[4], step1[5]);
   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
@@ -159,10 +159,10 @@
   step2[1] = step2[0];
   highbd_multiplication_sse2(io[4], (int)cospi_24_64, (int)cospi_8_64,
                              &step2[2], &step2[3]);
-  highbd_multiplication_and_add_sse2(step1[14], step1[9], (int)cospi_24_64,
-                                     (int)cospi_8_64, &step2[9], &step2[14]);
-  highbd_multiplication_and_add_sse2(step1[10], step1[13], (int)cospi_8_64,
-                                     (int)cospi_24_64, &step2[13], &step2[10]);
+  highbd_butterfly_sse2(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64,
+                        &step2[9], &step2[14]);
+  highbd_butterfly_sse2(step1[10], step1[13], (int)cospi_8_64, (int)cospi_24_64,
+                        &step2[13], &step2[10]);
   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
   step1[4] = _mm_add_epi32(step1[4], step1[5]);
   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
@@ -207,10 +207,10 @@
   step2[1] = step2[0];
   step2[2] = _mm_setzero_si128();
   step2[3] = _mm_setzero_si128();
-  highbd_multiplication_and_add_sse2(step1[14], step1[9], (int)cospi_24_64,
-                                     (int)cospi_8_64, &step2[9], &step2[14]);
-  highbd_multiplication_and_add_sse2(step1[10], step1[13], (int)cospi_8_64,
-                                     (int)cospi_24_64, &step2[13], &step2[10]);
+  highbd_butterfly_sse2(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64,
+                        &step2[9], &step2[14]);
+  highbd_butterfly_sse2(step1[10], step1[13], (int)cospi_8_64, (int)cospi_24_64,
+                        &step2[13], &step2[10]);
   step2[5] = step1[4];
   step2[6] = step1[7];
   step2[8] = step1[8];
--- a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
@@ -76,21 +76,20 @@
   __m128i temp1[4], temp2;
 
   // stage 2
-  highbd_multiplication_and_add_sse4_1(io[1], io[15], (int)cospi_30_64,
-                                       (int)cospi_2_64, &step2[8], &step2[15]);
-  highbd_multiplication_and_add_sse4_1(io[9], io[7], (int)cospi_14_64,
-                                       (int)cospi_18_64, &step2[9], &step2[14]);
-  highbd_multiplication_and_add_sse4_1(io[5], io[11], (int)cospi_22_64,
-                                       (int)cospi_10_64, &step2[10],
-                                       &step2[13]);
-  highbd_multiplication_and_add_sse4_1(
-      io[13], io[3], (int)cospi_6_64, (int)cospi_26_64, &step2[11], &step2[12]);
+  highbd_butterfly_sse4_1(io[1], io[15], (int)cospi_30_64, (int)cospi_2_64,
+                          &step2[8], &step2[15]);
+  highbd_butterfly_sse4_1(io[9], io[7], (int)cospi_14_64, (int)cospi_18_64,
+                          &step2[9], &step2[14]);
+  highbd_butterfly_sse4_1(io[5], io[11], (int)cospi_22_64, (int)cospi_10_64,
+                          &step2[10], &step2[13]);
+  highbd_butterfly_sse4_1(io[13], io[3], (int)cospi_6_64, (int)cospi_26_64,
+                          &step2[11], &step2[12]);
 
   // stage 3
-  highbd_multiplication_and_add_sse4_1(io[2], io[14], (int)cospi_28_64,
-                                       (int)cospi_4_64, &step1[4], &step1[7]);
-  highbd_multiplication_and_add_sse4_1(io[10], io[6], (int)cospi_12_64,
-                                       (int)cospi_20_64, &step1[5], &step1[6]);
+  highbd_butterfly_sse4_1(io[2], io[14], (int)cospi_28_64, (int)cospi_4_64,
+                          &step1[4], &step1[7]);
+  highbd_butterfly_sse4_1(io[10], io[6], (int)cospi_12_64, (int)cospi_20_64,
+                          &step1[5], &step1[6]);
   step1[8] = _mm_add_epi32(step2[8], step2[9]);
   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
   step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
@@ -107,13 +106,12 @@
   temp2 = _mm_sub_epi32(io[0], io[8]);
   extend_64bit(temp2, temp1);
   step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
-  highbd_multiplication_and_add_sse4_1(io[4], io[12], (int)cospi_24_64,
-                                       (int)cospi_8_64, &step2[2], &step2[3]);
-  highbd_multiplication_and_add_sse4_1(step1[14], step1[9], (int)cospi_24_64,
-                                       (int)cospi_8_64, &step2[9], &step2[14]);
-  highbd_multiplication_and_add_sse4_1(step1[10], step1[13], (int)cospi_8_64,
-                                       (int)cospi_24_64, &step2[13],
-                                       &step2[10]);
+  highbd_butterfly_sse4_1(io[4], io[12], (int)cospi_24_64, (int)cospi_8_64,
+                          &step2[2], &step2[3]);
+  highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
+                          (int)cospi_8_64, &step2[9], &step2[14]);
+  highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
+                          (int)cospi_24_64, &step2[13], &step2[10]);
   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
   step1[4] = _mm_add_epi32(step1[4], step1[5]);
   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
@@ -162,11 +160,10 @@
   step2[1] = step2[0];
   highbd_multiplication_sse4_1(io[4], (int)cospi_24_64, (int)cospi_8_64,
                                &step2[2], &step2[3]);
-  highbd_multiplication_and_add_sse4_1(step1[14], step1[9], (int)cospi_24_64,
-                                       (int)cospi_8_64, &step2[9], &step2[14]);
-  highbd_multiplication_and_add_sse4_1(step1[10], step1[13], (int)cospi_8_64,
-                                       (int)cospi_24_64, &step2[13],
-                                       &step2[10]);
+  highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
+                          (int)cospi_8_64, &step2[9], &step2[14]);
+  highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
+                          (int)cospi_24_64, &step2[13], &step2[10]);
   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
   step1[4] = _mm_add_epi32(step1[4], step1[5]);
   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
@@ -211,11 +208,10 @@
   step2[1] = step2[0];
   step2[2] = _mm_setzero_si128();
   step2[3] = _mm_setzero_si128();
-  highbd_multiplication_and_add_sse4_1(step1[14], step1[9], (int)cospi_24_64,
-                                       (int)cospi_8_64, &step2[9], &step2[14]);
-  highbd_multiplication_and_add_sse4_1(step1[10], step1[13], (int)cospi_8_64,
-                                       (int)cospi_24_64, &step2[13],
-                                       &step2[10]);
+  highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
+                          (int)cospi_8_64, &step2[9], &step2[14]);
+  highbd_butterfly_sse4_1(step1[10], step1[13], (int)cospi_8_64,
+                          (int)cospi_24_64, &step2[13], &step2[10]);
   step2[5] = step1[4];
   step2[6] = step1[7];
   step2[8] = step1[8];
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -86,8 +86,8 @@
   temp[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
   abs_extend_64bit_sse2(temp[0], temp, sign);
   step[1] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64);
-  highbd_multiplication_and_add_sse2(io[1], io[3], (int)cospi_24_64,
-                                     (int)cospi_8_64, &step[2], &step[3]);
+  highbd_butterfly_sse2(io[1], io[3], (int)cospi_24_64, (int)cospi_8_64,
+                        &step[2], &step[3]);
 
   // stage 2
   io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
@@ -28,8 +28,8 @@
   temp[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
   extend_64bit(temp[0], temp);
   step[1] = multiplication_round_shift_sse4_1(temp, (int)cospi_16_64);
-  highbd_multiplication_and_add_sse4_1(io[1], io[3], (int)cospi_24_64,
-                                       (int)cospi_8_64, &step[2], &step[3]);
+  highbd_butterfly_sse4_1(io[1], io[3], (int)cospi_24_64, (int)cospi_8_64,
+                          &step[2], &step[3]);
 
   // stage 2
   io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -25,10 +25,10 @@
   step1[2] = io[4];
   step1[1] = io[2];
   step1[3] = io[6];
-  highbd_multiplication_and_add_sse2(io[1], io[7], (int)cospi_28_64,
-                                     (int)cospi_4_64, &step1[4], &step1[7]);
-  highbd_multiplication_and_add_sse2(io[5], io[3], (int)cospi_12_64,
-                                     (int)cospi_20_64, &step1[5], &step1[6]);
+  highbd_butterfly_sse2(io[1], io[7], (int)cospi_28_64, (int)cospi_4_64,
+                        &step1[4], &step1[7]);
+  highbd_butterfly_sse2(io[5], io[3], (int)cospi_12_64, (int)cospi_20_64,
+                        &step1[5], &step1[6]);
 
   // stage 2
   temp2[0] = _mm_add_epi32(step1[0], step1[2]);
@@ -37,8 +37,8 @@
   temp2[0] = _mm_sub_epi32(step1[0], step1[2]);
   abs_extend_64bit_sse2(temp2[0], temp1, sign);
   step2[1] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
-  highbd_multiplication_and_add_sse2(step1[1], step1[3], (int)cospi_24_64,
-                                     (int)cospi_8_64, &step2[2], &step2[3]);
+  highbd_butterfly_sse2(step1[1], step1[3], (int)cospi_24_64, (int)cospi_8_64,
+                        &step2[2], &step2[3]);
   step2[4] = _mm_add_epi32(step1[4], step1[5]);
   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
@@ -27,10 +27,10 @@
   step1[2] = io[4];
   step1[1] = io[2];
   step1[3] = io[6];
-  highbd_multiplication_and_add_sse4_1(io[1], io[7], (int)cospi_28_64,
-                                       (int)cospi_4_64, &step1[4], &step1[7]);
-  highbd_multiplication_and_add_sse4_1(io[5], io[3], (int)cospi_12_64,
-                                       (int)cospi_20_64, &step1[5], &step1[6]);
+  highbd_butterfly_sse4_1(io[1], io[7], (int)cospi_28_64, (int)cospi_4_64,
+                          &step1[4], &step1[7]);
+  highbd_butterfly_sse4_1(io[5], io[3], (int)cospi_12_64, (int)cospi_20_64,
+                          &step1[5], &step1[6]);
 
   // stage 2
   temp2[0] = _mm_add_epi32(step1[0], step1[2]);
@@ -39,8 +39,8 @@
   temp2[0] = _mm_sub_epi32(step1[0], step1[2]);
   extend_64bit(temp2[0], temp1);
   step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
-  highbd_multiplication_and_add_sse4_1(step1[1], step1[3], (int)cospi_24_64,
-                                       (int)cospi_8_64, &step2[2], &step2[3]);
+  highbd_butterfly_sse4_1(step1[1], step1[3], (int)cospi_24_64, (int)cospi_8_64,
+                          &step2[2], &step2[3]);
   step2[4] = _mm_add_epi32(step1[4], step1[5]);
   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -110,9 +110,10 @@
 }
 
 // Note: c0 and c1 must be non negative.
-static INLINE void highbd_multiplication_and_add_sse2(
-    const __m128i in0, const __m128i in1, const int c0, const int c1,
-    __m128i *const out0, __m128i *const out1) {
+static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1,
+                                         const int c0, const int c1,
+                                         __m128i *const out0,
+                                         __m128i *const out1) {
   const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0);
   const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0);
   __m128i temp1[4], temp2[4], sign1[2], sign2[2];
--- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -29,9 +29,10 @@
   return pack_4(t0, t1);
 }
 
-static INLINE void highbd_multiplication_and_add_sse4_1(
-    const __m128i in0, const __m128i in1, const int c0, const int c1,
-    __m128i *const out0, __m128i *const out1) {
+static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1,
+                                           const int c0, const int c1,
+                                           __m128i *const out0,
+                                           __m128i *const out1) {
   const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
   const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
   __m128i temp1[4], temp2[4];