shithub: libvpx

--- a/test/partial_idct_test.cc

+++ b/test/partial_idct_test.cc

@@ -650,6 +650,15 @@

   make_tuple(

       &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,

       &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 12, 2),

+  make_tuple(

+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,

+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 8, 2),

+  make_tuple(

+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,

+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 10, 2),

+  make_tuple(

+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,

+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 12, 2),

   make_tuple(&vpx_highbd_fdct8x8_c,

              &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,

              &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse2>, TX_8X8, 64, 8, 2),

@@ -668,6 +677,12 @@

   make_tuple(

       &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,

       &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse2>, TX_8X8, 12, 12, 2),

+  make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,

+             &highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 8, 2),

+  make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,

+             &highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 10, 2),

+  make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,

+             &highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 12, 2),

   make_tuple(&vpx_highbd_fdct4x4_c,

              &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,

              &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse2>, TX_4X4, 16, 8, 2),

@@ -677,6 +692,12 @@

   make_tuple(

       &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,

       &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse2>, TX_4X4, 16, 12, 2),

+  make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,

+             &highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 8, 2),

+  make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,

+             &highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 10, 2),

+  make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,

+             &highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 12, 2),

 #endif  // CONFIG_VP9_HIGHBITDEPTH

   make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,

              &wrapper<vpx_idct32x32_1024_add_sse2>, TX_32X32, 1024, 8, 1),

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -629,18 +629,18 @@

   add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

   add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

-  specialize qw/vpx_highbd_idct4x4_1_add neon/;

+  specialize qw/vpx_highbd_idct4x4_1_add neon sse2/;  # sse2 listed in addition to the existing neon entry

   add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

   add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

   add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

-  specialize qw/vpx_highbd_idct8x8_1_add neon/;

+  specialize qw/vpx_highbd_idct8x8_1_add neon sse2/;  # sse2 listed in addition to the existing neon entry

   add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

   add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

   add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

   add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

-  specialize qw/vpx_highbd_idct16x16_1_add neon/;

+  specialize qw/vpx_highbd_idct16x16_1_add neon sse2/;  # sse2 listed in addition to the existing neon entry

   add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

   add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

--- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c

+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c

@@ -242,3 +242,8 @@

+void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                     int stride, int bd) {  // thin wrapper: all arguments are forwarded unchanged

+  highbd_idct_1_add_kernel(input, dest, stride, bd, 16);  // shared kernel with size fixed at 16

+}

--- a/vpx_dsp/x86/highbd_idct32x32_add_sse2.c

+++ b/vpx_dsp/x86/highbd_idct32x32_add_sse2.c

@@ -9,6 +9,7 @@

*/

 #include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"

 #include "vpx_dsp/x86/inv_txfm_sse2.h"

 #include "vpx_dsp/x86/transpose_sse2.h"

 #include "vpx_dsp/x86/txfm_common_sse2.h"

@@ -15,27 +16,5 @@

 void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,

                                      int stride, int bd) {

-  __m128i dc_value, d;

-  const __m128i zero = _mm_setzero_si128();

-  const __m128i one = _mm_set1_epi16(1);

-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);

-  int a, i, j;

-  tran_low_t out;

-  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

-  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);

-  a = ROUND_POWER_OF_TWO(out, 6);

-  d = _mm_set1_epi32(a);

-  dc_value = _mm_packs_epi32(d, d);

-  for (i = 0; i < 32; ++i) {

-    for (j = 0; j < 4; ++j) {

-      d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));

-      d = _mm_adds_epi16(d, dc_value);

-      d = _mm_max_epi16(d, zero);

-      d = _mm_min_epi16(d, max);

-      _mm_storeu_si128((__m128i *)(&dest[j * 8]), d);

-    }

-    dest += stride;

-  }

+  highbd_idct_1_add_kernel(input, dest, stride, bd, 32);

--- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c

+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c

@@ -127,3 +127,26 @@

+void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                   int stride, int bd) {  // adds one value derived from input[0] to 4 rows of 4 samples in dest

+  const __m128i zero = _mm_setzero_si128();  // lower clamp bound

+  // Faster than _mm_set1_epi16((1 << bd) - 1).

+  const __m128i one = _mm_set1_epi16(1);

+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);  // upper clamp bound, same value in every 16-bit lane

+  int a1, i;

+  tran_low_t out;

+  __m128i dc, d;

+  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);  // only input[0] is read

+  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);  // same scaling applied a second time

+  a1 = ROUND_POWER_OF_TWO(out, 4);  // presumably a rounded right shift by 4 -- confirm against the macro definition

+  dc = _mm_set1_epi16(a1);  // low 16 bits of a1 replicated into all eight lanes

+  for (i = 0; i < 4; ++i) {  // one iteration per row

+    d = _mm_loadl_epi64((const __m128i *)dest);  // 64-bit load: 4 uint16_t samples into the low half

+    d = add_dc_clamp(&zero, &max, &dc, &d);  // d + dc, then clamped to [zero, max]

+    _mm_storel_epi64((__m128i *)dest, d);  // 64-bit store: only the 4 samples of this row are written

+    dest += stride;  // next row; stride is counted in uint16_t elements

+  }

+}

--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c

+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c

@@ -214,3 +214,8 @@

+void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                   int stride, int bd) {  // thin wrapper: all arguments are forwarded unchanged

+  highbd_idct_1_add_kernel(input, dest, stride, bd, 8);  // shared kernel with size fixed at 8

+}

--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h

+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h

@@ -17,6 +17,43 @@

 #include "vpx_dsp/inv_txfm.h"

 #include "vpx_dsp/x86/txfm_common_sse2.h"

+static INLINE __m128i add_dc_clamp(const __m128i *const min,

+                                   const __m128i *const max,

+                                   const __m128i *const dc,

+                                   const __m128i *const in) {  // returns *in + *dc limited to [*min, *max], per signed 16-bit lane

+  __m128i out;

+  out = _mm_adds_epi16(*in, *dc);  // signed saturating 16-bit add

+  out = _mm_max_epi16(out, *min);  // lanes below *min are raised to *min

+  out = _mm_min_epi16(out, *max);  // lanes above *max are lowered to *max

+  return out;

+}

+static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input,

+                                            uint16_t *dest, int stride, int bd,

+                                            const int size) {  // size: block edge length in samples; used with 8, 16 and 32

+  const __m128i zero = _mm_setzero_si128();  // lower clamp bound

+  // Faster than _mm_set1_epi16((1 << bd) - 1).

+  const __m128i one = _mm_set1_epi16(1);

+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);  // upper clamp bound, same value in every 16-bit lane

+  int a1, i, j;

+  tran_low_t out;

+  __m128i dc, d;

+  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);  // only input[0] is read

+  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);  // same scaling applied a second time

+  a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6);  // shift of 5 when size is 8, otherwise 6

+  dc = _mm_set1_epi16(a1);  // NOTE(review): keeps only the low 16 bits of a1 (no saturation) -- confirm a1 always fits in int16

+  for (i = 0; i < size; ++i) {  // one iteration per row

+    for (j = 0; j < (size >> 3); ++j) {  // 8 samples per vector; zero iterations when size < 8

+      d = _mm_load_si128((const __m128i *)(&dest[j * 8]));  // NOTE(review): aligned load -- needs a 16-byte aligned address; confirm for every caller

+      d = add_dc_clamp(&zero, &max, &dc, &d);  // d + dc, then clamped to [zero, max]

+      _mm_store_si128((__m128i *)(&dest[j * 8]), d);  // aligned store, same alignment requirement as the load

+    }

+    dest += stride;  // next row; stride is counted in uint16_t elements

+  }

+}

 static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {

   __m128i ubounded, retval;

   const __m128i zero = _mm_set1_epi16(0);

--

⑨