shithub: opus

--- a/celt/bands.c

+++ b/celt/bands.c

@@ -1036,7 +1036,7 @@

          /* Finally do the actual quantization */

          if (encode)

-            cm = alg_quant(X, N, K, spread, B, ec, gain, ctx->resynth);

+            cm = alg_quant(X, N, K, spread, B, ec, gain, ctx->resynth, ctx->arch);

          } else {

             cm = alg_unquant(X, N, K, spread, B, ec, gain);

--- a/celt/tests/test_unit_mathops.c

+++ b/celt/tests/test_unit_mathops.c

@@ -57,6 +57,7 @@

 # endif

 # if defined(OPUS_X86_MAY_HAVE_SSE2)

 #  include "x86/pitch_sse2.c"

+#  include "x86/vq_sse2.c"

 # endif

 # if defined(OPUS_X86_MAY_HAVE_SSE4_1)

 #  include "x86/pitch_sse4_1.c"

--- a/celt/tests/test_unit_rotation.c

+++ b/celt/tests/test_unit_rotation.c

@@ -55,6 +55,7 @@

 # endif

 # if defined(OPUS_X86_MAY_HAVE_SSE2)

 #  include "x86/pitch_sse2.c"

+#  include "x86/vq_sse2.c"

 # endif

 # if defined(OPUS_X86_MAY_HAVE_SSE4_1)

 #  include "x86/pitch_sse4_1.c"

--- a/celt/vq.c

+++ b/celt/vq.c

@@ -158,11 +158,9 @@

    return collapse_mask;

-unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc,

-      opus_val16 gain, int resynth)

+opus_val16 op_pvq_search_c(celt_norm *X, int *iy, int K, int N, int arch)

    VARDECL(celt_norm, y);

-   VARDECL(int, iy);

    VARDECL(int, signx);

    int i, j;

    int pulsesLeft;

@@ -169,18 +167,12 @@

    opus_val32 sum;

    opus_val32 xy;

    opus_val16 yy;

-   unsigned collapse_mask;

    SAVE_STACK;

-   celt_assert2(K>0, "alg_quant() needs at least one pulse");

-   celt_assert2(N>1, "alg_quant() needs at least two dimensions");

+   (void)arch;

    ALLOC(y, N, celt_norm);

-   ALLOC(iy, N, int);

    ALLOC(signx, N, int);

-   exp_rotation(X, N, 1, B, K, spread);

    /* Get rid of the sign */

    sum = 0;

    j=0; do {

@@ -322,6 +314,28 @@

          but has the same performance otherwise. */

       iy[j] = (iy[j]^-signx[j]) + signx[j];

    } while (++j<N);

+   RESTORE_STACK;

+   return yy;

+}

+unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc,

+      opus_val16 gain, int resynth, int arch)

+{

+   VARDECL(int, iy);

+   opus_val16 yy;

+   unsigned collapse_mask;

+   SAVE_STACK;

+   celt_assert2(K>0, "alg_quant() needs at least one pulse");

+   celt_assert2(N>1, "alg_quant() needs at least two dimensions");

+   /* Covers vectorization by up to 4. */

+   ALLOC(iy, N+3, int);

+   exp_rotation(X, N, 1, B, K, spread);

+   yy = op_pvq_search(X, iy, K, N, arch);

    encode_pulses(iy, N, K, enc);

    if (resynth)

--- a/celt/vq.h

+++ b/celt/vq.h

@@ -37,11 +37,21 @@

 #include "entdec.h"

 #include "modes.h"

+#if (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT))

+#include "x86/vq_sse.h"

+#endif

 #if defined(MIPSr1_ASM)

 #include "mips/vq_mipsr1.h"

 #endif

+/* C implementation of the PVQ search. Writes the signed pulse vector to iy

+   and returns the yy value accumulated by the search. */

+opus_val16 op_pvq_search_c(celt_norm *X, int *iy, int K, int N, int arch);

+/* Generic mapping to the C version, used unless a platform header (such as

+   x86/vq_sse.h) has defined OVERRIDE_OP_PVQ_SEARCH. */

+#if !defined(OVERRIDE_OP_PVQ_SEARCH)

+#define op_pvq_search(x, iy, K, N, arch) \

+    (op_pvq_search_c(x, iy, K, N, arch))

+#endif

 /** Algebraic pulse-vector quantiser. The signal x is replaced by the sum of

   * the pitch and a combination of pulses such that its norm is still equal

   * to 1. This is the function that will typically require the most CPU.

@@ -52,7 +62,7 @@

  * @ret A mask indicating which blocks in the band received pulses

*/

 unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc,

-      opus_val16 gain, int resynth);

+      opus_val16 gain, int resynth, int arch);

 /** Algebraic pulse decoder

  * @param X Decoded normalised spectrum (returned)

--- /dev/null

+++ b/celt/x86/vq_sse.h

@@ -1,0 +1,50 @@

+/* Copyright (c) 2016  Jean-Marc Valin */

+/*

+   Redistribution and use in source and binary forms, with or without

+   modification, are permitted provided that the following conditions

+   are met:

+   - Redistributions of source code must retain the above copyright

+   notice, this list of conditions and the following disclaimer.

+   - Redistributions in binary form must reproduce the above copyright

+   notice, this list of conditions and the following disclaimer in the

+   documentation and/or other materials provided with the distribution.

+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER

+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+*/

+#ifndef VQ_SSE_H

+#define VQ_SSE_H

+#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)

+/* Tells vq.h not to define its generic op_pvq_search() macro. */

+#define OVERRIDE_OP_PVQ_SEARCH

+/* SSE2 implementation of the PVQ search, defined in x86/vq_sse2.c. */

+opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch);

+/* When SSE2 is presumed available, call the SSE2 version directly. */

+#if defined(OPUS_X86_PRESUME_SSE2)

+#define op_pvq_search(x, iy, K, N, arch) \

+    (op_pvq_search_sse2(x, iy, K, N, arch))

+#else

+/* Otherwise dispatch through a function table indexed by

+   (arch & OPUS_ARCHMASK). */

+extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])(

+      celt_norm *_X, int *iy, int K, int N, int arch);

+#  define op_pvq_search(X, iy, K, N, arch) \

+    ((*OP_PVQ_SEARCH_IMPL[(arch) & OPUS_ARCHMASK])(X, iy, K, N, arch))

+#endif /* OPUS_X86_PRESUME_SSE2 */

+#endif /* OPUS_X86_MAY_HAVE_SSE2 && !FIXED_POINT */

+#endif /* VQ_SSE_H */

--- /dev/null

+++ b/celt/x86/vq_sse2.c

@@ -1,0 +1,217 @@

+/* Copyright (c) 2007-2008 CSIRO

+   Copyright (c) 2007-2009 Xiph.Org Foundation

+   Copyright (c) 2007-2016 Jean-Marc Valin */

+/*

+   Redistribution and use in source and binary forms, with or without

+   modification, are permitted provided that the following conditions

+   are met:

+   - Redistributions of source code must retain the above copyright

+   notice, this list of conditions and the following disclaimer.

+   - Redistributions in binary form must reproduce the above copyright

+   notice, this list of conditions and the following disclaimer in the

+   documentation and/or other materials provided with the distribution.

+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER

+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+*/

+#ifdef HAVE_CONFIG_H

+#include "config.h"

+#endif

+#include <xmmintrin.h>

+#include <emmintrin.h>

+#include "celt_lpc.h"

+#include "stack_alloc.h"

+#include "mathops.h"

+#include "vq.h"

+#include "x86cpu.h"

+#ifndef FIXED_POINT

+/* SSE2 version of the PVQ pulse search (only compiled when FIXED_POINT is

+   not defined). Distributes K pulses over the N entries of iy based on _X.

+   _X:   input vector of N values; only read here, the search itself works

+         on a padded local copy holding the absolute values.

+   iy:   output, signed pulse count per dimension. It is written four

+         entries at a time, so the buffer must have room for N rounded up

+         to a multiple of 4.

+   K:    number of pulses to place.

+   N:    number of dimensions.

+   arch: unused in this implementation.

+   Returns yy, the sum of squares of the pulse vector written to iy. */

+opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)

+{

+   int i, j;

+   int pulsesLeft;

+   float xy, yy;

+   VARDECL(celt_norm, y);

+   VARDECL(celt_norm, X);

+   VARDECL(float, signy);

+   __m128 signmask;

+   __m128 sums;

+   __m128i fours;

+   SAVE_STACK;

+   (void)arch;

+   /* All bits set to zero, except for the sign bit. */

+   signmask = _mm_set_ps1(-0.f);

+   fours = _mm_set_epi32(4, 4, 4, 4);

+   /* N+3 entries so the 4-wide loops below can run past N up to the next

+      multiple of 4. */

+   ALLOC(y, N+3, celt_norm);

+   ALLOC(X, N+3, celt_norm);

+   ALLOC(signy, N+3, float);

+   OPUS_COPY(X, _X, N);

+   /* Zero the padding so it adds nothing to the magnitude sum and receives

+      no pulses in the projection. */

+   X[N] = X[N+1] = X[N+2] = 0;

+   sums = _mm_setzero_ps();

+   for (j=0;j<N;j+=4)

+   {

+      __m128 x4, s4;

+      x4 = _mm_loadu_ps(&X[j]);

+      /* All-ones lane mask where the input is negative; kept in signy so

+         the signs can be restored at the end. */

+      s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());

+      /* Get rid of the sign */

+      x4 = _mm_andnot_ps(signmask, x4);

+      sums = _mm_add_ps(sums, x4);

+      /* Clear y and iy in case we don't do the projection. */

+      _mm_storeu_ps(&y[j], _mm_setzero_ps());

+      _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());

+      _mm_storeu_ps(&X[j], x4);

+      _mm_storeu_ps(&signy[j], s4);

+   }

+   /* Horizontal add: afterwards every lane of sums holds the total of the

+      absolute values. */

+   sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)));

+   sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1)));

+   xy = yy = 0;

+   pulsesLeft = K;

+   /* Do a pre-search by projecting on the pyramid */

+   if (K > (N>>1))

+   {

+      __m128i pulses_sum;

+      __m128 yy4, xy4;

+      __m128 rcp4;

+      opus_val32 sum = _mm_cvtss_f32(sums);

+      /* If X is too small, just replace it with a pulse at 0 */

+      /* Prevents infinities and NaNs from causing too many pulses

+         to be allocated. 64 is an approximation of infinity here. */

+      if (!(sum > EPSILON && sum < 64))

+      {

+         X[0] = QCONST16(1.f,14);

+         j=1; do

+            X[j]=0;

+         while (++j<N);

+         sums = _mm_set_ps1(1.f);

+      }

+      /* Scale factor (K-1)/sum in every lane, built from the approximate

+         reciprocal returned by _mm_rcp_ps. */

+      rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K-1)), _mm_rcp_ps(sums));

+      xy4 = yy4 = _mm_setzero_ps();

+      pulses_sum = _mm_setzero_si128();

+      for (j=0;j<N;j+=4)

+      {

+         __m128 rx4, x4, y4;

+         __m128i iy4;

+         x4 = _mm_loadu_ps(&X[j]);

+         rx4 = _mm_mul_ps(x4, rcp4);

+         /* Truncating conversion gives the integer pulse counts. */

+         iy4 = _mm_cvttps_epi32(rx4);

+         pulses_sum = _mm_add_epi32(pulses_sum, iy4);

+         _mm_storeu_si128((__m128i*)&iy[j], iy4);

+         y4 = _mm_cvtepi32_ps(iy4);

+         xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));

+         yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));

+         /* double the y[] vector so we don't have to do it in the search loop. */

+         _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));

+      }

+      /* Horizontal reductions of the pulse count, xy and yy. */

+      pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2)));

+      pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1)));

+      pulsesLeft -= _mm_cvtsi128_si32(pulses_sum);

+      xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2)));

+      xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1)));

+      xy = _mm_cvtss_f32(xy4);

+      yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2)));

+      yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1)));

+      yy = _mm_cvtss_f32(yy4);

+   }

+   /* Padding values for the search loop below.

+      NOTE(review): presumably -100/100 keep the score of the padded lanes

+      negative so they never beat the initial max of zero -- confirm

+      against the expected range of xy. */

+   X[N] = X[N+1] = X[N+2] = -100;

+   y[N] = y[N+1] = y[N+2] = 100;

+   celt_assert2(pulsesLeft>=1, "Allocated too many pulses in the quick pass");

+   /* This should never happen, but just in case it does (e.g. on silence)

+      we fill the first bin with pulses. */

+   if (pulsesLeft > N+3)

+   {

+      opus_val16 tmp = (opus_val16)pulsesLeft;

+      yy = MAC16_16(yy, tmp, tmp);

+      yy = MAC16_16(yy, tmp, y[0]);

+      iy[0] += pulsesLeft;

+      pulsesLeft=0;

+   }

+   /* Greedy search: each iteration adds one pulse at the position with the

+      largest (xy+X[j])*rsqrt(yy+y[j]). */

+   for (i=0;i<pulsesLeft;i++)

+   {

+      int best_id;

+      __m128 xy4, yy4;

+      __m128 max, max2;

+      __m128i count;

+      __m128i pos;

+      best_id = 0;

+      /* The squared magnitude term gets added anyway, so we might as well

+         add it outside the loop */

+      yy = ADD16(yy, 1);

+      xy4 = _mm_load1_ps(&xy);

+      yy4 = _mm_load1_ps(&yy);

+      max = _mm_setzero_ps();

+      pos = _mm_setzero_si128();

+      /* Each lane of count holds the index of the element that lane is

+         currently processing. */

+      count = _mm_set_epi32(3, 2, 1, 0);

+      for (j=0;j<N;j+=4)

+      {

+         __m128 x4, y4, r4;

+         x4 = _mm_loadu_ps(&X[j]);

+         y4 = _mm_loadu_ps(&y[j]);

+         x4 = _mm_add_ps(x4, xy4);

+         y4 = _mm_add_ps(y4, yy4);

+         y4 = _mm_rsqrt_ps(y4);

+         r4 = _mm_mul_ps(x4, y4);

+         /* Update the index of the max. */

+         /* NOTE(review): _mm_max_epi16 compares signed 16-bit halves, so

+            this relies on the indices staying below 32768. */

+         pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));

+         /* Update the max. */

+         max = _mm_max_ps(max, r4);

+         /* Update the indices (+4) */

+         count = _mm_add_epi32(count, fours);

+      }

+      /* Horizontal max */

+      max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));

+      max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));

+      /* Now that max2 contains the max at all positions, look at which value(s) of the

+         partial max is equal to the global max. */

+      pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));

+      /* Reduce to the largest surviving index across the four lanes. */

+      pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));

+      pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));

+      best_id = _mm_cvtsi128_si32(pos);

+      /* Updating the sums of the new pulse(s) */

+      xy = ADD32(xy, EXTEND32(X[best_id]));

+      /* We're multiplying y[j] by two so we don't have to do it here */

+      yy = ADD16(yy, y[best_id]);

+      /* Only now that we've made the final choice, update y/iy */

+      /* Multiplying y[j] by 2 so we don't have to do it everywhere else */

+      y[best_id] += 2;

+      iy[best_id]++;

+   }

+   /* Put the original sign back */

+   /* s4 is -1 where the input was negative and 0 elsewhere, so (y+s)^s

+      negates exactly those entries. */

+   for (j=0;j<N;j+=4)

+   {

+      __m128i y4;

+      __m128i s4;

+      y4 = _mm_loadu_si128((__m128i*)&iy[j]);

+      s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));

+      y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);

+      _mm_storeu_si128((__m128i*)&iy[j], y4);

+   }

+   RESTORE_STACK;

+   return yy;

+}

+#endif

--- a/celt/x86/x86_celt_map.c

+++ b/celt/x86/x86_celt_map.c

@@ -33,6 +33,7 @@

 #include "celt_lpc.h"

 #include "pitch.h"

 #include "pitch_sse.h"

+#include "vq.h"

 #if defined(OPUS_HAVE_RTCD)

@@ -149,6 +150,18 @@

};

+#endif

+#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)

+/* Dispatch table behind the op_pvq_search() macro when SSE2 is not presumed;

+   the macro indexes it with (arch & OPUS_ARCHMASK). The first two entries

+   use the C version, the remaining three go through MAY_HAVE_SSE2().

+   NOTE(review): presumably MAY_HAVE_SSE2(op_pvq_search) resolves to

+   op_pvq_search_sse2 when SSE2 code is built -- confirm in x86cpu.h. */

+opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])(

+      celt_norm *_X, int *iy, int K, int N, int arch

+) = {

+  op_pvq_search_c,                /* non-sse */

+  op_pvq_search_c,

+  MAY_HAVE_SSE2(op_pvq_search),

+  MAY_HAVE_SSE2(op_pvq_search),

+  MAY_HAVE_SSE2(op_pvq_search)

+};

 #endif

 #endif

--- a/celt_headers.mk

+++ b/celt_headers.mk

@@ -49,4 +49,5 @@

 celt/mips/pitch_mipsr1.h \

 celt/mips/vq_mipsr1.h \

 celt/x86/pitch_sse.h \

+celt/x86/vq_sse.h \

 celt/x86/x86cpu.h

--- a/celt_sources.mk

+++ b/celt_sources.mk

@@ -21,7 +21,7 @@

 celt/x86/x86_celt_map.c \

 celt/x86/pitch_sse.c

-CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c

+CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c celt/x86/vq_sse2.c

 CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c \

 celt/x86/pitch_sse4_1.c

--

⑨