shithub: libvpx

--- a/test/quantize_test.cc

+++ b/test/quantize_test.cc

@@ -13,9 +13,10 @@

 #include "third_party/googletest/src/include/gtest/gtest.h"

-#include "./vpx_config.h"

 #include "./vp8_rtcd.h"

+#include "./vpx_config.h"

 #include "test/acm_random.h"

+#include "test/bench.h"

 #include "test/clear_system_state.h"

 #include "test/register_state_check.h"

 #include "test/util.h"

@@ -117,7 +118,8 @@

};

 class QuantizeTest : public QuantizeTestBase,

-                     public ::testing::TestWithParam<VP8QuantizeParam> {

+                     public ::testing::TestWithParam<VP8QuantizeParam>,

+                     public AbstractBench {

  protected:

   virtual void SetUp() {

     SetupCompressor();

@@ -125,6 +127,10 @@

     c_quant_ = GET_PARAM(1);

+  virtual void Run() {

+    asm_quant_(&vp8_comp_->mb.block[0], &macroblockd_dst_->block[0]);

+  }

   void RunComparison() {

     for (int i = 0; i < kNumBlocks; ++i) {

       ASM_REGISTER_STATE_CHECK(

@@ -165,6 +171,13 @@

     FillCoeffRandom();

     RunComparison();

+}

+TEST_P(QuantizeTest, DISABLED_Speed) {

+  FillCoeffRandom();

+  RunNTimes(10000000);

+  PrintMedian("vp8 quantize");

 #if HAVE_SSE2

--- a/vp8/encoder/x86/quantize_sse4.c

+++ b/vp8/encoder/x86/quantize_sse4.c

@@ -11,8 +11,8 @@

 #include <smmintrin.h> /* SSE4.1 */

 #include "./vp8_rtcd.h"

-#include "vp8/encoder/block.h"

 #include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */

+#include "vp8/encoder/block.h"

 #define SELECT_EOB(i, z, x, y, q)         \

   do {                                    \

@@ -31,8 +31,7 @@

   char eob = 0;

   short *zbin_boost_ptr = b->zrun_zbin_boost;

-  __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0,

-      dqcoeff1;

+  __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1;

   __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));

   __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));

   __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));

@@ -53,16 +52,10 @@

   zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);

   zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);

-  /* Sign of z: z >> 15 */

-  sz0 = _mm_srai_epi16(z0, 15);

-  sz1 = _mm_srai_epi16(z1, 15);

+  /* x = abs(z) */

+  x0 = _mm_abs_epi16(z0);

+  x1 = _mm_abs_epi16(z1);

-  /* x = abs(z): (z ^ sz) - sz */

-  x0 = _mm_xor_si128(z0, sz0);

-  x1 = _mm_xor_si128(z1, sz1);

-  x0 = _mm_sub_epi16(x0, sz0);

-  x1 = _mm_sub_epi16(x1, sz1);

   /* zbin[] + zbin_extra */

   zbin0 = _mm_add_epi16(zbin0, zbin_extra);

   zbin1 = _mm_add_epi16(zbin1, zbin_extra);

@@ -89,11 +82,9 @@

   y0 = _mm_mulhi_epi16(y0, quant_shift0);

   y1 = _mm_mulhi_epi16(y1, quant_shift1);

-  /* Return the sign: (y ^ sz) - sz */

-  y0 = _mm_xor_si128(y0, sz0);

-  y1 = _mm_xor_si128(y1, sz1);

-  y0 = _mm_sub_epi16(y0, sz0);

-  y1 = _mm_sub_epi16(y1, sz1);

+  /* Restore the sign of z; _mm_sign_epi16 also zeroes lanes where z == 0. */

+  y0 = _mm_sign_epi16(y0, z0);

+  y1 = _mm_sign_epi16(y1, z1);

   /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */

   SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);

--

⑨