shithub: libvpx

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -701,7 +701,7 @@

 specialize vp9_quantize_b $ssse3_x86_64

 prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"

-specialize vp9_quantize_b_32x32

+specialize vp9_quantize_b_32x32 $ssse3_x86_64

 # Structured Similarity (SSIM)

--- a/vp9/encoder/vp9_quantize.c

+++ b/vp9/encoder/vp9_quantize.c

@@ -84,7 +84,6 @@

   *eob_ptr = eob + 1;

-// This function works well for large transform size.

 void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,

                             int skip_block,

                             int16_t *zbin_ptr, int16_t *round_ptr,

@@ -105,8 +104,8 @@

   eob = -1;

   // Base ZBIN

-  zbins[0] = zbin_ptr[0] + zbin_oq_value;

-  zbins[1] = zbin_ptr[1] + zbin_oq_value;

+  zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1);

+  zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1);

   nzbins[0] = zbins[0] * -1;

   nzbins[1] = zbins[1] * -1;

@@ -114,7 +113,7 @@

     // Pre-scan pass

     for (i = 0; i < n_coeffs; i++) {

       rc = scan[i];

-      z = coeff_ptr[rc] * 2;

+      z = coeff_ptr[rc];

       // If the coefficient is out of the base ZBIN range, keep it for

       // quantization.

@@ -130,14 +129,14 @@

       // Calculate ZBIN

       zbin = (zbins[rc != 0]);

-      z = coeff_ptr[rc] * 2;

+      z = coeff_ptr[rc];

       sz = (z >> 31);                               // sign of z

       x  = (z ^ sz) - sz;                           // x = abs(z)

       if (x >= zbin) {

-        x += (round_ptr[rc != 0]);

+        x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);

         y  = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *

-              quant_shift_ptr[rc != 0]) >> 16;      // quantize (x)

+              quant_shift_ptr[rc != 0]) >> 15;      // quantize (x)

         x  = (y ^ sz) - sz;                         // get the sign back

         qcoeff_ptr[rc]  = x;                        // write to destination

--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm

+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm

@@ -36,6 +36,14 @@

   pshufd                          m4, m4, 0

   mova                            m2, [quantq]             ; m2 = quant

   paddw                           m0, m4                   ; m0 = zbin + zbin_oq

+%ifidn %1, b_32x32

+  pcmpeqw                         m5, m5                   ; m5 = all ones in every word (scratch use of m5)

+  psrlw                           m5, 15                   ; m5 = 1 in every word

+  paddw                           m0, m5                   ; m0 += 1 (rounding bias before the halving shift)

+  paddw                           m1, m5                   ; m1 += 1 (m1 presumably holds round; loaded outside this hunk)

+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2

+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2

+%endif

   mova                            m3, [r2q]                ; m3 = dequant

   psubw                           m0, [pw_1]

   mov                             r2, shiftmp

@@ -43,6 +51,9 @@

   mova                            m4, [r2]                 ; m4 = shift

   mov                             r4, dqcoeffmp

   mov                             r5, iscanmp

+%ifidn %1, b_32x32

+  psllw                           m4, 1                    ; m4 = shift * 2 (cf. C 32x32 path: >> 15 instead of >> 16)

+%endif

   pxor                            m5, m5                   ; m5 = dedicated zero

   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob

   lea                         coeffq, [  coeffq+ncoeffq*2]

@@ -56,10 +67,6 @@

   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

   pabsw                           m6, m9                   ; m6 = abs(m9)

   pabsw                          m11, m10                  ; m11 = abs(m10)

-%ifidn %1, b_32x32

-  paddw                           m6, m6

-  paddw                          m11, m11

-%endif

   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin

   punpckhqdq                      m0, m0

   pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin

@@ -112,10 +119,6 @@

   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

   pabsw                           m6, m9                   ; m6 = abs(m9)

   pabsw                          m11, m10                  ; m11 = abs(m10)

-%ifidn %1, b_32x32

-  paddw                           m6, m6

-  paddw                          m11, m11

-%endif

   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin

   pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin

 %ifidn %1, b_32x32

@@ -164,6 +167,7 @@

   pmaxsw                          m8, m13

   add                        ncoeffq, mmsize

   jl .ac_only_loop

 %ifidn %1, b_32x32

   jmp .accumulate_eob

 .skip_iter:

--

⑨