shithub: libvpx

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -136,9 +136,10 @@

   specialize qw/vp9_block_error_fp sse2/;

   add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

-  specialize qw/vp9_quantize_fp neon/;

+  specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";

   add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";

   add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

   specialize qw/vp9_fdct8x8_quant neon ssse3/;

--- a/vp9/encoder/x86/vp9_quantize_sse2.c

+++ b/vp9/encoder/x86/vp9_quantize_sse2.c

@@ -13,14 +13,16 @@

 #include "./vp9_rtcd.h"

 #include "vpx/vpx_integer.h"

+#include "vpx_dsp/vpx_dsp_common.h"

+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"

-void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,

+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,

                           int skip_block, const int16_t *zbin_ptr,

                           const int16_t *round_ptr, const int16_t *quant_ptr,

-                          const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,

-                          int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,

-                          uint16_t *eob_ptr, const int16_t *scan_ptr,

-                          const int16_t *iscan_ptr) {

+                          const int16_t *quant_shift_ptr,

+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,

+                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {

   __m128i zero;

   __m128i thr;

   int16_t nzflag;

@@ -53,8 +55,8 @@

         __m128i qcoeff0, qcoeff1;

         __m128i qtmp0, qtmp1;

         // Do DC and first 15 AC

-        coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));

-        coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);

+        coeff0 = load_tran_low(coeff_ptr + n_coeffs);

+        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

         // Poor man's sign extract

         coeff0_sign = _mm_srai_epi16(coeff0, 15);

@@ -77,15 +79,15 @@

         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);

-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

+        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);

+        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

         dequant = _mm_unpackhi_epi64(dequant, dequant);

         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);

-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);

+        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);

+        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);

@@ -120,8 +122,8 @@

         __m128i qcoeff0, qcoeff1;

         __m128i qtmp0, qtmp1;

-        coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));

-        coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);

+        coeff0 = load_tran_low(coeff_ptr + n_coeffs);

+        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

         // Poor man's sign extract

         coeff0_sign = _mm_srai_epi16(coeff0, 15);

@@ -146,20 +148,20 @@

           qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

           qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);

-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

+          store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);

+          store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

           coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

           coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);

-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);

+          store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);

+          store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);

         } else {

-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);

-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);

+          store_zero_tran_low(qcoeff_ptr + n_coeffs);

+          store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);

-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);

+          store_zero_tran_low(dqcoeff_ptr + n_coeffs);

+          store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);

@@ -199,10 +201,11 @@

   } else {

     do {

-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);

-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);

-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);

-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);

+      store_zero_tran_low(qcoeff_ptr + n_coeffs);

+      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

+      store_zero_tran_low(dqcoeff_ptr + n_coeffs);

+      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);

       n_coeffs += 8 * 2;

     } while (n_coeffs < 0);

     *eob_ptr = 0;

--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

@@ -11,6 +11,7 @@

 %define private_prefix vp9

 %include "third_party/x86inc/x86inc.asm"

+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"

 SECTION_RODATA

 pw_1: times 8 dw 1

@@ -48,15 +49,15 @@

 %endif

   pxor                            m5, m5                   ; m5 = dedicated zero

-  lea                         coeffq, [  coeffq+ncoeffq*2]

-  lea                            r5q, [  r5q+ncoeffq*2]

-  lea                            r3q, [ r3q+ncoeffq*2]

-  lea                            r4q, [r4q+ncoeffq*2]

+  INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq

+  lea                            r5q, [r5q+ncoeffq*2]

+  INCREMENT_ELEMENTS_TRAN_LOW    r3q, ncoeffq

+  INCREMENT_ELEMENTS_TRAN_LOW    r4q, ncoeffq

   neg                        ncoeffq

   ; get DC and first 15 AC coeffs

-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]

-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

+  LOAD_TRAN_LOW  9, coeffq, ncoeffq                        ; m9 = c[i]

+  LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8                    ; m10 = c[i]

   pabsw                           m6, m9                   ; m6 = abs(m9)

   pabsw                          m11, m10                  ; m11 = abs(m10)

   pcmpeqw                         m7, m7

@@ -69,8 +70,8 @@

   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16

   psignw                          m8, m9                   ; m8 = reinsert sign

   psignw                         m13, m10                  ; m13 = reinsert sign

-  mova            [r3q+ncoeffq*2+ 0], m8

-  mova            [r3q+ncoeffq*2+16], m13

+  STORE_TRAN_LOW  8, r3q, ncoeffq,     6, 11, 12

+  STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12

 %ifidn %1, fp_32x32

   pabsw                           m8, m8

   pabsw                          m13, m13

@@ -87,8 +88,8 @@

 %else

   psrlw                           m0, m3, 1

 %endif

-  mova            [r4q+ncoeffq*2+ 0], m8

-  mova            [r4q+ncoeffq*2+16], m13

+  STORE_TRAN_LOW  8, r4q, ncoeffq,     6, 11, 12

+  STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12

   pcmpeqw                         m8, m5                   ; m8 = c[i] == 0

   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0

   mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = scan[i]

@@ -102,8 +103,8 @@

   jz .accumulate_eob

 .ac_only_loop:

-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]

-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

+  LOAD_TRAN_LOW  9, coeffq, ncoeffq                        ; m9 = c[i]

+  LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8                    ; m10 = c[i]

   pabsw                           m6, m9                   ; m6 = abs(m9)

   pabsw                          m11, m10                  ; m11 = abs(m10)

@@ -123,8 +124,8 @@

   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16

   psignw                         m14, m9                   ; m14 = reinsert sign

   psignw                         m13, m10                  ; m13 = reinsert sign

-  mova            [r3q+ncoeffq*2+ 0], m14

-  mova            [r3q+ncoeffq*2+16], m13

+  STORE_TRAN_LOW 14, r3q, ncoeffq,     6, 11, 12

+  STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12

 %ifidn %1, fp_32x32

   pabsw                          m14, m14

   pabsw                          m13, m13

@@ -137,8 +138,8 @@

   psignw                         m14, m9

   psignw                         m13, m10

 %endif

-  mova            [r4q+ncoeffq*2+ 0], m14

-  mova            [r4q+ncoeffq*2+16], m13

+  STORE_TRAN_LOW 14, r4q, ncoeffq,     6, 11, 12

+  STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12

   pcmpeqw                        m14, m5                   ; m14 = c[i] == 0

   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0

   mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = scan[i]

@@ -154,10 +155,10 @@

   jmp .accumulate_eob

 .skip_iter:

-  mova            [r3q+ncoeffq*2+ 0], m5

-  mova            [r3q+ncoeffq*2+16], m5

-  mova            [r4q+ncoeffq*2+ 0], m5

-  mova            [r4q+ncoeffq*2+16], m5

+  STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq

+  STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8

+  STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq

+  STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8

   add                        ncoeffq, mmsize

   jl .ac_only_loop

@@ -186,10 +187,10 @@

   neg                        ncoeffq

   pxor                            m7, m7

 .blank_loop:

-  mova            [r0q+ncoeffq*2+ 0], m7

-  mova            [r0q+ncoeffq*2+16], m7

-  mova            [r2q+ncoeffq*2+ 0], m7

-  mova            [r2q+ncoeffq*2+16], m7

+  STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq

+  STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq + 8

+  STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq

+  STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq + 8

   add                        ncoeffq, mmsize

   jl .blank_loop

   mov                     word [r3q], 0

--- a/vpx_dsp/x86/bitdepth_conversion_sse2.asm

+++ b/vpx_dsp/x86/bitdepth_conversion_sse2.asm

@@ -38,29 +38,53 @@

 ; the values down to 16 bits.

 %macro LOAD_TRAN_LOW 3

 %if CONFIG_VP9_HIGHBITDEPTH

-  mova     m%1, [%2 + %3 * 4]

-  packssdw m%1, [%2 + %3 * 4 + 16]

+  mova     m%1, [%2 + (%3) * 4]

+  packssdw m%1, [%2 + (%3) * 4 + 16]

 %else

-  mova     m%1, [%2 + %3 * 2]

+  mova     m%1, [%2 + (%3) * 2]

 %endif

 %endmacro

 ; Store m%1 to %2 + %3.

 ; %3 is the offset in elements, not bytes.

+; If 5 arguments are provided then m%1 is overwritten (high bit depth only).

+; If 6 arguments are provided then m%1 is preserved and m%6 is used instead.

 ; If tran_low_t is 16 bits (low bit depth configuration) then store the value

 ; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign

 ; extend the values first.

 ; Uses m%4-m%6 as scratch registers for high bit depth.

-%macro STORE_TRAN_LOW 5

+%macro STORE_TRAN_LOW 5-6

 %if CONFIG_VP9_HIGHBITDEPTH

   pxor                      m%4, m%4

   mova                      m%5, m%1

+  %if %0 == 6

+  mova                      m%6, m%1

+  %endif

   pcmpgtw                   m%4, m%1

   punpcklwd                 m%5, m%4

+  %if %0 == 5

   punpckhwd                 m%1, m%4

-  mova       [%2 + %3 * 4 +  0], m%5

-  mova       [%2 + %3 * 4 + 16], m%1

+  %else

+  punpckhwd                 m%6, m%4

+  %endif

+  mova     [%2 + (%3) * 4 +  0], m%5

+  %if %0 == 5

+  mova     [%2 + (%3) * 4 + 16], m%1

+  %else

+  mova     [%2 + (%3) * 4 + 16], m%6

+  %endif

 %else

-  mova            [%2 + %3 * 2], m%1

+  mova          [%2 + (%3) * 2], m%1

+%endif

+%endmacro

+; Store m%1, which the caller must already have zeroed, to %2 + %3.

+; %3 is the offset in elements, not bytes.

+%macro STORE_ZERO_TRAN_LOW 3

+%if CONFIG_VP9_HIGHBITDEPTH

+  mova     [%2 + (%3) * 4 +  0], m%1

+  mova     [%2 + (%3) * 4 + 16], m%1

+%else

+  mova          [%2 + (%3) * 2], m%1

 %endif

 %endmacro

--

⑨