shithub: libvpx

ref: eca59cad0b76d86f8ca8288afa074ee47046e7c7
parent: ef887974aa0d129b2738a1b87da5f50822b70e08
author: Johann <johann.koenig@duck.com>
date: Wed Feb 27 12:27:59 EST 2013

Use intrinsics for sse2 regular quantize

Remove this function's dependency on asm_offsets. ssse3/sse4 are next.

Change the quant_shift calculation so it can be done using SIMD. Pre-calculate
as much as possible to simplify EOB selection.

Take advantage of qcoeff being zeroed by tying the if statements
together.

Speed parity with the previous implementation with gcc on x86_64 Linux.

Change-Id: Ife97556a1eca3a74b09def1a3d04084974dff1fb
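
The key arithmetic change: the per-coefficient variable right shift (which SSE2
cannot do per lane) is replaced by a multiply with a pre-computed factor of
1 << (16 - shift) followed by a fixed shift by 16, which maps directly onto
pmulhw / _mm_mulhi_epi16. A minimal standalone sketch of the equivalence (the
helper names and test values below are illustrative, not part of the patch):

#include <assert.h>

/* Old form: y = (((x * quant) >> 16) + x) >> shift
 * New form: y = ((((x * quant) >> 16) + x) * (1 << (16 - shift))) >> 16
 * For non-negative v, (v << (16 - shift)) >> 16 == v >> shift, so both
 * forms agree; the new one needs only a multiply-high per lane. */
static short quantize_shift(short x, short quant, int shift)
{
    return (short)((((x * quant) >> 16) + x) >> shift);
}

static short quantize_mulhi(short x, short quant, short shift_factor)
{
    return (short)(((((x * quant) >> 16) + x) * shift_factor) >> 16);
}

int main(void)
{
    int shift = 2;                                  /* illustrative value */
    short shift_factor = (short)(1 << (16 - shift));
    assert(quantize_shift(1000, 12345, shift) ==
           quantize_mulhi(1000, 12345, shift_factor));
    return 0;
}

In the patch the factor is produced by invert_quant() and stored in the
quant_shift tables, which is why their element type widens from unsigned char
to short in the hunks below.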

--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -444,8 +444,9 @@
 # Quantizer
 #
 prototype void vp8_regular_quantize_b "struct block *, struct blockd *"
-specialize vp8_regular_quantize_b sse2 sse4_1
-vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4
+specialize vp8_regular_quantize_b sse2 #sse4_1
+# TODO(johann) Update sse4 implementation and re-enable
+#vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4
 
 prototype void vp8_fast_quantize_b "struct block *, struct blockd *"
 specialize vp8_fast_quantize_b sse2 ssse3 media neon
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -37,7 +37,7 @@
     /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
     short *quant;
     short *quant_fast;
-    unsigned char *quant_shift;
+    short *quant_shift;
     short *zbin;
     short *zrun_zbin_boost;
     short *round;
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -282,17 +282,17 @@
 {
 
     DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
 
     DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
 
     DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
-    DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
 
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -50,8 +50,8 @@
         if (x >= zbin)
         {
             x += round_ptr[rc];
-            y  = (((x * quant_ptr[rc]) >> 16) + x)
-                 >> quant_shift_ptr[rc];             /* quantize (x) */
+            y  = ((((x * quant_ptr[rc]) >> 16) + x)
+                 * quant_shift_ptr[rc]) >> 16;       /* quantize (x) */
             x  = (y ^ sz) - sz;                      /* get the sign back */
             qcoeff_ptr[rc] = x;                      /* write to destination */
             dqcoeff_ptr[rc] = x * dequant_ptr[rc];   /* dequantized value */
@@ -113,7 +113,7 @@
     short *zbin_ptr        = b->zbin;
     short *round_ptr       = b->round;
     short *quant_ptr       = b->quant;
-    unsigned char *quant_shift_ptr = b->quant_shift;
+    short *quant_shift_ptr = b->quant_shift;
     short *qcoeff_ptr      = d->qcoeff;
     short *dqcoeff_ptr     = d->dqcoeff;
     short *dequant_ptr     = d->dequant;
@@ -138,8 +138,8 @@
         if (x >= zbin)
         {
             x += round_ptr[rc];
-            y  = (((x * quant_ptr[rc]) >> 16) + x)
-                 >> quant_shift_ptr[rc];             /* quantize (x) */
+            y  = ((((x * quant_ptr[rc]) >> 16) + x)
+                 * quant_shift_ptr[rc]) >> 16;       /* quantize (x) */
             x  = (y ^ sz) - sz;                      /* get the sign back */
             qcoeff_ptr[rc]  = x;                     /* write to destination */
             dqcoeff_ptr[rc] = x * dequant_ptr[rc];   /* dequantized value */
@@ -167,7 +167,7 @@
     int sz;
     short *coeff_ptr;
     short *quant_ptr;
-    unsigned char *quant_shift_ptr;
+    short *quant_shift_ptr;
     short *qcoeff_ptr;
     short *dqcoeff_ptr;
     short *dequant_ptr;
@@ -198,7 +198,7 @@
         if (x >= dq)
         {
             /* Quantize x. */
-            y  = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc];
+            y  = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16;
             /* Put the sign back. */
             x = (y + sz) ^ sz;
             /* Save the coefficient and its dequantized value. */
@@ -406,7 +406,7 @@
 #define EXACT_QUANT
 #ifdef EXACT_QUANT
 static void invert_quant(int improved_quant, short *quant,
-                               unsigned char *shift, short d)
+                         short *shift, short d)
 {
     if(improved_quant)
     {
@@ -418,11 +418,15 @@
         t = 1 + (1<<(16+l))/d;
         *quant = (short)(t - (1<<16));
         *shift = l;
+        /* use multiplication and constant shift by 16 */
+        *shift = 1 << (16 - *shift);
     }
     else
     {
         *quant = (1 << 16) / d;
         *shift = 0;
+        /* use multiplication and constant shift by 16 */
+        *shift = 1 << (16 - *shift);
     }
 }
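
To see what invert_quant() now produces, here is a small worked sketch of the
improved-quant path with the new multiplier stored in place of the shift count.
The dequant value and printout are illustrative only, and the computation of l
assumes the usual floor(log2(d)) loop from the surrounding function, which is
not shown in the hunk:

#include <stdio.h>

int main(void)
{
    short d = 24;               /* illustrative dequant step */
    unsigned t = d;
    int l = 0;
    short quant, shift_factor;

    while (t > 1)               /* l = floor(log2(d)), as in invert_quant() */
    {
        t >>= 1;
        l++;
    }
    t = 1 + (1 << (16 + l)) / d;
    quant = (short)(t - (1 << 16));
    shift_factor = (short)(1 << (16 - l));

    /* d=24 -> l=4, quant=-21845, shift_factor=4096, so
     * ((((x * quant) >> 16) + x) * 4096) >> 16 approximates x / 24. */
    printf("d=%d quant=%d shift_factor=%d\n", d, quant, shift_factor);
    return 0;
}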
 
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ /dev/null
@@ -1,245 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp8_asm_enc_offsets.asm"
-
-
-; void vp8_regular_quantize_b_sse2 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp8_regular_quantize_b_sse2) PRIVATE
-sym(vp8_regular_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SAVE_XMM 7
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-
-    ALIGN_STACK 16, rax
-    %define zrun_zbin_boost   0  ;  8
-    %define abs_minus_zbin    8  ; 32
-    %define temp_qcoeff       40 ; 32
-    %define qcoeff            72 ; 32
-    %define stack_size        104
-    sub         rsp, stack_size
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rdx, [rdi + vp8_block_coeff] ; coeff_ptr
-    mov         rcx, [rdi + vp8_block_zbin] ; zbin_ptr
-    movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value
-
-    ; z
-    movdqa      xmm0, [rdx]
-    movdqa      xmm4, [rdx + 16]
-    mov         rdx, [rdi + vp8_block_round] ; round_ptr
-
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; (z ^ sz)
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-
-    ; x = abs(z)
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-    mov         rcx, [rdi + vp8_block_quant] ; quant_ptr
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm2, xmm7
-    paddw       xmm3, xmm7
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm1, xmm2
-    psubw       xmm5, xmm3
-    movdqa      [rsp + abs_minus_zbin], xmm1
-    movdqa      [rsp + abs_minus_zbin + 16], xmm5
-
-    ; add (zbin_ptr + zbin_oq_value) back
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    movdqa      xmm2, [rdx]
-    movdqa      xmm6, [rdx + 16]
-
-    movdqa      xmm3, [rcx]
-    movdqa      xmm7, [rcx + 16]
-
-    ; x + round
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm6
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm3, xmm1
-    pmulhw      xmm7, xmm5
-
-    ; y += x
-    paddw       xmm1, xmm3
-    paddw       xmm5, xmm7
-
-    movdqa      [rsp + temp_qcoeff], xmm1
-    movdqa      [rsp + temp_qcoeff + 16], xmm5
-
-    pxor        xmm6, xmm6
-    ; zero qcoeff
-    movdqa      [rsp + qcoeff], xmm6
-    movdqa      [rsp + qcoeff + 16], xmm6
-
-    mov         rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
-    mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
-    mov         [rsp + zrun_zbin_boost], rdx
-
-%macro ZIGZAG_LOOP 1
-    ; x
-    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1           ; x < zbin
-
-    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
-
-    ; downshift by quant_shift[rc]
-    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1           ; !y
-    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP  0
-ZIGZAG_LOOP  1
-ZIGZAG_LOOP  4
-ZIGZAG_LOOP  8
-ZIGZAG_LOOP  5
-ZIGZAG_LOOP  2
-ZIGZAG_LOOP  3
-ZIGZAG_LOOP  6
-ZIGZAG_LOOP  9
-ZIGZAG_LOOP 12
-ZIGZAG_LOOP 13
-ZIGZAG_LOOP 10
-ZIGZAG_LOOP  7
-ZIGZAG_LOOP 11
-ZIGZAG_LOOP 14
-ZIGZAG_LOOP 15
-
-    movdqa      xmm2, [rsp + qcoeff]
-    movdqa      xmm3, [rsp + qcoeff + 16]
-
-    mov         rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr
-    mov         rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
-
-    ; y ^ sz
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm4
-    ; x = (y ^ sz) - sz
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm4
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr
-
-    pmullw      xmm0, xmm2
-    pmullw      xmm1, xmm3
-
-    movdqa      [rcx], xmm2        ; store qcoeff
-    movdqa      [rcx + 16], xmm3
-    movdqa      [rdi], xmm0        ; store dqcoeff
-    movdqa      [rdi + 16], xmm1
-
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    ; select the last value (in zig_zag order) for EOB
-    pcmpeqw     xmm2, xmm6
-    pcmpeqw     xmm3, xmm6
-    ; !
-    pcmpeqw     xmm6, xmm6
-    pxor        xmm2, xmm6
-    pxor        xmm3, xmm6
-    ; mask inv_zig_zag
-    pand        xmm2, [GLOBAL(inv_zig_zag)]
-    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
-    ; select the max value
-    pmaxsw      xmm2, xmm3
-    pshufd      xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00000001b
-    pmaxsw      xmm2, xmm3
-    movd        eax, xmm2
-    and         eax, 0xff
-
-    mov         BYTE PTR [rcx], al          ; store eob
-
-    ; begin epilog
-    add         rsp, stack_size
-    pop         rsp
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-    RESTORE_GOT
-    RESTORE_XMM
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-inv_zig_zag:
-  dw 0x0001, 0x0002, 0x0006, 0x0007
-  dw 0x0003, 0x0005, 0x0008, 0x000d
-  dw 0x0004, 0x0009, 0x000c, 0x000e
-  dw 0x000a, 0x000b, 0x000f, 0x0010
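
For comparison with the new scalar EOB loop, the assembly removed above
selected the EOB with a pmaxsw reduction: mask each nonzero qcoeff lane with
its 1-based scan position from inv_zig_zag, then take the horizontal maximum.
An intrinsics rendering of that reduction (a sketch written for this note, not
code from the patch):

#include <emmintrin.h>

static int eob_from_qcoeff(__m128i qcoeff0, __m128i qcoeff1,
                           __m128i inv_zig_zag0, __m128i inv_zig_zag1)
{
    __m128i zero = _mm_setzero_si128();
    __m128i ones = _mm_cmpeq_epi16(zero, zero);
    /* 0xffff where qcoeff == 0, inverted to get 0xffff where nonzero */
    __m128i nz0 = _mm_xor_si128(_mm_cmpeq_epi16(qcoeff0, zero), ones);
    __m128i nz1 = _mm_xor_si128(_mm_cmpeq_epi16(qcoeff1, zero), ones);
    /* keep the scan position (1..16) of each surviving coefficient */
    __m128i pos0 = _mm_and_si128(nz0, inv_zig_zag0);
    __m128i pos1 = _mm_and_si128(nz1, inv_zig_zag1);
    /* horizontal max across the 16 lanes */
    __m128i m = _mm_max_epi16(pos0, pos1);
    m = _mm_max_epi16(m, _mm_shuffle_epi32(m, 0x0e));
    m = _mm_max_epi16(m, _mm_shufflelo_epi16(m, 0x0e));
    m = _mm_max_epi16(m, _mm_shufflelo_epi16(m, 0x01));
    return _mm_cvtsi128_si32(m) & 0xff;
}
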
--- a/vp8/encoder/x86/quantize_sse2.c
+++ b/vp8/encoder/x86/quantize_sse2.c
@@ -9,13 +9,140 @@
  */
 
 
-#include "vp8/common/blockd.h"
-#include "vp8/common/entropy.h"
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_ports/x86.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vp8/encoder/block.h"
+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
 
-#include <mmintrin.h> //MMX
-#include <xmmintrin.h> //SSE
-#include <emmintrin.h> //SSE2
+#include <mmintrin.h> /* MMX */
+#include <xmmintrin.h> /* SSE */
+#include <emmintrin.h> /* SSE2 */
+
+#define SELECT_EOB(i, z) \
+    do { \
+        __label__ select_eob_end; \
+        short boost = *zbin_boost_ptr; \
+        int cmp = (x[z] < boost) | (y[z] == 0); \
+        zbin_boost_ptr++; \
+        if (cmp) \
+            goto select_eob_end; \
+        qcoeff_ptr[z] = y[z]; \
+        eob = i; \
+        zbin_boost_ptr = b->zrun_zbin_boost; \
+        select_eob_end:; \
+    } while (0)
+
+void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+    char eob = 0;
+    short *zbin_boost_ptr  = b->zrun_zbin_boost;
+    short *qcoeff_ptr      = d->qcoeff;
+    DECLARE_ALIGNED_ARRAY(16, short, x, 16);
+    DECLARE_ALIGNED_ARRAY(16, short, y, 16);
+
+    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
+    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
+    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
+    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));
+    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
+    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
+    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
+    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
+    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
+    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+
+    vpx_memset(qcoeff_ptr, 0, 32);
+
+    /* Duplicate to all lanes. */
+    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
+    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
+
+    /* Sign of z: z >> 15 */
+    sz0 = _mm_srai_epi16(z0, 15);
+    sz1 = _mm_srai_epi16(z1, 15);
+
+    /* x = abs(z): (z ^ sz) - sz */
+    x0 = _mm_xor_si128(z0, sz0);
+    x1 = _mm_xor_si128(z1, sz1);
+    x0 = _mm_sub_epi16(x0, sz0);
+    x1 = _mm_sub_epi16(x1, sz1);
+
+    /* zbin[] + zbin_extra */
+    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
+    zbin1 = _mm_add_epi16(zbin1, zbin_extra);
+
+    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
+     * the equation because boost is the only value which can change:
+     * x - (zbin[] + extra) >= boost */
+    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
+    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
+
+    _mm_store_si128((__m128i *)(x), x_minus_zbin0);
+    _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);
+
+    /* All the remaining calculations are valid whether they are done now with
+     * simd or later inside the loop one at a time. */
+    x0 = _mm_add_epi16(x0, round0);
+    x1 = _mm_add_epi16(x1, round1);
+
+    y0 = _mm_mulhi_epi16(x0, quant0);
+    y1 = _mm_mulhi_epi16(x1, quant1);
+
+    y0 = _mm_add_epi16(y0, x0);
+    y1 = _mm_add_epi16(y1, x1);
+
+    /* Instead of shifting each value independently we convert the scaling
+     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
+    y0 = _mm_mulhi_epi16(y0, quant_shift0);
+    y1 = _mm_mulhi_epi16(y1, quant_shift1);
+
+    /* Return the sign: (y ^ sz) - sz */
+    y0 = _mm_xor_si128(y0, sz0);
+    y1 = _mm_xor_si128(y1, sz1);
+    y0 = _mm_sub_epi16(y0, sz0);
+    y1 = _mm_sub_epi16(y1, sz1);
+
+    _mm_store_si128((__m128i *)(y), y0);
+    _mm_store_si128((__m128i *)(y + 8), y1);
+
+    zbin_boost_ptr = b->zrun_zbin_boost;
+
+    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
+    SELECT_EOB(1, 0);
+    SELECT_EOB(2, 1);
+    SELECT_EOB(3, 4);
+    SELECT_EOB(4, 8);
+    SELECT_EOB(5, 5);
+    SELECT_EOB(6, 2);
+    SELECT_EOB(7, 3);
+    SELECT_EOB(8, 6);
+    SELECT_EOB(9, 9);
+    SELECT_EOB(10, 12);
+    SELECT_EOB(11, 13);
+    SELECT_EOB(12, 10);
+    SELECT_EOB(13, 7);
+    SELECT_EOB(14, 11);
+    SELECT_EOB(15, 14);
+    SELECT_EOB(16, 15);
+
+    y0 = _mm_load_si128((__m128i *)(d->qcoeff));
+    y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));
+
+    /* dqcoeff = qcoeff * dequant */
+    y0 = _mm_mullo_epi16(y0, dequant0);
+    y1 = _mm_mullo_epi16(y1, dequant1);
+
+    _mm_store_si128((__m128i *)(d->dqcoeff), y0);
+    _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);
+
+    *d->eob = eob;
+}
 
 void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
 {
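The unrolled SELECT_EOB(i, z) chain in the new quantize_sse2.c is equivalent to
the following scalar loop over the zigzag scan (a sketch; the standalone table
and helper name are not from the patch, and qcoeff is assumed pre-zeroed as in
the intrinsic version):

static const int zigzag[16] = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };

static char select_eob(const short *x, const short *y, short *qcoeff,
                       const short *zrun_zbin_boost)
{
    const short *boost = zrun_zbin_boost;
    char eob = 0;
    int i;

    for (i = 0; i < 16; i++)
    {
        int z = zigzag[i];
        short b = *boost++;           /* boost advances even for zero runs */
        if (x[z] < b || y[z] == 0)
            continue;                 /* coefficient quantized to zero */
        qcoeff[z] = y[z];             /* keep the quantized value */
        eob = (char)(i + 1);          /* EOB counts positions, not indices */
        boost = zrun_zbin_boost;      /* reset the zero-run boost */
    }
    return eob;
}

This keeps zbin_boost_ptr advancing through zero runs and resetting after every
surviving coefficient, matching what the removed ZIGZAG_LOOP macro did.
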
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -90,7 +90,6 @@
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
 
 # TODO(johann) make this generic
 ifeq ($(HAVE_SSE2),yes)