shithub: libvpx

--- a/vp8/encoder/quantize.c

+++ b/vp8/encoder/quantize.c

@@ -129,9 +129,6 @@

         rc   = vp8_default_zig_zag1d[i];

         z    = coeff_ptr[rc];

-        //if ( i == 0 )

-        //    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value/2;

-        //else

         zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;

         zbin_boost_ptr ++;

@@ -144,13 +141,13 @@

             y  = (((x * quant_ptr[rc]) >> 16) + x)

                  >> quant_shift_ptr[rc];                // quantize (x)

             x  = (y ^ sz) - sz;                         // get the sign back

-            qcoeff_ptr[rc]  = x;                         // write to destination

-            dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value

+            qcoeff_ptr[rc]  = x;                        // write to destination

+            dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value

             if (y)

                 eob = i;                                // last nonzero coeffs

-                zbin_boost_ptr = &b->zrun_zbin_boost[0];    // reset zero runlength

+                zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength

--- a/vp8/encoder/x86/quantize_sse2.asm

+++ b/vp8/encoder/x86/quantize_sse2.asm

@@ -11,221 +11,170 @@

 %include "vpx_ports/x86_abi_support.asm"

-;int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,

-;               short *qcoeff_ptr,short *dequant_ptr,

-;               const int *default_zig_zag, short *round_ptr,

-;               short *quant_ptr, short *dqcoeff_ptr,

+;int vp8_regular_quantize_b_impl_sse2(

+;               short *coeff_ptr,

+;               short *zbin_ptr,

+;               short *qcoeff_ptr,

+;               short *dequant_ptr,

+;               const int *default_zig_zag,

+;               short *round_ptr,

+;               short *quant_ptr,

+;               short *dqcoeff_ptr,

 ;               unsigned short zbin_oq_value,

-;               short *zbin_boost_ptr);

+;               short *zbin_boost_ptr,

+;               short *quant_shift_ptr);

 global sym(vp8_regular_quantize_b_impl_sse2)

 sym(vp8_regular_quantize_b_impl_sse2):

     push        rbp

     mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 10

+    SHADOW_ARGS_TO_STACK 11

+    SAVE_XMM

     push        rsi

     push        rdi

     push        rbx

+    ALIGN_STACK 16, rax

+    %define abs_minus_zbin    0

+    %define temp_qcoeff       32

+    %define qcoeff            64

+    %define eob_tmp           96

+    %define stack_size        112

+    sub         rsp, stack_size

     ; end prolog

-    ALIGN_STACK 16, rax

+    mov         rdx, arg(0)                 ; coeff_ptr

+    mov         rcx, arg(1)                 ; zbin_ptr

+    movd        xmm7, arg(8)                ; zbin_oq_value

+    mov         rdi, arg(5)                 ; round_ptr

+    mov         rsi, arg(6)                 ; quant_ptr

-    %define abs_minus_zbin_lo 0

-    %define abs_minus_zbin_hi 16

-    %define temp_qcoeff_lo 32

-    %define temp_qcoeff_hi 48

-    %define save_xmm6 64

-    %define save_xmm7 80

-    %define eob 96

-    %define vp8_regularquantizeb_stack_size eob + 16

-    sub         rsp, vp8_regularquantizeb_stack_size

-    movdqa      OWORD PTR[rsp + save_xmm6], xmm6

-    movdqa      OWORD PTR[rsp + save_xmm7], xmm7

-    mov         rdx, arg(0)                 ;coeff_ptr

-    mov         eax, arg(8)                 ;zbin_oq_value

-    mov         rcx, arg(1)                 ;zbin_ptr

-    movd        xmm7, eax

+    ; z

     movdqa      xmm0, OWORD PTR[rdx]

     movdqa      xmm4, OWORD PTR[rdx + 16]

+    pshuflw     xmm7, xmm7, 0

+    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value

     movdqa      xmm1, xmm0

     movdqa      xmm5, xmm4

-    psraw       xmm0, 15                    ;sign of z (aka sz)

-    psraw       xmm4, 15                    ;sign of z (aka sz)

+    ; sz

+    psraw       xmm0, 15

+    psraw       xmm4, 15

+    ; (z ^ sz)

     pxor        xmm1, xmm0

     pxor        xmm5, xmm4

-    movdqa      xmm2, OWORD PTR[rcx]        ;load zbin_ptr

-    movdqa      xmm3, OWORD PTR[rcx + 16]   ;load zbin_ptr

+    ; x = abs(z)

+    psubw       xmm1, xmm0

+    psubw       xmm5, xmm4

-    pshuflw     xmm7, xmm7, 0

-    psubw       xmm1, xmm0                  ;x = abs(z)

+    movdqa      xmm2, OWORD PTR[rcx]

+    movdqa      xmm3, OWORD PTR[rcx + 16]

-    punpcklwd   xmm7, xmm7                  ;duplicated zbin_oq_value

-    psubw       xmm5, xmm4                  ;x = abs(z)

+    ; *zbin_ptr + zbin_oq_value

     paddw       xmm2, xmm7

     paddw       xmm3, xmm7

-    psubw       xmm1, xmm2                  ;sub (zbin_ptr + zbin_oq_value)

-    psubw       xmm5, xmm3                  ;sub (zbin_ptr + zbin_oq_value)

+    ; x - (*zbin_ptr + zbin_oq_value)

+    psubw       xmm1, xmm2

+    psubw       xmm5, xmm3

+    movdqa      OWORD PTR[rsp + abs_minus_zbin], xmm1

+    movdqa      OWORD PTR[rsp + abs_minus_zbin + 16], xmm5

-    mov         rdi, arg(5)                 ;round_ptr

-    mov         rsi, arg(6)                 ;quant_ptr

+    ; add (zbin_ptr + zbin_oq_value) back

+    paddw       xmm1, xmm2

+    paddw       xmm5, xmm3

-    movdqa      OWORD PTR[rsp + abs_minus_zbin_lo], xmm1

-    movdqa      OWORD PTR[rsp + abs_minus_zbin_hi], xmm5

-    paddw       xmm1, xmm2                  ;add (zbin_ptr + zbin_oq_value) back

-    paddw       xmm5, xmm3                  ;add (zbin_ptr + zbin_oq_value) back

     movdqa      xmm2, OWORD PTR[rdi]

-    movdqa      xmm3, OWORD PTR[rsi]

     movdqa      xmm6, OWORD PTR[rdi + 16]

+    movdqa      xmm3, OWORD PTR[rsi]

     movdqa      xmm7, OWORD PTR[rsi + 16]

+    ; x + round

     paddw       xmm1, xmm2

     paddw       xmm5, xmm6

-    pmulhw      xmm1, xmm3

-    pmulhw      xmm5, xmm7

+    ; y = x * quant_ptr >> 16

+    pmulhw      xmm3, xmm1

+    pmulhw      xmm7, xmm5

-    mov         rsi, arg(2)                 ;qcoeff_ptr

-    pxor        xmm6, xmm6

+    ; y += x

+    paddw       xmm1, xmm3

+    paddw       xmm5, xmm7

-    pxor        xmm1, xmm0

-    pxor        xmm5, xmm4

+    movdqa      OWORD PTR[rsp + temp_qcoeff], xmm1

+    movdqa      OWORD PTR[rsp + temp_qcoeff + 16], xmm5

-    psubw       xmm1, xmm0

-    psubw       xmm5, xmm4

+    pxor        xmm6, xmm6

+    ; zero qcoeff

+    movdqa      OWORD PTR[rsp + qcoeff], xmm6

+    movdqa      OWORD PTR[rsp + qcoeff + 16], xmm6

-    movdqa      OWORD PTR[rsp + temp_qcoeff_lo], xmm1

-    movdqa      OWORD PTR[rsp + temp_qcoeff_hi], xmm5

+    mov         [rsp + eob_tmp], DWORD -1   ; eob

+    mov         rsi, arg(9)                 ; zbin_boost_ptr

+    mov         rdi, arg(4)                 ; default_zig_zag

+    mov         rax, arg(10)                ; quant_shift_ptr

-    movdqa      OWORD PTR[rsi], xmm6        ;zero qcoeff

-    movdqa      OWORD PTR[rsi + 16], xmm6   ;zero qcoeff

+%macro ZIGZAG_LOOP 2 ; %1 = zig-zag position i, %2 = label suffix of the next step to jump to

+rq_zigzag_loop_%1:

+    movsxd      rdx, DWORD PTR[rdi + (%1 * 4)] ; rc = default_zig_zag[i]

+    movsx       ebx, WORD PTR [rsi]         ; ebx = *zbin_boost_ptr

+    lea         rsi, [rsi + 2]              ; zbin_boost_ptr++

-    xor         rax, rax

-    mov         rcx, -1

+    ; ecx = abs_minus_zbin[rc], i.e. x - (zbin_ptr[rc] + zbin_oq_value)

+    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]

-    mov         [rsp + eob], rcx

-    mov         rsi, arg(9)                 ;zbin_boost_ptr

+    ; if (x >= zbin), where zbin also includes the *zbin_boost_ptr term loaded above

+    sub         ecx, ebx                    ; x - zbin (sets the flags tested by jl)

+    jl          rq_zigzag_loop_%2           ; x < zbin: leave qcoeff[rc] zero, go to the next step

-    mov         rbx, arg(4)                 ;default_zig_zag

+    movsx       ebx, WORD PTR[rsp + temp_qcoeff + rdx *2] ; ebx = temp_qcoeff[rc] (value before the downshift)

-rq_zigzag_loop:

-    movsxd      rcx, DWORD PTR[rbx + rax*4] ;now we have rc

-    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin

-    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

+    ; downshift by quant_shift_ptr[rc]

+    movsx       ecx, WORD PTR[rax + rdx*2]  ; cl = quant_shift_ptr[rc]

+    sar         ebx, cl                     ; y = ebx >> cl; sets ZF for the je below. NOTE(review): sar leaves flags unchanged when cl == 0 - confirm quant_shift is never 0

+    je          rq_zigzag_loop_%2           ; !y: quantized to zero, go to the next step

+    mov         WORD PTR[rsp + qcoeff + rdx * 2], bx ; stack qcoeff[rc] = y (sign is applied after the last step)

-    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]

+    mov         rsi, arg(9)                 ; reset zbin_boost_ptr to b->zrun_zbin_boost

+    mov         [rsp + eob_tmp], DWORD %1   ; eob = i (32-bit store)

+%endmacro

+ZIGZAG_LOOP 0, 1

+ZIGZAG_LOOP 1, 2

+ZIGZAG_LOOP 2, 3

+ZIGZAG_LOOP 3, 4

+ZIGZAG_LOOP 4, 5

+ZIGZAG_LOOP 5, 6

+ZIGZAG_LOOP 6, 7

+ZIGZAG_LOOP 7, 8

+ZIGZAG_LOOP 8, 9

+ZIGZAG_LOOP 9, 10

+ZIGZAG_LOOP 10, 11

+ZIGZAG_LOOP 11, 12

+ZIGZAG_LOOP 12, 13

+ZIGZAG_LOOP 13, 14

+ZIGZAG_LOOP 14, 15

+ZIGZAG_LOOP 15, end

+rq_zigzag_loop_end:

-    sub         edx, edi                    ;x - zbin

-    jl          rq_zigzag_1

+    mov         rbx, arg(2)                 ; qcoeff_ptr

+    mov         rcx, arg(3)                 ; dequant_ptr

+    mov         rsi, arg(7)                 ; dqcoeff_ptr

+    mov         rax, [rsp + eob_tmp]        ; eob

-    mov         rdi, arg(2)                 ;qcoeff_ptr

+    movdqa      xmm2, OWORD PTR[rsp + qcoeff]

+    movdqa      xmm3, OWORD PTR[rsp + qcoeff + 16]

-    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

+    ; y ^ sz

+    pxor        xmm2, xmm0

+    pxor        xmm3, xmm4

+    ; x = (y ^ sz) - sz

+    psubw       xmm2, xmm0

+    psubw       xmm3, xmm4

-    cmp         edx, 0

-    je          rq_zigzag_1

-    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

-    mov         rsi, arg(9)                 ;zbin_boost_ptr

-    mov         [rsp + eob], rax            ;eob = i

-rq_zigzag_1:

-    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]

-    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin

-    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

-    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]

-    lea         rax, [rax + 1]

-    sub         edx, edi                    ;x - zbin

-    jl          rq_zigzag_1a

-    mov         rdi, arg(2)                 ;qcoeff_ptr

-    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

-    cmp         edx, 0

-    je          rq_zigzag_1a

-    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

-    mov         rsi, arg(9)                 ;zbin_boost_ptr

-    mov         [rsp + eob], rax            ;eob = i

-rq_zigzag_1a:

-    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]

-    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin

-    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

-    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]

-    lea         rax, [rax + 1]

-    sub         edx, edi                    ;x - zbin

-    jl          rq_zigzag_1b

-    mov         rdi, arg(2)                 ;qcoeff_ptr

-    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

-    cmp         edx, 0

-    je          rq_zigzag_1b

-    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

-    mov         rsi, arg(9)                 ;zbin_boost_ptr

-    mov         [rsp + eob], rax            ;eob = i

-rq_zigzag_1b:

-    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]

-    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin

-    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

-    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]

-    lea         rax, [rax + 1]

-    sub         edx, edi                    ;x - zbin

-    jl          rq_zigzag_1c

-    mov         rdi, arg(2)                 ;qcoeff_ptr

-    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

-    cmp         edx, 0

-    je          rq_zigzag_1c

-    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

-    mov         rsi, arg(9)                 ;zbin_boost_ptr

-    mov         [rsp + eob], rax            ;eob = i

-rq_zigzag_1c:

-    lea         rax, [rax + 1]

-    cmp         rax, 16

-    jl          rq_zigzag_loop

-    mov         rdi, arg(2)                 ;qcoeff_ptr

-    mov         rcx, arg(3)                 ;dequant_ptr

-    mov         rsi, arg(7)                 ;dqcoeff_ptr

-    movdqa      xmm2, OWORD PTR[rdi]

-    movdqa      xmm3, OWORD PTR[rdi + 16]

     movdqa      xmm0, OWORD PTR[rcx]

     movdqa      xmm1, OWORD PTR[rcx + 16]

@@ -232,23 +181,20 @@

     pmullw      xmm0, xmm2

     pmullw      xmm1, xmm3

-    movdqa      OWORD PTR[rsi], xmm0        ;store dqcoeff

-    movdqa      OWORD PTR[rsi + 16], xmm1   ;store dqcoeff

+    movdqa      OWORD PTR[rbx], xmm2

+    movdqa      OWORD PTR[rbx + 16], xmm3

+    movdqa      OWORD PTR[rsi], xmm0        ; store dqcoeff

+    movdqa      OWORD PTR[rsi + 16], xmm1   ; store dqcoeff

-    mov         rax, [rsp + eob]

-    movdqa      xmm6, OWORD PTR[rsp + save_xmm6]

-    movdqa      xmm7, OWORD PTR[rsp + save_xmm7]

     add         rax, 1

-    add         rsp, vp8_regularquantizeb_stack_size

-    pop         rsp

     ; begin epilog

+    add         rsp, stack_size

+    pop         rsp

     pop         rbx

     pop         rdi

     pop         rsi

+    RESTORE_XMM

     UNSHADOW_ARGS

     pop         rbp

ret

--- a/vp8/encoder/x86/quantize_x86.h

+++ b/vp8/encoder/x86/quantize_x86.h

@@ -27,11 +27,11 @@

 #if !CONFIG_RUNTIME_CPU_DETECT

-/* The sse2 quantizer has not been updated to match the new exact

- * quantizer introduced in commit e04e2935

- *#undef vp8_quantize_quantb

- *#define vp8_quantize_quantb vp8_regular_quantize_b_sse2

- */

+// Currently, this function realizes a gain on x86 and a loss on x86_64

+#if ARCH_X86

+#undef vp8_quantize_quantb

+#define vp8_quantize_quantb vp8_regular_quantize_b_sse2

+#endif

 #endif

--- a/vp8/encoder/x86/x86_csystemdependent.c

+++ b/vp8/encoder/x86/x86_csystemdependent.c

@@ -108,37 +108,26 @@

 int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,

-                               short *qcoeff_ptr,short *dequant_ptr,

-                               const int *default_zig_zag, short *round_ptr,

-                               short *quant_ptr, short *dqcoeff_ptr,

-                               unsigned short zbin_oq_value,

-                               short *zbin_boost_ptr);

+                                     short *qcoeff_ptr,short *dequant_ptr,

+                                     const int *default_zig_zag, short *round_ptr,

+                                     short *quant_ptr, short *dqcoeff_ptr,

+                                     unsigned short zbin_oq_value,

+                                     short *zbin_boost_ptr,

+                                     short *quant_shift_ptr);

 void vp8_regular_quantize_b_sse2(BLOCK *b,BLOCKD *d)

-    short *zbin_boost_ptr = b->zrun_zbin_boost;

-    short *coeff_ptr      = b->coeff;

-    short *zbin_ptr       = b->zbin;

-    short *round_ptr      = b->round;

-    short *quant_ptr      = b->quant;

-    short *qcoeff_ptr     = d->qcoeff;

-    short *dqcoeff_ptr    = d->dqcoeff;

-    short *dequant_ptr    = d->dequant;

-    short zbin_oq_value   = b->zbin_extra;

-    d->eob = vp8_regular_quantize_b_impl_sse2(

-        coeff_ptr,

-        zbin_ptr,

-        qcoeff_ptr,

-        dequant_ptr,

-        vp8_default_zig_zag1d,

-        round_ptr,

-        quant_ptr,

-        dqcoeff_ptr,

-        zbin_oq_value,

-        zbin_boost_ptr

-        );

+    d->eob = vp8_regular_quantize_b_impl_sse2(b->coeff,

+                                              b->zbin,

+                                              d->qcoeff,

+                                              d->dequant,

+                                              vp8_default_zig_zag1d,

+                                              b->round,

+                                              b->quant,

+                                              d->dqcoeff,

+                                              b->zbin_extra,

+                                              b->zrun_zbin_boost,

+                                              b->quant_shift);

 int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);

@@ -307,7 +296,9 @@

         cpi->rtcd.encodemb.submby                = vp8_subtract_mby_sse2;

         cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_sse2;

-        /*cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;*/

+#if ARCH_X86

+        cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b_sse2;

+#endif

         cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse2;

         cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_sse2;

--

⑨