shithub: libvpx

--- a/vp9/encoder/vp9_quantize.h

+++ b/vp9/encoder/vp9_quantize.h

@@ -22,10 +22,6 @@

 #define prototype_quantize_mb(sym) \

   void (sym)(MACROBLOCK *x)

-#if ARCH_X86 || ARCH_X86_64

-#include "x86/vp9_quantize_x86.h"

-#endif

 void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2,

                                      int y_blocks);

 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,

--- a/vp9/encoder/x86/vp9_quantize_mmx.asm

+++ /dev/null

@@ -1,286 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-;int vp9_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,

-;                           short *qcoeff_ptr,short *dequant_ptr,

-;                           short *scan_mask, short *round_ptr,

-;                           short *quant_ptr, short *dqcoeff_ptr);

-global sym(vp9_fast_quantize_b_impl_mmx) PRIVATE

-sym(vp9_fast_quantize_b_impl_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 8

-    push rsi

-    push rdi

-    ; end prolog

-        mov             rsi,        arg(0) ;coeff_ptr

-        movq            mm0,        [rsi]

-        mov             rax,        arg(1) ;zbin_ptr

-        movq            mm1,        [rax]

-        movq            mm3,        mm0

-        psraw           mm0,        15

-        pxor            mm3,        mm0

-        psubw           mm3,        mm0         ; abs

-        movq            mm2,        mm3

-        pcmpgtw         mm1,        mm2

-        pandn           mm1,        mm2

-        movq            mm3,        mm1

-        mov             rdx,        arg(6) ;quant_ptr

-        movq            mm1,        [rdx]

-        mov             rcx,        arg(5) ;round_ptr

-        movq            mm2,        [rcx]

-        paddw           mm3,        mm2

-        pmulhuw         mm3,        mm1

-        pxor            mm3,        mm0

-        psubw           mm3,        mm0     ;gain the sign back

-        mov             rdi,        arg(2) ;qcoeff_ptr

-        movq            mm0,        mm3

-        movq            [rdi],      mm3

-        mov             rax,        arg(3) ;dequant_ptr

-        movq            mm2,        [rax]

-        pmullw          mm3,        mm2

-        mov             rax,        arg(7) ;dqcoeff_ptr

-        movq            [rax],      mm3

-        ; next 8

-        movq            mm4,        [rsi+8]

-        mov             rax,        arg(1) ;zbin_ptr

-        movq            mm5,        [rax+8]

-        movq            mm7,        mm4

-        psraw           mm4,        15

-        pxor            mm7,        mm4

-        psubw           mm7,        mm4         ; abs

-        movq            mm6,        mm7

-        pcmpgtw         mm5,        mm6

-        pandn           mm5,        mm6

-        movq            mm7,        mm5

-        movq            mm5,        [rdx+8]

-        movq            mm6,        [rcx+8]

-        paddw           mm7,        mm6

-        pmulhuw         mm7,        mm5

-        pxor            mm7,        mm4

-        psubw           mm7,        mm4;gain the sign back

-        mov             rdi,        arg(2) ;qcoeff_ptr

-        movq            mm1,        mm7

-        movq            [rdi+8],    mm7

-        mov             rax,        arg(3) ;dequant_ptr

-        movq            mm6,        [rax+8]

-        pmullw          mm7,        mm6

-        mov             rax,        arg(7) ;dqcoeff_ptr

-        movq            [rax+8],    mm7

-                ; next 8

-        movq            mm4,        [rsi+16]

-        mov             rax,        arg(1) ;zbin_ptr

-        movq            mm5,        [rax+16]

-        movq            mm7,        mm4

-        psraw           mm4,        15

-        pxor            mm7,        mm4

-        psubw           mm7,        mm4         ; abs

-        movq            mm6,        mm7

-        pcmpgtw         mm5,        mm6

-        pandn           mm5,        mm6

-        movq            mm7,        mm5

-        movq            mm5,        [rdx+16]

-        movq            mm6,        [rcx+16]

-        paddw           mm7,        mm6

-        pmulhuw         mm7,        mm5

-        pxor            mm7,        mm4

-        psubw           mm7,        mm4;gain the sign back

-        mov             rdi,        arg(2) ;qcoeff_ptr

-        movq            mm1,        mm7

-        movq            [rdi+16],   mm7

-        mov             rax,        arg(3) ;dequant_ptr

-        movq            mm6,        [rax+16]

-        pmullw          mm7,        mm6

-        mov             rax,        arg(7) ;dqcoeff_ptr

-        movq            [rax+16],   mm7

-                ; next 8

-        movq            mm4,        [rsi+24]

-        mov             rax,        arg(1) ;zbin_ptr

-        movq            mm5,        [rax+24]

-        movq            mm7,        mm4

-        psraw           mm4,        15

-        pxor            mm7,        mm4

-        psubw           mm7,        mm4         ; abs

-        movq            mm6,        mm7

-        pcmpgtw         mm5,        mm6

-        pandn           mm5,        mm6

-        movq            mm7,        mm5

-        movq            mm5,        [rdx+24]

-        movq            mm6,        [rcx+24]

-        paddw           mm7,        mm6

-        pmulhuw         mm7,        mm5

-        pxor            mm7,        mm4

-        psubw           mm7,        mm4;gain the sign back

-        mov             rdi,        arg(2) ;qcoeff_ptr

-        movq            mm1,        mm7

-        movq            [rdi+24],   mm7

-        mov             rax,        arg(3) ;dequant_ptr

-        movq            mm6,        [rax+24]

-        pmullw          mm7,        mm6

-        mov             rax,        arg(7) ;dqcoeff_ptr

-        movq            [rax+24],   mm7

-        mov             rdi,        arg(4) ;scan_mask

-        mov             rsi,        arg(2) ;qcoeff_ptr

-        pxor            mm5,        mm5

-        pxor            mm7,        mm7

-        movq            mm0,        [rsi]

-        movq            mm1,        [rsi+8]

-        movq            mm2,        [rdi]

-        movq            mm3,        [rdi+8];

-        pcmpeqw         mm0,        mm7

-        pcmpeqw         mm1,        mm7

-        pcmpeqw         mm6,        mm6

-        pxor            mm0,        mm6

-        pxor            mm1,        mm6

-        psrlw           mm0,        15

-        psrlw           mm1,        15

-        pmaddwd         mm0,        mm2

-        pmaddwd         mm1,        mm3

-        movq            mm5,        mm0

-        paddd           mm5,        mm1

-        movq            mm0,        [rsi+16]

-        movq            mm1,        [rsi+24]

-        movq            mm2,        [rdi+16]

-        movq            mm3,        [rdi+24];

-        pcmpeqw         mm0,        mm7

-        pcmpeqw         mm1,        mm7

-        pcmpeqw         mm6,        mm6

-        pxor            mm0,        mm6

-        pxor            mm1,        mm6

-        psrlw           mm0,        15

-        psrlw           mm1,        15

-        pmaddwd         mm0,        mm2

-        pmaddwd         mm1,        mm3

-        paddd           mm5,        mm0

-        paddd           mm5,        mm1

-        movq            mm0,        mm5

-        psrlq           mm5,        32

-        paddd           mm0,        mm5

-        ; eob adjustment begins here

-        movq            rcx,        mm0

-        and             rcx,        0xffff

-        xor             rdx,        rdx

-        sub             rdx,        rcx ; rdx=-rcx

-        bsr             rax,        rcx

-        inc             rax

-        sar             rdx,        31

-        and             rax,        rdx

-        ; Substitute the sse assembly for the old mmx mixed assembly/C. The

-        ; following is kept as reference

-        ;    movq            rcx,        mm0

-        ;    bsr             rax,        rcx

-        ;

-        ;    mov             eob,        rax

-        ;    mov             eee,        rcx

-        ;

-        ;if(eee==0)

-        ;{

-        ;    eob=-1;

-        ;}

-        ;else if(eee<0)

-        ;{

-        ;    eob=15;

-        ;}

-        ;d->eob = eob+1;

-    ; begin epilog

-    pop rdi

-    pop rsi

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

--- a/vp9/encoder/x86/vp9_quantize_sse2.asm

+++ /dev/null

@@ -1,379 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license and patent

-;  grant that can be found in the LICENSE file in the root of the source

-;  tree. All contributing project authors may be found in the AUTHORS

-;  file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-; void vp9_regular_quantize_b_sse2 | arg

-;  (BLOCK  *b,                     |  0

-;   BLOCKD *d)                     |  1

-global sym(vp9_regular_quantize_b_sse2) PRIVATE

-sym(vp9_regular_quantize_b_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SAVE_XMM 7

-    GET_GOT     rbx

-%if ABI_IS_32BIT

-    push        rdi

-    push        rsi

-%else

-  %if LIBVPX_YASM_WIN64

-    push        rdi

-    push        rsi

-  %endif

-%endif

-    ALIGN_STACK 16, rax

-    %define zrun_zbin_boost   0  ;  8

-    %define abs_minus_zbin    8  ; 32

-    %define temp_qcoeff       40 ; 32

-    %define qcoeff            72 ; 32

-    %define stack_size        104

-    sub         rsp, stack_size

-    ; end prolog

-%if ABI_IS_32BIT

-    mov         rdi, arg(0)                 ; BLOCK *b

-    mov         rsi, arg(1)                 ; BLOCKD *d

-%else

-  %if LIBVPX_YASM_WIN64

-    mov         rdi, rcx                    ; BLOCK *b

-    mov         rsi, rdx                    ; BLOCKD *d

-  %else

-    ;mov         rdi, rdi                    ; BLOCK *b

-    ;mov         rsi, rsi                    ; BLOCKD *d

-  %endif

-%endif

-    mov         rdx, [rdi + vp9_block_coeff] ; coeff_ptr

-    mov         rcx, [rdi + vp9_block_zbin] ; zbin_ptr

-    movd        xmm7, [rdi + vp9_block_zbin_extra] ; zbin_oq_value

-    ; z

-    movdqa      xmm0, [rdx]

-    movdqa      xmm4, [rdx + 16]

-    mov         rdx, [rdi + vp9_block_round] ; round_ptr

-    pshuflw     xmm7, xmm7, 0

-    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value

-    movdqa      xmm1, xmm0

-    movdqa      xmm5, xmm4

-    ; sz

-    psraw       xmm0, 15

-    psraw       xmm4, 15

-    ; (z ^ sz)

-    pxor        xmm1, xmm0

-    pxor        xmm5, xmm4

-    ; x = abs(z)

-    psubw       xmm1, xmm0

-    psubw       xmm5, xmm4

-    movdqa      xmm2, [rcx]

-    movdqa      xmm3, [rcx + 16]

-    mov         rcx, [rdi + vp9_block_quant] ; quant_ptr

-    ; *zbin_ptr + zbin_oq_value

-    paddw       xmm2, xmm7

-    paddw       xmm3, xmm7

-    ; x - (*zbin_ptr + zbin_oq_value)

-    psubw       xmm1, xmm2

-    psubw       xmm5, xmm3

-    movdqa      [rsp + abs_minus_zbin], xmm1

-    movdqa      [rsp + abs_minus_zbin + 16], xmm5

-    ; add (zbin_ptr + zbin_oq_value) back

-    paddw       xmm1, xmm2

-    paddw       xmm5, xmm3

-    movdqa      xmm2, [rdx]

-    movdqa      xmm6, [rdx + 16]

-    movdqa      xmm3, [rcx]

-    movdqa      xmm7, [rcx + 16]

-    ; x + round

-    paddw       xmm1, xmm2

-    paddw       xmm5, xmm6

-    ; y = x * quant_ptr >> 16

-    pmulhw      xmm3, xmm1

-    pmulhw      xmm7, xmm5

-    ; y += x

-    paddw       xmm1, xmm3

-    paddw       xmm5, xmm7

-    movdqa      [rsp + temp_qcoeff], xmm1

-    movdqa      [rsp + temp_qcoeff + 16], xmm5

-    pxor        xmm6, xmm6

-    ; zero qcoeff

-    movdqa      [rsp + qcoeff], xmm6

-    movdqa      [rsp + qcoeff + 16], xmm6

-    mov         rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr

-    mov         rax, [rdi + vp9_block_quant_shift] ; quant_shift_ptr

-    mov         [rsp + zrun_zbin_boost], rdx

-%macro ZIGZAG_LOOP 1

-    ; x

-    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]

-    ; if (x >= zbin)

-    sub         cx, WORD PTR[rdx]           ; x - zbin

-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++

-    jl          .rq_zigzag_loop_%1           ; x < zbin

-    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]

-    ; downshift by quant_shift[rc]

-    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]

-    sar         edi, cl                     ; also sets Z bit

-    je          .rq_zigzag_loop_%1           ; !y

-    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]

-    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost

-.rq_zigzag_loop_%1:

-%endmacro

-; in vp9_default_zig_zag1d order: see vp9/common/vp9_entropy.c

-ZIGZAG_LOOP  0

-ZIGZAG_LOOP  1

-ZIGZAG_LOOP  4

-ZIGZAG_LOOP  8

-ZIGZAG_LOOP  5

-ZIGZAG_LOOP  2

-ZIGZAG_LOOP  3

-ZIGZAG_LOOP  6

-ZIGZAG_LOOP  9

-ZIGZAG_LOOP 12

-ZIGZAG_LOOP 13

-ZIGZAG_LOOP 10

-ZIGZAG_LOOP  7

-ZIGZAG_LOOP 11

-ZIGZAG_LOOP 14

-ZIGZAG_LOOP 15

-    movdqa      xmm2, [rsp + qcoeff]

-    movdqa      xmm3, [rsp + qcoeff + 16]

-    mov         rcx, [rsi + vp9_blockd_dequant] ; dequant_ptr

-    mov         rdi, [rsi + vp9_blockd_dqcoeff] ; dqcoeff_ptr

-    ; y ^ sz

-    pxor        xmm2, xmm0

-    pxor        xmm3, xmm4

-    ; x = (y ^ sz) - sz

-    psubw       xmm2, xmm0

-    psubw       xmm3, xmm4

-    ; dequant

-    movdqa      xmm0, [rcx]

-    movdqa      xmm1, [rcx + 16]

-    mov         rcx, [rsi + vp9_blockd_qcoeff] ; qcoeff_ptr

-    pmullw      xmm0, xmm2

-    pmullw      xmm1, xmm3

-    movdqa      [rcx], xmm2        ; store qcoeff

-    movdqa      [rcx + 16], xmm3

-    movdqa      [rdi], xmm0        ; store dqcoeff

-    movdqa      [rdi + 16], xmm1

-    ; select the last value (in zig_zag order) for EOB

-    pcmpeqw     xmm2, xmm6

-    pcmpeqw     xmm3, xmm6

-    ; !

-    pcmpeqw     xmm6, xmm6

-    pxor        xmm2, xmm6

-    pxor        xmm3, xmm6

-    ; mask inv_zig_zag

-    pand        xmm2, [GLOBAL(inv_zig_zag)]

-    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]

-    ; select the max value

-    pmaxsw      xmm2, xmm3

-    pshufd      xmm3, xmm2, 00001110b

-    pmaxsw      xmm2, xmm3

-    pshuflw     xmm3, xmm2, 00001110b

-    pmaxsw      xmm2, xmm3

-    pshuflw     xmm3, xmm2, 00000001b

-    pmaxsw      xmm2, xmm3

-    movd        eax, xmm2

-    and         eax, 0xff

-    mov         [rsi + vp9_blockd_eob], eax

-    ; begin epilog

-    add         rsp, stack_size

-    pop         rsp

-%if ABI_IS_32BIT

-    pop         rsi

-    pop         rdi

-%else

-  %if LIBVPX_YASM_WIN64

-    pop         rsi

-    pop         rdi

-  %endif

-%endif

-    RESTORE_GOT

-    RESTORE_XMM

-    pop         rbp

-    ret

-; void vp9_fast_quantize_b_sse2 | arg

-;  (BLOCK  *b,                  |  0

-;   BLOCKD *d)                  |  1

-global sym(vp9_fast_quantize_b_sse2) PRIVATE

-sym(vp9_fast_quantize_b_sse2):

-    push        rbp

-    mov         rbp, rsp

-    GET_GOT     rbx

-%if ABI_IS_32BIT

-    push        rdi

-    push        rsi

-%else

-  %if LIBVPX_YASM_WIN64

-    push        rdi

-    push        rsi

-  %else

-    ; these registers are used for passing arguments

-  %endif

-%endif

-    ; end prolog

-%if ABI_IS_32BIT

-    mov         rdi, arg(0)                 ; BLOCK *b

-    mov         rsi, arg(1)                 ; BLOCKD *d

-%else

-  %if LIBVPX_YASM_WIN64

-    mov         rdi, rcx                    ; BLOCK *b

-    mov         rsi, rdx                    ; BLOCKD *d

-  %else

-    ;mov         rdi, rdi                    ; BLOCK *b

-    ;mov         rsi, rsi                    ; BLOCKD *d

-  %endif

-%endif

-    mov         rax, [rdi + vp9_block_coeff]

-    mov         rcx, [rdi + vp9_block_round]

-    mov         rdx, [rdi + vp9_block_quant_fast]

-    ; z = coeff

-    movdqa      xmm0, [rax]

-    movdqa      xmm4, [rax + 16]

-    ; dup z so we can save sz

-    movdqa      xmm1, xmm0

-    movdqa      xmm5, xmm4

-    ; sz = z >> 15

-    psraw       xmm0, 15

-    psraw       xmm4, 15

-    ; x = abs(z) = (z ^ sz) - sz

-    pxor        xmm1, xmm0

-    pxor        xmm5, xmm4

-    psubw       xmm1, xmm0

-    psubw       xmm5, xmm4

-    ; x += round

-    paddw       xmm1, [rcx]

-    paddw       xmm5, [rcx + 16]

-    mov         rax, [rsi + vp9_blockd_qcoeff]

-    mov         rcx, [rsi + vp9_blockd_dequant]

-    mov         rdi, [rsi + vp9_blockd_dqcoeff]

-    ; y = x * quant >> 16

-    pmulhw      xmm1, [rdx]

-    pmulhw      xmm5, [rdx + 16]

-    ; x = (y ^ sz) - sz

-    pxor        xmm1, xmm0

-    pxor        xmm5, xmm4

-    psubw       xmm1, xmm0

-    psubw       xmm5, xmm4

-    ; qcoeff = x

-    movdqa      [rax], xmm1

-    movdqa      [rax + 16], xmm5

-    ; x * dequant

-    movdqa      xmm2, xmm1

-    movdqa      xmm3, xmm5

-    pmullw      xmm2, [rcx]

-    pmullw      xmm3, [rcx + 16]

-    ; dqcoeff = x * dequant

-    movdqa      [rdi], xmm2

-    movdqa      [rdi + 16], xmm3

-    pxor        xmm4, xmm4                  ;clear all bits

-    pcmpeqw     xmm1, xmm4

-    pcmpeqw     xmm5, xmm4

-    pcmpeqw     xmm4, xmm4                  ;set all bits

-    pxor        xmm1, xmm4

-    pxor        xmm5, xmm4

-    pand        xmm1, [GLOBAL(inv_zig_zag)]

-    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]

-    pmaxsw      xmm1, xmm5

-    ; now down to 8

-    pshufd      xmm5, xmm1, 00001110b

-    pmaxsw      xmm1, xmm5

-    ; only 4 left

-    pshuflw     xmm5, xmm1, 00001110b

-    pmaxsw      xmm1, xmm5

-    ; okay, just 2!

-    pshuflw     xmm5, xmm1, 00000001b

-    pmaxsw      xmm1, xmm5

-    movd        eax, xmm1

-    and         eax, 0xff

-    mov         [rsi + vp9_blockd_eob], eax

-    ; begin epilog

-%if ABI_IS_32BIT

-    pop         rsi

-    pop         rdi

-%else

-  %if LIBVPX_YASM_WIN64

-    pop         rsi

-    pop         rdi

-  %endif

-%endif

-    RESTORE_GOT

-    pop         rbp

-    ret

-SECTION_RODATA

-align 16

-inv_zig_zag:

-  dw 0x0001, 0x0002, 0x0006, 0x0007

-  dw 0x0003, 0x0005, 0x0008, 0x000d

-  dw 0x0004, 0x0009, 0x000c, 0x000e

-  dw 0x000a, 0x000b, 0x000f, 0x0010

--- a/vp9/encoder/x86/vp9_quantize_sse4.asm

+++ /dev/null

@@ -1,253 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license and patent

-;  grant that can be found in the LICENSE file in the root of the source

-;  tree. All contributing project authors may be found in the AUTHORS

-;  file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-; void vp9_regular_quantize_b_sse4 | arg

-;  (BLOCK  *b,                     |  0

-;   BLOCKD *d)                     |  1

-global sym(vp9_regular_quantize_b_sse4) PRIVATE

-sym(vp9_regular_quantize_b_sse4):

-%if ABI_IS_32BIT

-    push        rbp

-    mov         rbp, rsp

-    GET_GOT     rbx

-    push        rdi

-    push        rsi

-    ALIGN_STACK 16, rax

-    %define qcoeff      0 ; 32

-    %define stack_size 32

-    sub         rsp, stack_size

-%else

-  %if LIBVPX_YASM_WIN64

-    SAVE_XMM 8, u

-    push        rdi

-    push        rsi

-  %endif

-%endif

-    ; end prolog

-%if ABI_IS_32BIT

-    mov         rdi, arg(0)                 ; BLOCK *b

-    mov         rsi, arg(1)                 ; BLOCKD *d

-%else

-  %if LIBVPX_YASM_WIN64

-    mov         rdi, rcx                    ; BLOCK *b

-    mov         rsi, rdx                    ; BLOCKD *d

-  %else

-    ;mov         rdi, rdi                    ; BLOCK *b

-    ;mov         rsi, rsi                    ; BLOCKD *d

-  %endif

-%endif

-    mov         rax, [rdi + vp9_block_coeff]

-    mov         rcx, [rdi + vp9_block_zbin]

-    mov         rdx, [rdi + vp9_block_round]

-    movd        xmm7, [rdi + vp9_block_zbin_extra]

-    ; z

-    movdqa      xmm0, [rax]

-    movdqa      xmm1, [rax + 16]

-    ; duplicate zbin_oq_value

-    pshuflw     xmm7, xmm7, 0

-    punpcklwd   xmm7, xmm7

-    movdqa      xmm2, xmm0

-    movdqa      xmm3, xmm1

-    ; sz

-    psraw       xmm0, 15

-    psraw       xmm1, 15

-    ; (z ^ sz)

-    pxor        xmm2, xmm0

-    pxor        xmm3, xmm1

-    ; x = abs(z)

-    psubw       xmm2, xmm0

-    psubw       xmm3, xmm1

-    ; zbin

-    movdqa      xmm4, [rcx]

-    movdqa      xmm5, [rcx + 16]

-    ; *zbin_ptr + zbin_oq_value

-    paddw       xmm4, xmm7

-    paddw       xmm5, xmm7

-    movdqa      xmm6, xmm2

-    movdqa      xmm7, xmm3

-    ; x - (*zbin_ptr + zbin_oq_value)

-    psubw       xmm6, xmm4

-    psubw       xmm7, xmm5

-    ; round

-    movdqa      xmm4, [rdx]

-    movdqa      xmm5, [rdx + 16]

-    mov         rax, [rdi + vp9_block_quant_shift]

-    mov         rcx, [rdi + vp9_block_quant]

-    mov         rdx, [rdi + vp9_block_zrun_zbin_boost]

-    ; x + round

-    paddw       xmm2, xmm4

-    paddw       xmm3, xmm5

-    ; quant

-    movdqa      xmm4, [rcx]

-    movdqa      xmm5, [rcx + 16]

-    ; y = x * quant_ptr >> 16

-    pmulhw      xmm4, xmm2

-    pmulhw      xmm5, xmm3

-    ; y += x

-    paddw       xmm2, xmm4

-    paddw       xmm3, xmm5

-    pxor        xmm4, xmm4

-%if ABI_IS_32BIT

-    movdqa      [rsp + qcoeff], xmm4

-    movdqa      [rsp + qcoeff + 16], xmm4

-%else

-    pxor        xmm8, xmm8

-%endif

-    ; quant_shift

-    movdqa      xmm5, [rax]

-    ; zrun_zbin_boost

-    mov         rax, rdx

-%macro ZIGZAG_LOOP 5

-    ; x

-    pextrw      ecx, %4, %2

-    ; if (x >= zbin)

-    sub         cx, WORD PTR[rdx]           ; x - zbin

-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++

-    jl          .rq_zigzag_loop_%1          ; x < zbin

-    pextrw      edi, %3, %2                 ; y

-    ; downshift by quant_shift[rc]

-    pextrb      ecx, xmm5, %1               ; quant_shift[rc]

-    sar         edi, cl                     ; also sets Z bit

-    je          .rq_zigzag_loop_%1          ; !y

-%if ABI_IS_32BIT

-    mov         WORD PTR[rsp + qcoeff + %1 *2], di

-%else

-    pinsrw      %5, edi, %2                 ; qcoeff[rc]

-%endif

-    mov         rdx, rax                    ; reset to b->zrun_zbin_boost

-.rq_zigzag_loop_%1:

-%endmacro

-; in vp9_default_zig_zag1d order: see vp9/common/vp9_entropy.c

-ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4

-ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4

-ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4

-ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8

-ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4

-ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4

-ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4

-ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4

-ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8

-ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8

-ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8

-ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8

-ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4

-ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8

-ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8

-ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8

-    mov         rcx, [rsi + vp9_blockd_dequant]

-    mov         rdi, [rsi + vp9_blockd_dqcoeff]

-%if ABI_IS_32BIT

-    movdqa      xmm4, [rsp + qcoeff]

-    movdqa      xmm5, [rsp + qcoeff + 16]

-%else

-    %define     xmm5 xmm8

-%endif

-    ; y ^ sz

-    pxor        xmm4, xmm0

-    pxor        xmm5, xmm1

-    ; x = (y ^ sz) - sz

-    psubw       xmm4, xmm0

-    psubw       xmm5, xmm1

-    ; dequant

-    movdqa      xmm0, [rcx]

-    movdqa      xmm1, [rcx + 16]

-    mov         rcx, [rsi + vp9_blockd_qcoeff]

-    pmullw      xmm0, xmm4

-    pmullw      xmm1, xmm5

-    ; store qcoeff

-    movdqa      [rcx], xmm4

-    movdqa      [rcx + 16], xmm5

-    ; store dqcoeff

-    movdqa      [rdi], xmm0

-    movdqa      [rdi + 16], xmm1

-    ; select the last value (in zig_zag order) for EOB

-    pxor        xmm6, xmm6

-    pcmpeqw     xmm4, xmm6

-    pcmpeqw     xmm5, xmm6

-    packsswb    xmm4, xmm5

-    pshufb      xmm4, [GLOBAL(zig_zag1d)]

-    pmovmskb    edx, xmm4

-    xor         rdi, rdi

-    mov         eax, -1

-    xor         dx, ax

-    bsr         eax, edx

-    sub         edi, edx

-    sar         edi, 31

-    add         eax, 1

-    and         eax, edi

-    mov         [rsi + vp9_blockd_eob], eax

-    ; begin epilog

-%if ABI_IS_32BIT

-    add         rsp, stack_size

-    pop         rsp

-    pop         rsi

-    pop         rdi

-    RESTORE_GOT

-    pop         rbp

-%else

-  %undef xmm5

-  %if LIBVPX_YASM_WIN64

-    pop         rsi

-    pop         rdi

-    RESTORE_XMM

-  %endif

-%endif

-    ret

-SECTION_RODATA

-align 16

-; vp9/common/vp9_entropy.c: vp9_default_zig_zag1d

-zig_zag1d:

-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15

--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm

+++ /dev/null

@@ -1,137 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license and patent

-;  grant that can be found in the LICENSE file in the root of the source

-;  tree. All contributing project authors may be found in the AUTHORS

-;  file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-; void vp9_fast_quantize_b_ssse3 | arg

-;  (BLOCK  *b,                   |  0

-;   BLOCKD *d)                   |  1

-;

-global sym(vp9_fast_quantize_b_ssse3) PRIVATE

-sym(vp9_fast_quantize_b_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    GET_GOT     rbx

-%if ABI_IS_32BIT

-    push        rdi

-    push        rsi

-%else

-  %if LIBVPX_YASM_WIN64

-    push        rdi

-    push        rsi

-  %endif

-%endif

-    ; end prolog

-%if ABI_IS_32BIT

-    mov         rdi, arg(0)                 ; BLOCK *b

-    mov         rsi, arg(1)                 ; BLOCKD *d

-%else

-  %if LIBVPX_YASM_WIN64

-    mov         rdi, rcx                    ; BLOCK *b

-    mov         rsi, rdx                    ; BLOCKD *d

-  %else

-    ;mov         rdi, rdi                    ; BLOCK *b

-    ;mov         rsi, rsi                    ; BLOCKD *d

-  %endif

-%endif

-    mov         rax, [rdi + vp9_block_coeff]

-    mov         rcx, [rdi + vp9_block_round]

-    mov         rdx, [rdi + vp9_block_quant_fast]

-    ; coeff

-    movdqa      xmm0, [rax]

-    movdqa      xmm4, [rax + 16]

-    ; round

-    movdqa      xmm2, [rcx]

-    movdqa      xmm3, [rcx + 16]

-    movdqa      xmm1, xmm0

-    movdqa      xmm5, xmm4

-    ; sz = z >> 15

-    psraw       xmm0, 15

-    psraw       xmm4, 15

-    pabsw       xmm1, xmm1

-    pabsw       xmm5, xmm5

-    paddw       xmm1, xmm2

-    paddw       xmm5, xmm3

-    ; quant_fast

-    pmulhw      xmm1, [rdx]

-    pmulhw      xmm5, [rdx + 16]

-    mov         rax, [rsi + vp9_blockd_qcoeff]

-    mov         rdi, [rsi + vp9_blockd_dequant]

-    mov         rcx, [rsi + vp9_blockd_dqcoeff]

-    pxor        xmm1, xmm0

-    pxor        xmm5, xmm4

-    psubw       xmm1, xmm0

-    psubw       xmm5, xmm4

-    movdqa      [rax], xmm1

-    movdqa      [rax + 16], xmm5

-    movdqa      xmm2, [rdi]

-    movdqa      xmm3, [rdi + 16]

-    pxor        xmm4, xmm4

-    pmullw      xmm2, xmm1

-    pmullw      xmm3, xmm5

-    pcmpeqw     xmm1, xmm4                  ;non zero mask

-    pcmpeqw     xmm5, xmm4                  ;non zero mask

-    packsswb    xmm1, xmm5

-    pshufb      xmm1, [GLOBAL(zz_shuf)]

-    pmovmskb    edx, xmm1

-    xor         rdi, rdi

-    mov         eax, -1

-    xor         dx, ax                      ;flip the bits for bsr

-    bsr         eax, edx

-    movdqa      [rcx], xmm2                 ;store dqcoeff

-    movdqa      [rcx + 16], xmm3            ;store dqcoeff

-    sub         edi, edx                    ;check for all zeros in bit mask

-    sar         edi, 31                     ;0 or -1

-    add         eax, 1

-    and         eax, edi                    ;if the bit mask was all zero,

-                                            ;then eob = 0

-    mov         [rsi + vp9_blockd_eob], eax

-    ; begin epilog

-%if ABI_IS_32BIT

-    pop         rsi

-    pop         rdi

-%else

-  %if LIBVPX_YASM_WIN64

-    pop         rsi

-    pop         rdi

-  %endif

-%endif

-    RESTORE_GOT

-    pop         rbp

-    ret

-SECTION_RODATA

-align 16

-zz_shuf:

-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15

--- a/vp9/encoder/x86/vp9_quantize_x86.h

+++ /dev/null

@@ -1,48 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license and patent

- *  grant that can be found in the LICENSE file in the root of the source

- *  tree. All contributing project authors may be found in the AUTHORS

- *  file in the root of the source tree.

- */

-#ifndef VP9_ENCODER_X86_VP9_QUANTIZE_X86_H_

-#define VP9_ENCODER_X86_VP9_QUANTIZE_X86_H_

-/* Note:

- *

- * This platform is commonly built for runtime CPU detection. If you modify

- * any of the function mappings present in this file, be sure to also update

- * them in the function pointer initialization code

- */

-#if HAVE_MMX

-#endif /* HAVE_MMX */

-#if HAVE_SSE2

-extern prototype_quantize_block(vp9_regular_quantize_b_sse2);

-#if !CONFIG_RUNTIME_CPU_DETECT

-#undef vp9_quantize_quantb

-#define vp9_quantize_quantb vp9_regular_quantize_b_sse2

-#endif /* !CONFIG_RUNTIME_CPU_DETECT */

-#endif /* HAVE_SSE2 */

-#if HAVE_SSE4_1

-extern prototype_quantize_block(vp9_regular_quantize_b_sse4);

-#if !CONFIG_RUNTIME_CPU_DETECT

-#undef vp9_quantize_quantb

-#define vp9_quantize_quantb vp9_regular_quantize_b_sse4

-#endif /* !CONFIG_RUNTIME_CPU_DETECT */

-#endif /* HAVE_SSE4_1 */

-#endif /* QUANTIZE_X86_H */

--- a/vp9/vp9cx.mk

+++ b/vp9/vp9cx.mk

@@ -17,15 +17,6 @@

 VP9_CX_SRCS-yes += vp9_cx_iface.c

-# encoder

-#INCLUDES += algo/vpx_common/vpx_mem/include

-#INCLUDES += common

-#INCLUDES += common

-#INCLUDES += common

-#INCLUDES += algo/vpx_ref/cpu_id/include

-#INCLUDES += common

-#INCLUDES += encoder

 VP9_CX_SRCS-yes += encoder/vp9_bitstream.c

 VP9_CX_SRCS-yes += encoder/vp9_boolhuff.c

 VP9_CX_SRCS-yes += encoder/vp9_dct.c

@@ -81,7 +72,6 @@

 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h

-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_quantize_x86.h

 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_x86_csystemdependent.c

 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c

 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm

@@ -94,7 +84,6 @@

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm

-#VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm

@@ -101,10 +90,7 @@

 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm

 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c

 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm

-#VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm

 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm

-#VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_quantize_sse4.asm

-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_quantize_mmx.asm

 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm

 VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm

--

⑨