shithub: libvpx

--- a/vp9/encoder/x86/vp9_dct_mmx.asm

+++ /dev/null

@@ -1,241 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch)

-global sym(vp9_short_fdct4x4_mmx) PRIVATE

-sym(vp9_short_fdct4x4_mmx):

-    push        rbp

-    mov         rbp,        rsp

-    SHADOW_ARGS_TO_STACK 3

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rsi,        arg(0)      ; input

-        mov         rdi,        arg(1)      ; output

-        movsxd      rax,        dword ptr arg(2) ; pitch, used unscaled as the byte offset between input rows

-        lea         rcx,        [rsi + rax*2] ; rcx = address of input row 2

-        ; read the input data

-        movq        mm0,        [rsi]

-        movq        mm1,        [rsi + rax]

-        movq        mm2,        [rcx]

-        movq        mm4,        [rcx + rax]

-        ; transpose for the first stage

-        movq        mm3,        mm0         ; 00 01 02 03

-        movq        mm5,        mm2         ; 20 21 22 23

-        punpcklwd   mm0,        mm1         ; 00 10 01 11

-        punpckhwd   mm3,        mm1         ; 02 12 03 13

-        punpcklwd   mm2,        mm4         ; 20 30 21 31

-        punpckhwd   mm5,        mm4         ; 22 32 23 33

-        movq        mm1,        mm0         ; 00 10 01 11

-        punpckldq   mm0,        mm2         ; 00 10 20 30

-        punpckhdq   mm1,        mm2         ; 01 11 21 31

-        movq        mm2,        mm3         ; 02 12 03 13

-        punpckldq   mm2,        mm5         ; 02 12 22 32

-        punpckhdq   mm3,        mm5         ; 03 13 23 33

-        ; mm0 0

-        ; mm1 1

-        ; mm2 2

-        ; mm3 3

-        ; first stage

-        movq        mm5,        mm0

-        movq        mm4,        mm1

-        paddw       mm0,        mm3         ; a1 = 0 + 3

-        paddw       mm1,        mm2         ; b1 = 1 + 2

-        psubw       mm4,        mm2         ; c1 = 1 - 2

-        psubw       mm5,        mm3         ; d1 = 0 - 3

-        psllw       mm5,        3

-        psllw       mm4,        3

-        psllw       mm0,        3

-        psllw       mm1,        3

-        ; output 0 and 2

-        movq        mm2,        mm0         ; a1

-        paddw       mm0,        mm1         ; op[0] = a1 + b1

-        psubw       mm2,        mm1         ; op[2] = a1 - b1

-        ; output 1 and 3

-        ; interleave c1, d1

-        movq        mm1,        mm5         ; d1

-        punpcklwd   mm1,        mm4         ; d1[0] c1[0] d1[1] c1[1] (low to high word)

-        punpckhwd   mm5,        mm4         ; d1[2] c1[2] d1[3] c1[3] (low to high word)

-        movq        mm3,        mm1

-        movq        mm4,        mm5

-        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

-        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

-        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

-        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

-        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]

-        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]

-        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]

-        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]

-        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12

-        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12

-        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12

-        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12

-        packssdw    mm1,        mm4         ; op[1]

-        packssdw    mm3,        mm5         ; op[3]

-        ; done with vertical

-        ; transpose for the second stage

-        movq        mm4,        mm0         ; 00 10 20 30

-        movq        mm5,        mm2         ; 02 12 22 32

-        punpcklwd   mm0,        mm1         ; 00 01 10 11

-        punpckhwd   mm4,        mm1         ; 20 21 30 31

-        punpcklwd   mm2,        mm3         ; 02 03 12 13

-        punpckhwd   mm5,        mm3         ; 22 23 32 33

-        movq        mm1,        mm0         ; 00 01 10 11

-        punpckldq   mm0,        mm2         ; 00 01 02 03

-        punpckhdq   mm1,        mm2         ; 10 11 12 13

-        movq        mm2,        mm4         ; 20 21 30 31

-        punpckldq   mm2,        mm5         ; 20 21 22 23

-        punpckhdq   mm4,        mm5         ; 30 31 32 33

-        ; mm0 0

-        ; mm1 1

-        ; mm2 2

-        ; mm4 3 (row 3 lives in mm4 here, not mm3)

-        movq        mm5,        mm0

-        movq        mm3,        mm1

-        paddw       mm0,        mm4         ; a1 = 0 + 3

-        paddw       mm1,        mm2         ; b1 = 1 + 2

-        psubw       mm3,        mm2         ; c1 = 1 - 2

-        psubw       mm5,        mm4         ; d1 = 0 - 3

-        pxor        mm6,        mm6         ; zero out for compare

-        pcmpeqw     mm6,        mm5         ; all-ones in each word where d1 == 0

-        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; mm6 = ~(d1 == 0) & 1,

-                                                                ; i.e. 1 in each word where d1 != 0, else 0

-        ; output 0 and 2

-        movq        mm2,        mm0         ; a1

-        paddw       mm0,        mm1         ; a1 + b1

-        psubw       mm2,        mm1         ; a1 - b1

-        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]

-        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]

-        psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4

-        psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4

-        movq        MMWORD PTR[rdi + 0 ],  mm0

-        movq        MMWORD PTR[rdi + 16],  mm2

-        ; output 1 and 3

-        ; interleave c1, d1

-        movq        mm1,        mm5         ; d1

-        punpcklwd   mm1,        mm3         ; d1[0] c1[0] d1[1] c1[1] (low to high word)

-        punpckhwd   mm5,        mm3         ; d1[2] c1[2] d1[3] c1[3] (low to high word)

-        movq        mm3,        mm1

-        movq        mm4,        mm5

-        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

-        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

-        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

-        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

-        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]

-        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]

-        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]

-        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]

-        psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16

-        psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16

-        psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16

-        psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16

-        packssdw    mm1,        mm4         ; op[4]

-        packssdw    mm3,        mm5         ; op[12]

-        paddw       mm1,        mm6         ; op[4] += (d1!=0)

-        movq        MMWORD PTR[rdi + 8 ],  mm1

-        movq        MMWORD PTR[rdi + 24],  mm3

-     ; begin epilog

-    pop         rdi

-    pop         rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-SECTION_RODATA

-align 8

-_5352_2217:                 ; word pairs (5352, 2217): pmaddwd operand for c1*2217 + d1*5352

-    dw 5352

-    dw 2217

-    dw 5352

-    dw 2217

-align 8

-_2217_neg5352:              ; word pairs (2217, -5352): pmaddwd operand for d1*2217 - c1*5352

-    dw 2217

-    dw -5352

-    dw 2217

-    dw -5352

-align 8

-_cmp_mask:                  ; 1 in every word; and-ed with the inverted (d1 == 0) compare mask

-    times 4 dw 1

-align 8

-_7w:                        ; per-word term added before the >>4 of outputs 0 and 8

-    times 4 dw 7

-align 8

-_14500:                     ; first-stage dword term added before the >>12 that forms op[1]

-    times 2 dd 14500

-align 8

-_7500:                      ; first-stage dword term added before the >>12 that forms op[3]

-    times 2 dd 7500

-align 8

-_12000:                     ; second-stage dword term added before the >>16 that forms op[4]

-    times 2 dd 12000

-align 8

-_51000:                     ; second-stage dword term added before the >>16 that forms op[12]

-    times 2 dd 51000

--- a/vp9/encoder/x86/vp9_dct_mmx.h

+++ /dev/null

@@ -1,17 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#ifndef VP9_ENCODER_X86_VP9_DCT_MMX_H_

-#define VP9_ENCODER_X86_VP9_DCT_MMX_H_

-extern void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch);

-#endif /* VP9_ENCODER_X86_VP9_DCT_MMX_H_ */

--- a/vp9/encoder/x86/vp9_fwalsh_sse2.asm

+++ /dev/null

@@ -1,164 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-;void vp9_short_walsh4x4_sse2(short *input, short *output, int pitch)

-global sym(vp9_short_walsh4x4_sse2) PRIVATE

-sym(vp9_short_walsh4x4_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 3

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    mov     rsi, arg(0)           ; input

-    mov     rdi, arg(1)           ; output

-    movsxd  rdx, dword ptr arg(2) ; pitch, used unscaled as the byte offset between input rows

-    ; first for loop

-    movq    xmm0, MMWORD PTR [rsi]           ; load input

-    movq    xmm1, MMWORD PTR [rsi + rdx]

-    lea     rsi,  [rsi + rdx*2]              ; advance rsi to input row 2

-    movq    xmm2, MMWORD PTR [rsi]

-    movq    xmm3, MMWORD PTR [rsi + rdx]

-    punpcklwd xmm0,  xmm1

-    punpcklwd xmm2,  xmm3

-    movdqa    xmm1, xmm0

-    punpckldq xmm0, xmm2           ; ip[1] ip[0]

-    punpckhdq xmm1, xmm2           ; ip[3] ip[2]

-    movdqa    xmm2, xmm0

-    paddw     xmm0, xmm1

-    psubw     xmm2, xmm1

-    psllw     xmm0, 2              ; d1  a1

-    psllw     xmm2, 2              ; c1  b1

-    movdqa    xmm1, xmm0

-    punpcklqdq xmm0, xmm2          ; b1  a1

-    punpckhqdq xmm1, xmm2          ; c1  d1

-    pxor      xmm6, xmm6           ; NOTE(review): looks redundant, the movq below rewrites all of xmm6 - confirm

-    movq      xmm6, xmm0           ; xmm6 = a1 in the low 4 words, upper 4 words zero

-    pxor      xmm7, xmm7

-    pcmpeqw   xmm7, xmm6           ; 0xffff in each word where xmm6 == 0 (always true for the upper 4 words)

-    paddw     xmm7, [GLOBAL(c1)]   ; +1 per word: 1 where a1 != 0, 0 elsewhere (0xffff wraps to 0)

-    movdqa    xmm2, xmm0

-    paddw     xmm0, xmm1           ; b1+c1  a1+d1

-    psubw     xmm2, xmm1           ; b1-c1  a1-d1

-    paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)

-    ; second for loop

-    ; input: 13  9  5  1 12  8  4  0 (xmm0)

-    ;        14 10  6  2 15 11  7  3 (xmm2)

-    ; after shuffle:

-    ;        13  5  9  1 12  4  8  0 (xmm0)

-    ;        14  6 10  2 15  7 11  3 (xmm1)

-    pshuflw   xmm3, xmm0, 0xd8

-    pshufhw   xmm0, xmm3, 0xd8

-    pshuflw   xmm3, xmm2, 0xd8

-    pshufhw   xmm1, xmm3, 0xd8

-    movdqa    xmm2, xmm0

-    pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10

-    pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10

-    movdqa    xmm3, xmm1

-    pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13

-    pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13

-    pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10

-    pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10

-    pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12

-    pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12

-    movdqa    xmm0, xmm4

-    punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10

-    punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10

-    movdqa    xmm1, xmm6

-    punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12

-    punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12

-    movdqa    xmm2, xmm0

-    paddd     xmm0, xmm4            ; b21 b20 a21 a20

-    psubd     xmm2, xmm4            ; c21 c20 d21 d20

-    movdqa    xmm3, xmm1

-    paddd     xmm1, xmm6            ; b23 b22 a23 a22

-    psubd     xmm3, xmm6            ; c23 c22 d23 d22

-    pxor      xmm4, xmm4

-    movdqa    xmm5, xmm4

-    pcmpgtd   xmm4, xmm0            ; all-ones in each dword where xmm0 < 0

-    pcmpgtd   xmm5, xmm2            ; all-ones in each dword where xmm2 < 0

-    pand      xmm4, [GLOBAL(cd1)]   ; reduce mask to 1 per negative dword

-    pand      xmm5, [GLOBAL(cd1)]

-    pxor      xmm6, xmm6

-    movdqa    xmm7, xmm6

-    pcmpgtd   xmm6, xmm1            ; all-ones in each dword where xmm1 < 0

-    pcmpgtd   xmm7, xmm3            ; all-ones in each dword where xmm3 < 0

-    pand      xmm6, [GLOBAL(cd1)]

-    pand      xmm7, [GLOBAL(cd1)]

-    paddd     xmm0, xmm4            ; x += (x < 0)

-    paddd     xmm2, xmm5

-    paddd     xmm0, [GLOBAL(cd3)]   ; x += 3 before the >>3 below

-    paddd     xmm2, [GLOBAL(cd3)]

-    paddd     xmm1, xmm6

-    paddd     xmm3, xmm7

-    paddd     xmm1, [GLOBAL(cd3)]

-    paddd     xmm3, [GLOBAL(cd3)]

-    psrad     xmm0, 3

-    psrad     xmm1, 3

-    psrad     xmm2, 3

-    psrad     xmm3, 3

-    movdqa    xmm4, xmm0

-    punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20

-    punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20

-    movdqa    xmm5, xmm2

-    punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20

-    punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20

-    packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20

-    packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20

-    movdqa  XMMWORD PTR [rdi], xmm0        ; movdqa store: output must be 16-byte aligned

-    movdqa  XMMWORD PTR [rdi + 16], xmm2

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-SECTION_RODATA

-align 16

-c1:                         ; eight words of +1: pmaddwd operand that sums each word pair; also the +1 for the (a1 != 0) mask

-    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001

-align 16

-cn1:                        ; word pairs (+1, -1): pmaddwd operand that subtracts the second word of each pair from the first

-    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff

-align 16

-cd1:                        ; four dwords of 1: reduces the sign-compare masks to 0/1

-    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001

-align 16

-cd3:                        ; four dwords of 3: added before the final >>3

-    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003

--- a/vp9/vp9cx.mk

+++ b/vp9/vp9cx.mk

@@ -78,13 +78,10 @@

 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c

 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm

 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm

-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm

-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm

-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm

--

⑨