shithub: libvpx

ref: 11fec1863d7e9f0e864238dcc5b755edf29d10d0
dir: /vp9/decoder/x86/dequantize_mmx.asm/

View raw version
;
;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
align 16
x_s1sqr2:      times 4 dw 0x8A8C
align 16
x_c1sqr2less1: times 4 dw 0x4E7B
align 16
pw_16:         times 4 dw 16

SECTION .text

INIT_MMX


;void dequantize_b_impl_mmx(short *sq, short *dq, short *q)
cglobal dequantize_b_impl_mmx, 3,3,0,sq,dq,arg3
    mova       m1, [sqq]
    pmullw     m1, [arg3q+0]            ; mm4 *= kernel 0 modifiers.
    mova [dqq+ 0], m1

    mova       m1, [sqq+8]
    pmullw     m1, [arg3q+8]            ; mm4 *= kernel 0 modifiers.
    mova [dqq+ 8], m1

    mova       m1, [sqq+16]
    pmullw     m1, [arg3q+16]            ; mm4 *= kernel 0 modifiers.
    mova [dqq+16], m1

    mova       m1, [sqq+24]
    pmullw     m1, [arg3q+24]            ; mm4 *= kernel 0 modifiers.
    mova [dqq+24], m1
    RET


;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
cglobal dequant_idct_add_mmx, 4,6,0,inp,dq,pred,dest,pit,stride

%if ARCH_X86_64
    movsxd              strideq,  dword stridem
    movsxd              pitq,     dword pitm
%else
    mov                 strideq,  stridem
    mov                 pitq,     pitm
%endif

    mova                m0,       [inpq+ 0]
    pmullw              m0,       [dqq]

    mova                m1,       [inpq+ 8]
    pmullw              m1,       [dqq+ 8]

    mova                m2,       [inpq+16]
    pmullw              m2,       [dqq+16]

    mova                m3,       [inpq+24]
    pmullw              m3,       [dqq+24]

    pxor                m7,        m7
    mova            [inpq],        m7
    mova          [inpq+8],        m7
    mova         [inpq+16],        m7
    mova         [inpq+24],        m7


    psubw               m0,        m2             ; b1= 0-2
    paddw               m2,        m2             ;

    mova                m5,        m1
    paddw               m2,        m0             ; a1 =0+2

    pmulhw              m5,       [x_s1sqr2];
    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)

    mova                m7,        m3             ;
    pmulhw              m7,       [x_c1sqr2less1];

    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
    psubw               m7,        m5             ; c1

    mova                m5,        m1
    mova                m4,        m3

    pmulhw              m5,       [x_c1sqr2less1]
    paddw               m5,        m1

    pmulhw              m3,       [x_s1sqr2]
    paddw               m3,        m4

    paddw               m3,        m5             ; d1
    mova                m6,        m2             ; a1

    mova                m4,        m0             ; b1
    paddw               m2,        m3             ;0

    paddw               m4,        m7             ;1
    psubw               m0,        m7             ;2

    psubw               m6,        m3             ;3

    mova                m1,        m2             ; 03 02 01 00
    mova                m3,        m4             ; 23 22 21 20

    punpcklwd           m1,        m0             ; 11 01 10 00
    punpckhwd           m2,        m0             ; 13 03 12 02

    punpcklwd           m3,        m6             ; 31 21 30 20
    punpckhwd           m4,        m6             ; 33 23 32 22

    mova                m0,        m1             ; 11 01 10 00
    mova                m5,        m2             ; 13 03 12 02

    punpckldq           m0,        m3             ; 30 20 10 00
    punpckhdq           m1,        m3             ; 31 21 11 01

    punpckldq           m2,        m4             ; 32 22 12 02
    punpckhdq           m5,        m4             ; 33 23 13 03

    mova                m3,        m5             ; 33 23 13 03

    psubw               m0,        m2             ; b1= 0-2
    paddw               m2,        m2             ;

    mova                m5,        m1
    paddw               m2,        m0             ; a1 =0+2

    pmulhw              m5,       [x_s1sqr2];
    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)

    mova                m7,        m3             ;
    pmulhw              m7,       [x_c1sqr2less1];

    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
    psubw               m7,        m5             ; c1

    mova                m5,        m1
    mova                m4,        m3

    pmulhw              m5,       [x_c1sqr2less1]
    paddw               m5,        m1

    pmulhw              m3,       [x_s1sqr2]
    paddw               m3,        m4

    paddw               m3,        m5             ; d1
    paddw               m0,       [pw_16]

    paddw               m2,       [pw_16]
    mova                m6,        m2             ; a1

    mova                m4,        m0             ; b1
    paddw               m2,        m3             ;0

    paddw               m4,        m7             ;1
    psubw               m0,        m7             ;2

    psubw               m6,        m3             ;3
    psraw               m2,        5

    psraw               m0,        5
    psraw               m4,        5

    psraw               m6,        5

    mova                m1,        m2             ; 03 02 01 00
    mova                m3,        m4             ; 23 22 21 20

    punpcklwd           m1,        m0             ; 11 01 10 00
    punpckhwd           m2,        m0             ; 13 03 12 02

    punpcklwd           m3,        m6             ; 31 21 30 20
    punpckhwd           m4,        m6             ; 33 23 32 22

    mova                m0,        m1             ; 11 01 10 00
    mova                m5,        m2             ; 13 03 12 02

    punpckldq           m0,        m3             ; 30 20 10 00
    punpckhdq           m1,        m3             ; 31 21 11 01

    punpckldq           m2,        m4             ; 32 22 12 02
    punpckhdq           m5,        m4             ; 33 23 13 03

    pxor                m7,        m7

    movh                m4,       [predq]
    punpcklbw           m4,        m7
    paddsw              m0,        m4
    packuswb            m0,        m7
    movh           [destq],      m0

    movh                m4,       [predq+pitq]
    punpcklbw           m4,        m7
    paddsw              m1,        m4
    packuswb            m1,        m7
    movh   [destq+strideq],        m1

    movh                m4,       [predq+2*pitq]
    punpcklbw           m4,        m7
    paddsw              m2,        m4
    packuswb            m2,        m7
    movh [destq+strideq*2],        m2

    add              destq,        strideq
    add              predq,        pitq

    movh                m4,       [predq+2*pitq]
    punpcklbw           m4,        m7
    paddsw              m5,        m4
    packuswb            m5,        m7
    movh [destq+strideq*2],        m5
    RET


;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
cglobal dequant_dc_idct_add_mmx, 4,7,0,inp,dq,pred,dest,pit,stride,Dc

%if ARCH_X86_64
    movsxd              strideq,   dword stridem
    movsxd              pitq,      dword pitm
%else
    mov                 strideq,   stridem
    mov                 pitq,      pitm
%endif

    mov                 Dcq, Dcm
    mova                m0,       [inpq+ 0]
    pmullw              m0,       [dqq+ 0]

    mova                m1,       [inpq+ 8]
    pmullw              m1,       [dqq+ 8]

    mova                m2,       [inpq+16]
    pmullw              m2,       [dqq+16]

    mova                m3,       [inpq+24]
    pmullw              m3,       [dqq+24]

    pxor                m7,        m7
    mova         [inpq+ 0],        m7
    mova         [inpq+ 8],        m7
    mova         [inpq+16],        m7
    mova         [inpq+24],        m7

    ; move lower word of Dc to lower word of m0
    psrlq               m0,        16
    psllq               m0,        16
    and                Dcq,        0xFFFF         ; If Dc < 0, we don't want the full dword precision.
    movh                m7,        Dcq
    por                 m0,        m7
    psubw               m0,        m2             ; b1= 0-2
    paddw               m2,        m2             ;

    mova                m5,        m1
    paddw               m2,        m0             ; a1 =0+2

    pmulhw              m5,       [x_s1sqr2];
    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)

    mova                m7,        m3             ;
    pmulhw              m7,       [x_c1sqr2less1];

    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
    psubw               m7,        m5             ; c1

    mova                m5,        m1
    mova                m4,        m3

    pmulhw              m5,       [x_c1sqr2less1]
    paddw               m5,        m1

    pmulhw              m3,       [x_s1sqr2]
    paddw               m3,        m4

    paddw               m3,        m5             ; d1
    mova                m6,        m2             ; a1

    mova                m4,        m0             ; b1
    paddw               m2,        m3             ;0

    paddw               m4,        m7             ;1
    psubw               m0,        m7             ;2

    psubw               m6,        m3             ;3

    mova                m1,        m2             ; 03 02 01 00
    mova                m3,        m4             ; 23 22 21 20

    punpcklwd           m1,        m0             ; 11 01 10 00
    punpckhwd           m2,        m0             ; 13 03 12 02

    punpcklwd           m3,        m6             ; 31 21 30 20
    punpckhwd           m4,        m6             ; 33 23 32 22

    mova                m0,        m1             ; 11 01 10 00
    mova                m5,        m2             ; 13 03 12 02

    punpckldq           m0,        m3             ; 30 20 10 00
    punpckhdq           m1,        m3             ; 31 21 11 01

    punpckldq           m2,        m4             ; 32 22 12 02
    punpckhdq           m5,        m4             ; 33 23 13 03

    mova                m3,        m5             ; 33 23 13 03

    psubw               m0,        m2             ; b1= 0-2
    paddw               m2,        m2             ;

    mova                m5,        m1
    paddw               m2,        m0             ; a1 =0+2

    pmulhw              m5,       [x_s1sqr2];
    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)

    mova                m7,        m3             ;
    pmulhw              m7,       [x_c1sqr2less1];

    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
    psubw               m7,        m5             ; c1

    mova                m5,        m1
    mova                m4,        m3

    pmulhw              m5,       [x_c1sqr2less1]
    paddw               m5,        m1

    pmulhw              m3,       [x_s1sqr2]
    paddw               m3,        m4

    paddw               m3,        m5             ; d1
    paddw               m0,       [pw_16]

    paddw               m2,       [pw_16]
    mova                m6,        m2             ; a1

    mova                m4,        m0             ; b1
    paddw               m2,        m3             ;0

    paddw               m4,        m7             ;1
    psubw               m0,        m7             ;2

    psubw               m6,        m3             ;3
    psraw               m2,        5

    psraw               m0,        5
    psraw               m4,        5

    psraw               m6,        5

    mova                m1,        m2             ; 03 02 01 00
    mova                m3,        m4             ; 23 22 21 20

    punpcklwd           m1,        m0             ; 11 01 10 00
    punpckhwd           m2,        m0             ; 13 03 12 02

    punpcklwd           m3,        m6             ; 31 21 30 20
    punpckhwd           m4,        m6             ; 33 23 32 22

    mova                m0,        m1             ; 11 01 10 00
    mova                m5,        m2             ; 13 03 12 02

    punpckldq           m0,        m3             ; 30 20 10 00
    punpckhdq           m1,        m3             ; 31 21 11 01

    punpckldq           m2,        m4             ; 32 22 12 02
    punpckhdq           m5,        m4             ; 33 23 13 03

    pxor                m7,        m7

    movh                m4,       [predq]
    punpcklbw           m4,        m7
    paddsw              m0,        m4
    packuswb            m0,        m7
    movh           [destq],        m0

    movh                m4,       [predq+pitq]
    punpcklbw           m4,        m7
    paddsw              m1,        m4
    packuswb            m1,        m7
    movh   [destq+strideq],        m1

    movh                m4,       [predq+2*pitq]
    punpcklbw           m4,        m7
    paddsw              m2,        m4
    packuswb            m2,        m7
    movh [destq+strideq*2],        m2

    add              destq,        strideq
    add              predq,        pitq

    movh                m4,       [predq+2*pitq]
    punpcklbw           m4,        m7
    paddsw              m5,        m4
    packuswb            m5,        m7
    movh [destq+strideq*2],        m5
    RET