ref: 8cf28d346d697d5e09ac8955c0e70f07fd09b60a
dir: /vp8/encoder/x86/subtract_mmx.asm/
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
;                              short *diff, unsigned char *Predictor,
;                              int pitch);
;
; Computes diff = z - Predictor for a block of 4 rows by 4 pixels.
; For each row, 4 bytes are loaded from z and from Predictor, widened
; to 16-bit words by interleaving with zero, subtracted with psubw and
; stored as 4 words (8 bytes) into diff.
;
; Row n is read from   z         + n * src_stride  (bytes)
;                      Predictor + n * pitch       (bytes)
; and written to       diff      + n * pitch       (shorts, i.e. 2*n*pitch bytes)
;
; Register roles inside the body:
;   rsi = z            rdx = src_stride (sign-extended)
;   rax = Predictor    rcx = pitch (sign-extended; becomes 3*pitch for row 3)
;   rdi = diff         mm7 = 0 (source for the byte->word widening)
;   mm0 / mm1 = scratch (source pixels / predictor pixels)
;
; rsi and rdi are saved and restored around the body.  The routine
; writes MMX registers and does not execute emms itself.
;-----------------------------------------------------------------------

; VP8_SUBB_ROW src_addr, pred_addr, diff_addr
;   Subtract one row of 4 pixels.  Each argument is the address
;   expression (without brackets) of the row in the respective buffer.
;   Expects mm7 == 0.  Writes mm0 and mm1.
%macro VP8_SUBB_ROW 3
        movd        mm0,    [%1]            ; 4 source pixels
        movd        mm1,    [%2]            ; 4 predictor pixels
        punpcklbw   mm0,    mm7             ; bytes -> words
        punpcklbw   mm1,    mm7
        psubw       mm0,    mm1             ; source - predictor
        movq        [%3],   mm0             ; 4 word differences
%endmacro

global sym(vp8_subtract_b_mmx_impl)
sym(vp8_subtract_b_mmx_impl):
    push        rbp
    mov         rbp,    rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        ; Fetch the five arguments in declaration order.
        mov         rsi,    arg(0)              ; z
        movsxd      rdx,    dword ptr arg(1)    ; src_stride
        mov         rdi,    arg(2)              ; diff
        mov         rax,    arg(3)              ; Predictor
        movsxd      rcx,    dword ptr arg(4)    ; pitch

        pxor        mm7,    mm7                 ; mm7 = 0

        ; Rows 0..2 are reachable directly with scale factors 1 and 2
        ; (the diff scale is doubled because diff holds shorts).
        VP8_SUBB_ROW rsi,       rax,       rdi
        VP8_SUBB_ROW rsi+rdx,   rax+rcx,   rdi+rcx*2
        VP8_SUBB_ROW rsi+rdx*2, rax+rcx*2, rdi+rcx*4

        ; Row 3 needs a multiple of 3: step rsi forward two source rows
        ; and turn rcx into 3*pitch, then reuse the row-1 address forms.
        lea         rsi,    [rsi+rdx*2]         ; rsi = z + 2*src_stride
        lea         rcx,    [rcx+rcx*2]         ; rcx = 3*pitch
        VP8_SUBB_ROW rsi+rdx,   rax+rcx,   rdi+rcx*2

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_subtract_mby_mmx(short *diff, unsigned char *src,
;                           unsigned char *pred, int stride)
;
; Computes diff = src - pred for 16 rows of 16 pixels.
; Per row, 16 bytes are read from src and from pred, widened to 16-bit
; words by interleaving with zero, subtracted with psubw and stored as
; 16 words (32 bytes) into diff.
;
; After each row: src advances by stride bytes, pred advances by
; 16 bytes and diff advances by 32 bytes (16 shorts).
;
; Register roles inside the loop:
;   rsi = src row      rdx = stride (sign-extended)
;   rax = pred row     rcx = rows remaining (starts at 16)
;   rdi = diff row     mm0 = 0 (source for the byte->word widening)
;   mm1..mm4 = scratch
;
; rsi and rdi are saved and restored around the body.  The routine
; writes MMX registers and does not execute emms itself.
;-----------------------------------------------------------------------

; VP8_MBY_SUB_8PX px_offset, diff_offset
;   Subtract 8 pixels of the current row.
;     px_offset   = byte offset applied to both rsi (src) and rax (pred)
;     diff_offset = byte offset applied to rdi (diff)
;   Expects mm0 == 0.  Writes mm1, mm2, mm3 and mm4.
%macro VP8_MBY_SUB_8PX 2
            movq        mm1,            [rsi+%1]    ; 8 source pixels
            movq        mm3,            [rax+%1]    ; 8 predictor pixels
            movq        mm2,            mm1         ; copies for the high halves
            movq        mm4,            mm3
            punpcklbw   mm1,            mm0         ; low 4 bytes  -> words
            punpcklbw   mm3,            mm0
            punpckhbw   mm2,            mm0         ; high 4 bytes -> words
            punpckhbw   mm4,            mm0
            psubw       mm1,            mm3         ; source - predictor (low)
            psubw       mm2,            mm4         ; source - predictor (high)
            movq        [rdi+%2],       mm1
            movq        [rdi+%2+8],     mm2
%endmacro

global sym(vp8_subtract_mby_mmx)
sym(vp8_subtract_mby_mmx):
    push        rbp
    mov         rbp,    rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

            mov         rsi,            arg(1)              ; src
            mov         rdi,            arg(0)              ; diff
            mov         rax,            arg(2)              ; pred
            movsxd      rdx,            dword ptr arg(3)    ; stride

            mov         rcx,            16                  ; row counter
            pxor        mm0,            mm0                 ; mm0 = 0

.next_row:
            VP8_MBY_SUB_8PX 0, 0                ; pixels 0..7  -> diff words 0..7
            VP8_MBY_SUB_8PX 8, 16               ; pixels 8..15 -> diff words 8..15

            add         rdi,            32                  ; next diff row (16 shorts)
            add         rax,            16                  ; next pred row
            lea         rsi,            [rsi+rdx]           ; next src row

            sub         rcx,            1
            jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc,
;                            unsigned char *vsrc, unsigned char *pred,
;                            int stride)
;
; Computes the source-minus-predictor differences for two planes of
; 8 rows by 8 pixels each:
;
;   plane   source   predictor     output
;   U       usrc     pred + 256    diff + 256 (shorts)
;   V       vsrc     pred + 320    diff + 320 (shorts)
;
; Source rows are stride bytes apart.  Predictor rows are 8 bytes
; apart and output rows are 8 shorts (16 bytes) apart.
; Pixels are widened from bytes to 16-bit words by interleaving with
; zero and subtracted with psubw.
;
; Register roles inside the body:
;   rsi = source row pointer     rdx = stride (sign-extended)
;   rax = predictor pointer      mm7 = 0 (source for the widening)
;   rdi = output pointer         mm0, mm1, mm3, mm4 = scratch
;
; rsi and rdi are saved and restored around the body.  The routine
; writes MMX registers and does not execute emms itself.
;-----------------------------------------------------------------------

; VP8_MBUV_SUB_ROW src_addr, pred_offset, diff_offset
;   Subtract one row of 8 pixels.
;     src_addr    = address expression (without brackets) of the source row
;     pred_offset = byte offset from rax of the predictor row
;     diff_offset = byte offset from rdi of the output row
;   Expects mm7 == 0.  Writes mm0, mm1, mm3 and mm4.
%macro VP8_MBUV_SUB_ROW 3
            movq        mm0,        [%1]        ; 8 source pixels
            movq        mm1,        [rax+%2]    ; 8 predictor pixels
            movq        mm3,        mm0         ; copies for the high halves
            movq        mm4,        mm1
            punpcklbw   mm0,        mm7         ; low 4 bytes  -> words
            punpcklbw   mm1,        mm7
            punpckhbw   mm3,        mm7         ; high 4 bytes -> words
            punpckhbw   mm4,        mm7
            psubw       mm0,        mm1         ; source - predictor (low)
            psubw       mm3,        mm4         ; source - predictor (high)
            movq        [rdi+%3],   mm0
            movq        [rdi+%3+8], mm3
%endmacro

; VP8_MBUV_SUB_4ROWS
;   Subtract four consecutive rows starting at rsi / rax / rdi.
;   On exit rsi has advanced by 2*stride; rax and rdi are unchanged.
%macro VP8_MBUV_SUB_4ROWS 0
            VP8_MBUV_SUB_ROW rsi,        0,  0
            VP8_MBUV_SUB_ROW rsi+rdx,    8,  16
            VP8_MBUV_SUB_ROW rsi+rdx*2,  16, 32
            lea         rsi,        [rsi+rdx*2] ; 3*stride has no scale form:
                                                ; step two rows, then add one
            VP8_MBUV_SUB_ROW rsi+rdx,    24, 48
%endmacro

; VP8_MBUV_SUB_PLANE
;   Subtract one full plane of 8 rows by 8 pixels.
;   Entry: rsi = source, rax = predictor, rdi = output, rdx = stride,
;          mm7 = 0.  rsi, rax and rdi are advanced.
%macro VP8_MBUV_SUB_PLANE 0
            VP8_MBUV_SUB_4ROWS                  ; rows 0..3
            add         rdi,        64          ; 4 output rows    (4 * 16 bytes)
            add         rax,        32          ; 4 predictor rows (4 * 8 bytes)
            lea         rsi,        [rsi+rdx*2] ; rsi = source + 4*stride
            VP8_MBUV_SUB_4ROWS                  ; rows 4..7
%endmacro

global sym(vp8_subtract_mbuv_mmx)
sym(vp8_subtract_mbuv_mmx):
    push        rbp
    mov         rbp,    rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

            ; Set up once for both planes: the row code never writes
            ; rdx or mm7, so they stay valid across the U and V passes.
            movsxd      rdx,        dword ptr arg(4)    ; stride
            pxor        mm7,        mm7                 ; mm7 = 0

            ; ---- U plane ----
            ; z = usrc, diff = diff + 256, Predictor = pred + 256
            mov         rdi,        arg(0)      ; diff
            mov         rax,        arg(3)      ; pred
            mov         rsi,        arg(1)      ; z = usrc
            add         rdi,        256*2       ; diff + 256 shorts
            add         rax,        256         ; pred + 256 bytes
            VP8_MBUV_SUB_PLANE

            ; ---- V plane ----
            ; z = vsrc, diff = diff + 320, Predictor = pred + 320
            mov         rdi,        arg(0)      ; diff
            mov         rax,        arg(3)      ; pred
            mov         rsi,        arg(2)      ; z = vsrc
            add         rdi,        320*2       ; diff + 320 shorts
            add         rax,        320         ; pred + 320 bytes
            VP8_MBUV_SUB_PLANE

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret