shithub: libvpx

ref: c404fa42aca246c644a5bd84d43bbe9d9740e7a8
dir: /vp8/common/x86/subpixel_ssse3.asm/

View raw version
;
;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT  7


;/************************************************************************************
; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
; input pixel array has output_height rows. This routine assumes that output_height is an
; even number. This function handles 8 pixels in horizontal direction, calculating ONE
; rows each iteration to take advantage of the 128 bits operations.
;*************************************************************************************/
;void vp8_filter_block1d8_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
global sym(vp8_filter_block1d8_h6_ssse3)
sym(vp8_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi
    shl         rdx, 4

    movdqa      xmm7, [rd GLOBAL]

    lea         rax, [k0_k5 GLOBAL]
    add         rax, rdx
    mov         rdi, arg(2)             ;output_ptr

    cmp         esi, DWORD PTR [rax]
    je          vp8_filter_block1d8_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx
;xmm3 free
filter_block1d8_h6_rowloop_ssse3:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]

    movdqa      xmm1, xmm0
    pshufb      xmm0, [shuf1b GLOBAL]

    movdqa      xmm2, xmm1
    pshufb      xmm1, [shuf2b GLOBAL]
    pmaddubsw   xmm0, xmm4
    pmaddubsw   xmm1, xmm5

    pshufb      xmm2, [shuf3b GLOBAL]
    add         rdi, rdx
    pmaddubsw   xmm2, xmm6

    lea         rsi,    [rsi + rax]
    dec         rcx
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    psraw       xmm0, 7
    packuswb    xmm0, xmm0

    movq        MMWORD Ptr [rdi], xmm0
    jnz         filter_block1d8_h6_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

vp8_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movdqa      xmm3, XMMWORD PTR [shuf2b GLOBAL]
    movdqa      xmm4, XMMWORD PTR [shuf3b GLOBAL]

    mov         rsi, arg(0)             ;src_ptr

    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx
;xmm3 free
filter_block1d8_h4_rowloop_ssse3:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]

    movdqa      xmm2, xmm0
    pshufb      xmm0, xmm3 ;[shuf2b GLOBAL]
    pshufb      xmm2, xmm4 ;[shuf3b GLOBAL]

    pmaddubsw   xmm0, xmm5
    add         rdi, rdx
    pmaddubsw   xmm2, xmm6

    lea         rsi,    [rsi + rax]
    dec         rcx
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    psraw       xmm0, 7
    packuswb    xmm0, xmm0

    movq        MMWORD Ptr [rdi], xmm0

    jnz         filter_block1d8_h4_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d16_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
global sym(vp8_filter_block1d16_h6_ssse3)
sym(vp8_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi
    shl         rdx, 4      ;

    lea         rax, [k0_k5 GLOBAL]
    add         rax, rdx

    mov         rdi, arg(2)             ;output_ptr
    movdqa      xmm7, [rd GLOBAL]

;;
;;    cmp         esi, DWORD PTR [rax]
;;    je          vp8_filter_block1d16_h4_ssse3

    mov         rsi, arg(0)             ;src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch

filter_block1d16_h6_rowloop_ssse3:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]

    movdqa      xmm1, xmm0
    pshufb      xmm0, [shuf1b GLOBAL]
    movdqa      xmm2, xmm1
    pmaddubsw   xmm0, xmm4
    pshufb      xmm1, [shuf2b GLOBAL]
    pshufb      xmm2, [shuf3b GLOBAL]
    pmaddubsw   xmm1, xmm5

    movdqu      xmm3,   XMMWORD PTR [rsi + 6]

    pmaddubsw   xmm2, xmm6
    paddsw      xmm0, xmm1
    movdqa      xmm1, xmm3
    pshufb      xmm3, [shuf1b GLOBAL]
    paddsw      xmm0, xmm7
    pmaddubsw   xmm3, xmm4
    paddsw      xmm0, xmm2
    movdqa      xmm2, xmm1
    pshufb      xmm1, [shuf2b GLOBAL]
    pshufb      xmm2, [shuf3b GLOBAL]
    pmaddubsw   xmm1, xmm5
    pmaddubsw   xmm2, xmm6

    psraw       xmm0, 7
    packuswb    xmm0, xmm0
    lea         rsi,    [rsi + rax]
    paddsw      xmm3, xmm1
    paddsw      xmm3, xmm7
    paddsw      xmm3, xmm2
    psraw       xmm3, 7
    packuswb    xmm3, xmm3

    punpcklqdq  xmm0, xmm3

    movdqa      XMMWORD Ptr [rdi], xmm0

    add         rdi, rdx
    dec         rcx
    jnz         filter_block1d16_h6_rowloop_ssse3


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

vp8_filter_block1d16_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch

filter_block1d16_h4_rowloop_ssse3:
    movdqu      xmm1,   XMMWORD PTR [rsi - 2]

    movdqa      xmm2, xmm1
    pshufb      xmm1, [shuf2b GLOBAL]
    pshufb      xmm2, [shuf3b GLOBAL]
    pmaddubsw   xmm1, xmm5

    movdqu      xmm3,   XMMWORD PTR [rsi + 6]

    pmaddubsw   xmm2, xmm6
    movdqa      xmm0, xmm3
    pshufb      xmm3, [shuf3b GLOBAL]
    pshufb      xmm0, [shuf2b GLOBAL]

    paddsw      xmm1, xmm7
    paddsw      xmm1, xmm2

    pmaddubsw   xmm0, xmm5
    pmaddubsw   xmm3, xmm6

    psraw       xmm1, 7
    packuswb    xmm1, xmm1
    lea         rsi,    [rsi + rax]
    paddsw      xmm3, xmm0
    paddsw      xmm3, xmm7
    psraw       xmm3, 7
    packuswb    xmm3, xmm3

    punpcklqdq  xmm1, xmm3

    movdqa      XMMWORD Ptr [rdi], xmm1

    add         rdi, rdx
    dec         rcx
    jnz         filter_block1d16_h4_rowloop_ssse3


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d4_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
global sym(vp8_filter_block1d4_h6_ssse3)
sym(vp8_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    mov         rsi, arg(0)             ;src_ptr
    shl         rdx, 4      ;

    lea         rax, [k0_k5 GLOBAL]
    add         rax, rdx
    movdqa      xmm7, [rd GLOBAL]





    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

;xmm3 free
filter_block1d4_h6_rowloop_ssse3:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]

    movdqa      xmm1, xmm0
    pshufb      xmm0, [shuf1b GLOBAL]

    movdqa      xmm2, xmm1
    pshufb      xmm1, [shuf2b GLOBAL]
    pmaddubsw   xmm0, xmm4
    pshufb      xmm2, [shuf3b GLOBAL]
    pmaddubsw   xmm1, xmm5

;--
    pmaddubsw   xmm2, xmm6

    lea         rsi,    [rsi + rax]
;--
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm7
    pxor        xmm1, xmm1
    paddsw      xmm0, xmm2
    psraw       xmm0, 7
    packuswb    xmm0, xmm0

;
    punpcklbw   xmm0, xmm1

    movq        MMWORD PTR [rdi], xmm0
    add         rdi, rdx
    dec         rcx
    jnz         filter_block1d4_h6_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d16_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
global sym(vp8_filter_block1d16_v6_ssse3)
sym(vp8_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi
    shl         rdx, 4      ;

    lea         rax, [k0_k5 GLOBAL]
    add         rax, rdx

    cmp         esi, DWORD PTR [rax]
    je          vp8_filter_block1d16_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx


vp8_filter_block1d16_v6_ssse3_loop:
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F

    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [rd GLOBAL]
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2          ;store the results

    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [rd GLOBAL]
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi+8], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;out_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d16_v6_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

vp8_filter_block1d16_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx

vp8_filter_block1d16_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E

    paddsw      xmm2, [rd GLOBAL]
    paddsw      xmm2, xmm3
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    punpcklbw   xmm5, xmm4                  ;B D
    punpcklbw   xmm1, xmm0                  ;C E

    pmaddubsw   xmm1, xmm6
    pmaddubsw   xmm5, xmm7

    movdqa      xmm4, [rd GLOBAL]
    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4
    psraw       xmm5, 7
    packuswb    xmm5, xmm5

    punpcklqdq  xmm2, xmm5

    movdqa       XMMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;out_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d16_v4_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d8_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
global sym(vp8_filter_block1d8_v6_ssse3)
sym(vp8_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi
    shl         rdx, 4      ;

    lea         rax, [k0_k5 GLOBAL]
    add         rax, rdx

    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]

    cmp         esi, DWORD PTR [rax]
    je          vp8_filter_block1d8_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx

vp8_filter_block1d8_v6_ssse3_loop:
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
    movdqa      xmm4, [rd GLOBAL]

    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5
    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d8_v6_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

vp8_filter_block1d8_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm5, [rd GLOBAL]

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx

vp8_filter_block1d8_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d8_v4_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
global sym(vp8_filter_block1d8_h6_ssse3_slow)
sym(vp8_filter_block1d8_h6_ssse3_slow):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(6) ;vp8_filter
    mov         rsi, arg(0) ;src_ptr

    mov         rdi, arg(1) ;output_ptr

    movsxd      rcx, dword ptr arg(4) ;output_height
    movsxd      rax, dword ptr arg(2) ;src_pixels_per_line

    movq        xmm7, [rdx]
    pxor        xmm4, xmm4
    movdqa      xmm5, XMMWORD PTR [shuf1 GLOBAL]
    movdqa      xmm6, XMMWORD PTR [shuf2 GLOBAL]

    movsxd      rdx, dword ptr arg(5) ;output_width

    punpcklqdq  xmm7, xmm7          ;copy filter constants to upper 8 bytes

filter_block1d8_h6_rowloop3_slow:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]

    lea         rsi,    [rsi + rax]

    movdqa      xmm1, xmm0
    pshufb      xmm0, XMMWORD PTR [shuf1 GLOBAL]

    movdqa      xmm2, xmm1
    pmaddubsw   xmm0, xmm7
    pshufb      xmm1, XMMWORD PTR [shuf2 GLOBAL]

    movdqa      xmm3, xmm2
    pmaddubsw   xmm1, xmm7
    pshufb      xmm2, XMMWORD PTR [shuf3 GLOBAL]

    pshufb      xmm3, XMMWORD PTR [shuf4 GLOBAL]

    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm3, xmm7
;4 cycles

    phaddsw     xmm0, xmm1
    phaddsw     xmm2, xmm3
;7 cycles
    phaddsw     xmm0, xmm2
;7 cycles


    paddsw      xmm0, [rd GLOBAL]
    psraw       xmm0, 7
    packuswb    xmm0, xmm0

;
    punpcklbw   xmm0, xmm4

    movdqa      XMMWORD Ptr [rdi], xmm0
    add         rdi, rdx
    dec         rcx
    jnz         filter_block1d8_h6_rowloop3_slow                ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d16_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
global sym(vp8_filter_block1d16_h6_ssse3_slow)
sym(vp8_filter_block1d16_h6_ssse3_slow):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
    mov         rdx, arg(6) ;vp8_filter
    mov         rsi, arg(0) ;src_ptr

    mov         rdi, arg(1) ;output_ptr

    movsxd      rcx, dword ptr arg(4) ;output_height
    movsxd      rax, dword ptr arg(2) ;src_pixels_per_line

    movq        xmm7, [rdx]
    pxor        xmm4, xmm4
    movdqa      xmm5, XMMWORD PTR [shuf1 GLOBAL]
    movdqa      xmm6, XMMWORD PTR [shuf2 GLOBAL]

    movsxd      rdx, dword ptr arg(5) ;output_width

    punpcklqdq  xmm7, xmm7          ;copy filter constants to upper 8 bytes
    sub         rdi, rdx

filter_block1d16_h6_rowloop3_slow:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]

    movdqa      xmm1, xmm0
    pshufb      xmm0, xmm5

    movdqa      xmm2, xmm1
    pmaddubsw   xmm0, xmm7
    pshufb      xmm1, xmm6

    movdqa      xmm3, xmm2
    pmaddubsw   xmm1, xmm7
    pshufb      xmm2, XMMWORD PTR [shuf3 GLOBAL]
    movdqu      xmm4,   XMMWORD PTR [rsi + 6]
    pshufb      xmm3, XMMWORD PTR [shuf4 GLOBAL]
    lea         rsi,    [rsi + rax]
    pmaddubsw   xmm2, xmm7
    phaddsw     xmm0, xmm1

    pmaddubsw   xmm3, xmm7
    movdqa      xmm1, xmm4
    pshufb      xmm4, xmm5
    movdqa      xmm5, xmm1
    pmaddubsw   xmm4, xmm7
    pshufb      xmm1, xmm6
    phaddsw     xmm2, xmm3
    pmaddubsw   xmm1, xmm7
    movdqa      xmm3, xmm5
    pshufb      xmm5, XMMWORD PTR [shuf3 GLOBAL]
    add         rdi, rdx
    pmaddubsw   xmm5, xmm7
    pshufb      xmm3, XMMWORD PTR [shuf4 GLOBAL]
    phaddsw     xmm4, xmm1
    dec         rcx
    phaddsw     xmm0, xmm2
    pmaddubsw   xmm3, xmm7


    paddsw      xmm0, [rd GLOBAL]
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
    phaddsw     xmm5, xmm3
    pxor        xmm3, xmm3
    punpcklbw   xmm0, xmm3
;--
;--
;--
;--

    phaddsw     xmm4, xmm5
    movdqa      xmm5, XMMWORD PTR [shuf1 GLOBAL]
    movdqa      XMMWORD Ptr [rdi], xmm0
;--
;--
;--
;--
;--
    paddsw      xmm4, [rd GLOBAL]
    psraw       xmm4, 7
    packuswb    xmm4, xmm4
;
    punpcklbw   xmm4, xmm3

    movdqa      XMMWORD Ptr [rdi+16], xmm4

    jnz         filter_block1d16_h6_rowloop3_slow                ; next row


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
shuf1:
    db 0, 1, 2, 4, 3, 5, 128, 128, 1, 2, 3, 5, 4, 6, 128, 128
shuf2:
    db 2, 3, 4, 6, 5, 7, 128, 128, 3, 4, 5, 7, 6, 8, 128, 128
shuf3:
    db 4, 5, 6, 8, 7, 9, 128, 128, 5, 6, 7, 9, 8, 10, 128, 128
shuf4:
    db 6, 7, 8, 10, 9, 11, 128, 128, 7, 8, 9, 11, 10, 12, 128, 128

shuf1a:
    db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
shuf2a:
    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
shuf3a:
    db 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12

shuf1b:
    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
shuf2b:
    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
shuf3b:
    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10

align 16
rd:
    times 8 dw 0x40

align 16
k0_k5:
    times 8 db 0, 0             ;placeholder
    times 8 db 0, 0
    times 8 db 2, 1
    times 8 db 0, 0
    times 8 db 3, 3
    times 8 db 0, 0
    times 8 db 1, 2
    times 8 db 0, 0
k1_k3:
    times 8 db  0,    0         ;placeholder
    times 8 db  -6,  12
    times 8 db -11,  36
    times 8 db  -9,  50
    times 8 db -16,  77
    times 8 db  -6,  93
    times 8 db  -8, 108
    times 8 db  -1, 123
k2_k4:
    times 8 db 128,    0        ;placeholder
    times 8 db 123,   -1
    times 8 db 108,   -8
    times 8 db  93,   -6
    times 8 db  77,  -16
    times 8 db  50,   -9
    times 8 db  36,  -11
    times 8 db  12,   -6