;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  deblock.asm
;*
;*  Abstract
;*      edge loop
;*
;*  History
;*      08/07/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************

SECTION .rodata align=16

ALIGN   16
FOUR_16B_SSE2:   dw   4, 4, 4, 4, 4, 4, 4, 4

ALIGN   16
WELS_DB1_16:
    times 16 db 1
WELS_DB127_16:
    times 16 db 127
WELS_DB96_16:
    times 16 db 96
WELS_SHUFB0000111122223333:
    times 4 db 0
    times 4 db 1
    times 4 db 2
    times 4 db 3


SECTION .text

; Unsigned byte absolute difference.
; a=%1 b=%2 clobber=%3
; Subtract once in each direction with unsigned saturation and or the results;
; one of the two differences is zero, so the result is |a - b|.
%macro SSE2_AbsDiffUB 3
    movdqa   %3, %2
    psubusb  %3, %1
    psubusb  %1, %2
    por      %1, %3
%endmacro
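; A scalar sketch of the macro above (per byte; variable names are illustrative
; only, not identifiers from this file):
;   uint8_t d1 = b > a ? b - a : 0;   // psubusb %3, %1 (clobber holds b - a)
;   uint8_t d2 = a > b ? a - b : 0;   // psubusb %1, %2
;   uint8_t absdiff = d2 | d1;        // one of the two is zero, so this is |a - b|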

; Unsigned byte compare less than.
; lhs=%1 rhs^0x7f=%2 0x7f=%3
; There is no unsigned byte lt/gt compare instruction, so xor both operands
; with 0x7f and use a signed compare instead. Other approaches exist; this one
; modifies the lhs in place without a mov and uses a bitwise op that can
; execute on most ports on common microarchitectures.
%macro SSE2_CmpltUB 3
    pxor     %1, %3
    pcmpgtb  %1, %2
%endmacro
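; Why the 0x7f xor works (sketch): for an unsigned byte x, (x ^ 0x7f) read as a
; signed byte equals 127 - x, which reverses the ordering, so a signed pcmpgtb
; yields the unsigned less-than mask:
;   int8_t l = (int8_t)(lhs ^ 0x7f);        // 127 - lhs
;   int8_t r = (int8_t)(rhs ^ 0x7f);        // 127 - rhs, pre-xored by the caller
;   uint8_t mask = (l > r) ? 0xff : 0x00;   // == (lhs < rhs) ? 0xff : 0x00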

; Unsigned byte compare greater than or equal.
; a=%1 b=%2 (result mask left in %1)
%macro SSE2_CmpgeUB 2
    pminub   %1, %2
    pcmpeqb  %1, %2
%endmacro
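; Scalar sketch: min(a, b) == b holds exactly when a >= b, so
;   a = (min(a, b) == b) ? 0xff : 0x00;   // per byte, mask left in %1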

; Clip unsigned bytes to ref +/- diff.
; data=%1 ref=%2 maxdiff_from_ref=%3 clobber=%4
%macro SSE2_ClipUB 4
    movdqa   %4, %2
    psubusb  %4, %3
    paddusb  %3, %2
    pmaxub   %1, %4
    pminub   %1, %3
%endmacro
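; Scalar sketch (per byte, with unsigned saturation; names illustrative):
;   uint8_t lo = ref > maxdiff ? ref - maxdiff : 0;            // psubusb
;   uint8_t hi = ref + maxdiff > 255 ? 255 : ref + maxdiff;    // paddusb
;   data = data < lo ? lo : (data > hi ? hi : data);           // pmaxub + pminub
; Note %3 ends up holding ref + maxdiff and %4 holds ref - maxdiff.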

; (a + b + 1 - c) >> 1, where each byte of c is 0 or 1.
; a=%1 b=%2 c=%3 [out:(a^b)&c]=%4
%macro SSE2_AvgbFloor1 4
    movdqa   %4, %1
    pxor     %4, %2
    pavgb    %1, %2
    pand     %4, %3
    psubb    %1, %4
%endmacro
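; Sketch of the rounding trick: pavgb rounds up, i.e. computes (a + b + 1) >> 1.
; Subtracting ((a ^ b) & 1) removes the rounding bit when a + b is odd, so with
; c = 1 per byte the macro yields the round-down average:
;   avg_up   = (a + b + 1) >> 1;
;   avg_down = avg_up - ((a ^ b) & 1);   // == (a + b) >> 1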

; (a + b + carry) >> 1
; a=%1 b=%2 carry-1=%3 (i.e. 0x00 bytes for carry=1, 0xff bytes for carry=0)
%macro SSE2_AvgbFloor2 3
    pxor     %1, %3
    pxor     %2, %3
    pavgb    %1, %2
    pxor     %1, %3
%endmacro
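; Sketch of the complement trick: with %3 = 0xff this computes
;   ~pavgb(~a, ~b) == 255 - ((511 - a - b) >> 1) == (a + b) >> 1,
; the round-down average, while %3 = 0x00 leaves plain pavgb, (a + b + 1) >> 1.
; Hence passing carry - 1 in %3 selects the rounding of (a + b + carry) >> 1.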

; a = (a & m) | (b & ~m)
; a=%1 b=%2 m=%3
%macro SSE2_Blend 3
    pand     %1, %3
    pandn    %3, %2
    por      %1, %3
%endmacro
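; Scalar sketch: a bitwise select, so m need not be 0x00/0xff per byte, and %3
; is destroyed (it ends up holding b & ~m):
;   a = (a & m) | (b & ~m);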

; Compute
; p0 = clip(p0 + clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255)
; q0 = clip(q0 - clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255)
; 16-wide parallel in packed byte representation in xmm registers.
;
; p1=%1 p0=%2 q0=%3 q1=%4 iTc=%5 FFh=%6 xmmclobber=%7,%8
%macro SSE2_DeblockP0Q0_Lt4 8
    ; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255].
    ; A limited range is sufficient because the value is clipped to [-iTc, iTc] later.
    ; Bias so that unsigned saturation can be used.
    ; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs.
    ; q0 - p0 is split into a non-negative and non-positive part. The latter is
    ; subtracted from the biased value.
    movdqa     %7, %2
    psubusb    %7, %3  ; clip(p0 - q0, 0, 255)
    ; ((p1 - q1) >> 2) + 0xc0
    pxor       %4, %6  ; q1 ^ 0xff aka -q1 - 1 & 0xff
    pavgb      %1, %4  ; (((p1 - q1 + 0x100) >> 1)
    pavgb      %1, %6  ;  + 0x100) >> 1
    psubusb    %1, %7  ; -= clip(p0 - q0, 0, 255) saturate.
    movdqa     %8, %3
    psubusb    %8, %2  ; (clip(q0 - p0, 0, 255)
    pavgb      %8, %1  ;  + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1

    ; Unbias and split into a non-negative and a non-positive part.
    ; Clip each part to iTc via minub.
    ; Add/subtract each part to/from p0/q0 and clip.
%ifdef X86_32_PICASM
    push       r0
    mov        r0, esp
    sub        esp, 16
    and        esp, -16
    push       0x60606060    ;WELS_DB96_16
    push       0x60606060
    push       0x60606060
    push       0x60606060
    movdqa     %6, [esp]
    psubusb    %6, %8
    psubusb    %8, [esp]
    mov        esp, r0
    pop        r0
%else
    movdqa     %6, [WELS_DB96_16]
    psubusb    %6, %8
    psubusb    %8, [WELS_DB96_16]
%endif
    pminub     %6, %5
    pminub     %8, %5
    psubusb    %2, %6
    paddusb    %2, %8  ; p0
    paddusb    %3, %6
    psubusb    %3, %8  ; q0
%endmacro
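; A scalar sketch of the update performed by the macro above (names are
; illustrative; clip3/clamp are generic clamping helpers, not code in this file):
;   int d = (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1;
;   d  = d < -iTc ? -iTc : (d > iTc ? iTc : d);   // clip3(-iTc, iTc, d)
;   p0 = clamp(p0 + d, 0, 255);
;   q0 = clamp(q0 - d, 0, 255);
; The SIMD form splits d into a non-negative and a non-positive part so that all
; arithmetic stays within unsigned saturating bytes.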


;*******************************************************************************
;    void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                                 int32_t iBeta, int8_t * pTC)
;*******************************************************************************

WELS_EXTERN DeblockLumaLt4V_ssse3
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    movd     xmm1, arg3d
    movd     xmm2, arg4d
    pxor     xmm3, xmm3
%ifdef X86_32_PICASM
    push     r4
    mov      r4, esp
    sub      esp, 16
    and      esp, -16
    push     0x7f7f7f7f
    push     0x7f7f7f7f
    push     0x7f7f7f7f
    push     0x7f7f7f7f
    pxor     xmm1, [esp]
    pxor     xmm2, [esp]
%else
    pxor     xmm1, [WELS_DB127_16]
    pxor     xmm2, [WELS_DB127_16]
%endif
    pshufb   xmm1, xmm3                       ; iAlpha ^ 0x7f
    pshufb   xmm2, xmm3                       ; iBeta  ^ 0x7f
    mov      r2, r1                           ; iStride
    neg      r1                               ; -iStride
    lea      r3, [r0 + r1]                    ; pPix - iStride

    ; Compute masks to enable/disable deblocking.
    MOVDQ    xmm6, [r3 + 0 * r1]              ; p0
    MOVDQ    xmm7, [r3 + 1 * r1]              ; p1
    MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
    movdqa   xmm4, xmm6
    SSE2_AbsDiffUB xmm6, xmm0, xmm3           ; |p0 - q0|
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm6, xmm1, [esp]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
%else
    SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
%endif
    MOVDQ    xmm1, [r0 + 1 * r2]              ; q1
    SSE2_AbsDiffUB xmm7, xmm4, xmm3           ; |p1 - p0|
    SSE2_AbsDiffUB xmm0, xmm1, xmm3           ; |q1 - q0|
    pmaxub   xmm7, xmm0                       ; max(|p1 - p0|, |q1 - q0|)
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
%else
    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
%endif
    pand     xmm6, xmm7                       ; bDeltaP0Q0P1P0Q1Q0 = bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
    MOVDQ    xmm7, [r3 + 2 * r1]              ; p2
    movdqa   xmm0, xmm7
    SSE2_AbsDiffUB xmm7, xmm4, xmm3           ; |p2 - p0|
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP2P0 = |p2 - p0| < iBeta
%else
    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP2P0 = |p2 - p0| < iBeta
%endif
    MOVDQ    xmm5, [r0 + 2 * r2]              ; q2
    MOVDQ    xmm3, [r0 + 0 * r2]              ; q0
    movdqa   xmm1, xmm5
    SSE2_AbsDiffUB xmm5, xmm3, xmm4           ; |q2 - q0|
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm5, xmm2, [esp]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
    mov      esp, r4
    pop      r4
%else
    SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
%endif

    pavgb    xmm3, [r3 + 0 * r1]
    pcmpeqw  xmm2, xmm2  ; FFh
    pxor     xmm3, xmm2
    ; (p2 + ((p0 + q0 + 1) >> 1)) >> 1
    pxor     xmm0, xmm2
    pavgb    xmm0, xmm3
    pxor     xmm0, xmm2
    ; (q2 + ((p0 + q0 + 1) >> 1)) >> 1
    pxor     xmm1, xmm2
    pavgb    xmm1, xmm3
    pxor     xmm1, xmm2

    movd     xmm3, [r4]
%ifdef X86_32_PICASM
    push     r0
    mov      r0, esp
    sub      esp, 16
    and      esp, -16
    push     0x03030303    ;WELS_SHUFB0000111122223333
    push     0x02020202
    push     0x01010101
    push     0x00000000
    pshufb   xmm3, [esp] ; iTc
    mov      esp, r0
    pop      r0
%else
    pshufb   xmm3, [WELS_SHUFB0000111122223333] ; iTc
%endif
    movdqa   xmm4, xmm3  ; iTc0 = iTc
    pcmpgtb  xmm3, xmm2  ; iTc > -1 ? 0xff : 0x00
    pand     xmm6, xmm3  ; bDeltaP0Q0P1P0Q1Q0 &= iTc > -1
    movdqa   xmm3, xmm4
    psubb    xmm3, xmm7  ; iTc -= bDeltaP2P0 ? -1 : 0
    psubb    xmm3, xmm5  ; iTc -= bDeltaQ2Q0 ? -1 : 0
    pand     xmm3, xmm6  ; iTc &= bDeltaP0Q0P1P0Q1Q0 ? 0xff : 0
    pand     xmm7, xmm6  ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0
    pand     xmm5, xmm6  ; bDeltaQ2Q0 &= bDeltaP0Q0P1P0Q1Q0
    pand     xmm7, xmm4  ; iTc0 & (bDeltaP2P0 ? 0xff : 0)
    pand     xmm5, xmm4  ; iTc0 & (bDeltaQ2Q0 ? 0xff : 0)

    MOVDQ    xmm4, [r3 + 1 * r1]
    SSE2_ClipUB xmm0, xmm4, xmm7, xmm6  ; clip p1.
    MOVDQ    xmm6, [r0 + 1 * r2]
    MOVDQ    [r3 + 1 * r1], xmm0        ; store p1.
    SSE2_ClipUB xmm1, xmm6, xmm5, xmm7  ; clip q1.
    MOVDQ    [r0 + 1 * r2], xmm1        ; store q1.

    MOVDQ    xmm1, [r3 + 0 * r1]  ; p0
    MOVDQ    xmm0, [r0 + 0 * r2]  ; q0
    SSE2_DeblockP0Q0_Lt4 xmm4, xmm1, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7
    MOVDQ    [r3 + 0 * r1], xmm1  ; store p0.
    MOVDQ    [r0 + 0 * r2], xmm0  ; store q0.

    POP_XMM
    LOAD_5_PARA_POP
    ret


; Deblock 3x16 luma pixels for the eq4 case.
;
; Compose 8-bit averages from pavgbs. I.e., (p1 + p0 + p2 + q0 + 2) >> 2 can be
; written as (((p1 + p0) >> 1) + ((p2 + q0 + ((p1 ^ p0) & 1)) >> 1) + 1) >> 1,
; which maps to 3 pavgbs.
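; Worked example (sketch): p1=7, p0=8, p2=9, q0=10 gives
;   (7 + 8 + 9 + 10 + 2) >> 2 = 9, and
;   (((7 + 8) >> 1) + ((9 + 10 + ((7 ^ 8) & 1)) >> 1) + 1) >> 1 = (7 + 10 + 1) >> 1 = 9,
; so three pavgb-style averages reproduce the 4-term rounded average exactly
; once the dropped carry bit ((p1 + p0) & 1) is fed into the second average.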
;
; pPix=%1 iStride=%2 [in:q0,out:p0]=%3 [in:q1,out:p1]=%4 bDeltaP0Q0P1P0Q1Q0=%5 bDeltaP2P0=%6 clobber=%7,%8,%9,%10 preserve_p0p1=%11 db1=%12
%macro SSE2_DeblockLumaEq4_3x16P 12
    movdqa   %7, %3
    movdqa   %8, %6
    MOVDQ    %10, [%1 + 1 * %2]                      ; p1
    SSE2_Blend %7, %10, %8                           ; t0 = bDeltaP2P0 ? q0 : p1
    movdqa   %8, %6
    MOVDQ    %9, [%1 + 2 * %2]                       ; p2
    SSE2_Blend %9, %4, %8                            ; t1 = bDeltaP2P0 ? p2 : q1
    SSE2_AvgbFloor1 %4,  %9,   %12, %8               ; t1 = (t1 + q1) >> 1
    SSE2_AvgbFloor1 %10, [%1], %12, %8               ; (p0 + p1) >> 1, p0 ^ p1
    pxor     %8, %12
    SSE2_AvgbFloor1 %7, %4, %8, %9                   ; (t0 + t1 + (p0 ^ p1 & 1)) >> 1
    MOVDQ    %9, [%1 + 2 * %2]                       ; p2
    SSE2_AvgbFloor1 %3, %9, %8, %4                   ; (p2 + q0 + (p0 ^ p1 & 1)) >> 1
    pavgb    %7, %10                                 ; p0' = (p0 + p1 + t0 + t1 + 2) >> 2
    movdqa   %8, %10
    pxor     %8, %3                                  ; (p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1
    pand     %8, %12                                 ; & 1
    pavgb    %10, %3                                 ; p1' = (p0 + p1 + p2 + q0 + 2) >> 2
    pand     %6, %5                                  ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0
%if %11
    MOVDQ    %3, [%1 + 0 * %2]                       ; p0
    movdqa   %4, %5
    SSE2_Blend %7, %3, %4                            ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
%else
    SSE2_Blend %7, [%1 + 0 * %2], %5                 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
%endif
    MOVDQ    [%1 + 0 * %2], %7                       ; store p0
    add      %1, %2
    movdqa   %7, %10
    psubb    %10, %8                                 ; (p0 + p1 + p2 + q0) >> 2
    psubb    %8, %12
    MOVDQ    %4, [%1 + (3 - 1) * %2]                 ; p3
    SSE2_AvgbFloor2 %4, %9, %8                       ; (p2 + p3 + ((p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1 & 1)) >> 1
    pavgb    %10, %4                                 ; p2' = (((p0 + p1 + p2 + q0) >> 1) + p2 + p3 + 2) >> 2
    movdqa   %8, %6
    SSE2_Blend %10, [%1 + (2 - 1) * %2], %8          ; p2out = bDeltaP2P0 ? p2' : p2
    MOVDQ    [%1 + (2 - 1) * %2], %10                ; store p2
%if %11
    MOVDQ    %4, [%1 + (1 - 1) * %2]                 ; p1
    SSE2_Blend %7, %4, %6                            ; p1out = bDeltaP2P0 ? p1' : p1
%else
    SSE2_Blend %7, [%1 + (1 - 1) * %2], %6           ; p1out = bDeltaP2P0 ? p1' : p1
%endif
    MOVDQ    [%1 + (1 - 1) * %2], %7                 ; store p1
%endmacro
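; For reference, a scalar sketch of what the macro computes per column (these
; are the usual strong-filter expressions, which the macro composes from
; carry-corrected pavgb averages):
;   if (bDeltaP2P0) {
;       p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
;       p1' = (p2 + p1 + p0 + q0 + 2) >> 2;
;       p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
;   } else {
;       p0' = (2*p1 + p0 + q1 + 2) >> 2;   // p1, p2 left unchanged
;   }
; Results are written back only where bDeltaP0Q0P1P0Q1Q0 is set (and, for
; p1'/p2', where bDeltaP2P0 is also set).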


;*******************************************************************************
;    void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                                 int32_t iBeta)
;*******************************************************************************

WELS_EXTERN DeblockLumaEq4V_ssse3
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 10
    SIGN_EXTENSION r1, r1d
    movd     xmm1, arg3d
    movd     xmm2, arg4d
    shr      r2, 2
    add      r2, 1
    movd     xmm3, r2d
    pxor     xmm4, xmm4
%ifdef X86_32_PICASM
    push     r4
    mov      r4, esp
    sub      esp, 16
    and      esp, -16
    push     0x7f7f7f7f    ;WELS_DB127_16
    push     0x7f7f7f7f
    push     0x7f7f7f7f
    push     0x7f7f7f7f
    pxor     xmm1, [esp]
    pxor     xmm2, [esp]
%else
    pxor     xmm1, [WELS_DB127_16]
    pxor     xmm2, [WELS_DB127_16]
%endif
    pshufb   xmm1, xmm4                       ; iAlpha ^ 0x7f
    pshufb   xmm2, xmm4                       ; iBeta  ^ 0x7f
    pshufb   xmm3, xmm4                       ; (iAlpha >> 2) + 1
    mov      r2, r1                           ; iStride
    neg      r1                               ; -iStride
    lea      r3, [r0 + r1]                    ; pPix - iStride

    ; Compute masks to enable/disable filtering.
    MOVDQ    xmm7, [r3 + 1 * r1]              ; p1
    MOVDQ    xmm6, [r3 + 0 * r1]              ; p0
    MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
    movdqa   xmm4, xmm6
    SSE2_AbsDiffUB xmm6, xmm0, xmm5           ; |p0 - q0|
    SSE2_CmpgeUB xmm3, xmm6                   ; |p0 - q0| < (iAlpha >> 2) + 2
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm6, xmm1, [esp]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
%else
    SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
%endif
    MOVDQ    xmm1, [r0 + 1 * r2]              ; q1
    SSE2_AbsDiffUB xmm7, xmm4, xmm5           ; |p1 - p0|
    SSE2_AbsDiffUB xmm0, xmm1, xmm5           ; |q1 - q0|
    pmaxub   xmm7, xmm0                       ; max(|p1 - p0|, |q1 - q0|)
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
%else
    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
%endif
    pand     xmm6, xmm7                       ; & bDeltaP0Q0

    MOVDQ    xmm7, [r3 + 2 * r1]              ; p2
    SSE2_AbsDiffUB xmm7, xmm4, xmm5           ; |p2 - p0|
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP2P0 = |p2 - p0| < iBeta
%else
    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP2P0 = |p2 - p0| < iBeta
%endif
    pand     xmm7, xmm3                       ; &= |p0 - q0| < (iAlpha >> 2) + 2

    MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
    MOVDQ    xmm5, [r0 + 2 * r2]              ; q2
    SSE2_AbsDiffUB xmm5, xmm0, xmm4           ; |q2 - q0|
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm5, xmm2, [esp]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
    mov      esp, r4
    pop      r4
%else
    SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
%endif
    pand     xmm5, xmm3                       ; &= |p0 - q0| < (iAlpha >> 2) + 2

%ifdef X86_32
    ; Push xmm5 to free up one register. Align the stack so that a failed
    ; store-forwarding penalty (up to ~50 cycles for 128-bit on IVB) cannot occur.
    mov      r2, esp
    sub      esp,  16
    and      esp, -16
%ifdef X86_32_PICASM
    push     0x01010101
    push     0x01010101
    push     0x01010101
    push     0x01010101
    sub      esp, 16
    movdqa   [esp], xmm5
    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [esp+16]
    movdqa   xmm5, [esp]
    neg      r1
    SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [esp+16]
    mov      esp, r2
%else
    movdqa   [esp], xmm5
    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [WELS_DB1_16]
    movdqa   xmm5, [esp]
    mov      esp, r2
    neg      r1
    SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [WELS_DB1_16]
%endif
%else
    movdqa   xmm9, [WELS_DB1_16]
    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm8, xmm4, 1, xmm9
    SSE2_DeblockLumaEq4_3x16P r0, r2, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, xmm9
%endif

    POP_XMM
    LOAD_4_PARA_POP
    ret


; [out:p1,p0,q0,q1]=%1,%2,%3,%4 pPixCb=%5 pPixCr=%6 iStride=%7 3*iStride-1=%8 xmmclobber=%9,%10,%11
%macro SSE2_LoadCbCr_4x16H 11
    movd       %1,  [%5 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cb line 0
    movd       %2,  [%5 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cb line 2
    punpcklbw  %1,  %2                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 0,2
    movd       %2,  [%5 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cb line 4
    movd       %9,  [%5 + 2 * %8]      ; [p1,p0,q0,q1] cb line 6
    punpcklbw  %2,  %9                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 4,6
    punpcklwd  %1,  %2                 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cb line 0,2,4,6
    movd       %2,  [%6 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cr line 0
    movd       %9,  [%6 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cr line 2
    punpcklbw  %2,  %9                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 0,2
    movd       %9,  [%6 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cr line 4
    movd       %10, [%6 + 2 * %8]      ; [p1,p0,q0,q1] cr line 6
    punpcklbw  %9,  %10                ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 4,6
    punpcklwd  %2,  %9                 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cr line 0,2,4,6
    add        %5,  %7                 ; pPixCb += iStride
    add        %6,  %7                 ; pPixCr += iStride
    movd       %9,  [%5 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cb line 1
    movd       %10, [%5 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cb line 3
    punpcklbw  %9,  %10                ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 1,3
    movd       %10, [%5 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cb line 5
    movd       %3,  [%5 + 2 * %8]      ; [p1,p0,q0,q1] cb line 7
    punpcklbw  %10, %3                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 5,7
    punpcklwd  %9,  %10                ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cb line 1,3,5,7
    movd       %10, [%6 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cr line 1
    movd       %3,  [%6 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cr line 3
    punpcklbw  %10, %3                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 1,3
    movd       %3,  [%6 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cr line 5
    movd       %4,  [%6 + 2 * %8]      ; [p1,p0,q0,q1] cr line 7
    punpcklbw  %3,  %4                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 5,7
    punpcklwd  %10, %3                 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cr line 1,3,5,7
    movdqa     %3,  %1
    punpckldq  %1,  %2                 ; [p1,p1,p1,p1,p1,p1,p1,p1,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 0,2,4,6
    punpckhdq  %3,  %2                 ; [q0,q0,q0,q0,q0,q0,q0,q0,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 0,2,4,6
    movdqa     %11, %9
    punpckldq  %9,  %10                ; [p1,p1,p1,p1,p1,p1,p1,p1,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 1,3,5,7
    punpckhdq  %11, %10                ; [q0,q0,q0,q0,q0,q0,q0,q0,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 1,3,5,7
    movdqa     %2,  %1
    punpcklqdq %1,  %9                 ; [p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1] cb/cr line 0,2,4,6,1,3,5,7
    punpckhqdq %2,  %9                 ; [p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 0,2,4,6,1,3,5,7
    movdqa     %4,  %3
    punpcklqdq %3,  %11                ; [q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0] cb/cr line 0,2,4,6,1,3,5,7
    punpckhqdq %4,  %11                ; [q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 0,2,4,6,1,3,5,7
%endmacro

; pPixCb+iStride=%1 pPixCr+iStride=%2 iStride=%3 3*iStride-1=%4 p0=%5 q0=%6 rclobber=%7 dwclobber={%8,%9} xmmclobber=%10
%macro SSE2_StoreCbCr_4x16H 10
    movdqa     %10, %5
    punpcklbw  %10, %6                 ; [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 0,2,4,6
    punpckhbw  %5, %6                  ; [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 1,3,5,7
    mov        %7, r7                  ; preserve stack pointer
    and        r7, -16                 ; align stack pointer
    sub        r7, 32                  ; allocate stack space
    movdqa     [r7     ], %10          ; store [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 0,2,4,6 on the stack
    movdqa     [r7 + 16], %5           ; store [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 1,3,5,7 on the stack
    mov        %8, [r7 + 16]           ; [p0,q0,p0,q0] cb line 1,3
    mov        [%1 + 0 * %3 - 1], %9   ; store [p0,q0] cb line 1
    shr        %8, 16                  ; [p0,q0] cb line 3
    mov        [%1 + 2 * %3 - 1], %9   ; store [p0,q0] cb line 3
    mov        %8, [r7 + 20]           ; [p0,q0,p0,q0] cb line 5,7
    mov        [%1 + 4 * %3 - 1], %9   ; store [p0,q0] cb line 5
    shr        %8, 16                  ; [p0,q0] cb line 7
    mov        [%1 + 2 * %4 + 1], %9   ; store [p0,q0] cb line 7
    mov        %8, [r7 + 24]           ; [p0,q0,p0,q0] cr line 1,3
    mov        [%2 + 0 * %3 - 1], %9   ; store [p0,q0] cr line 1
    shr        %8, 16                  ; [p0,q0] cr line 3
    mov        [%2 + 2 * %3 - 1], %9   ; store [p0,q0] cr line 3
    mov        %8, [r7 + 28]           ; [p0,q0,p0,q0] cr line 5,7
    mov        [%2 + 4 * %3 - 1], %9   ; store [p0,q0] cr line 5
    shr        %8, 16                  ; [p0,q0] cr line 7
    mov        [%2 + 2 * %4 + 1], %9   ; store [p0,q0] cr line 7
    sub        %1, %3                  ; pPixCb -= iStride
    sub        %2, %3                  ; pPixCr -= iStride
    mov        %8, [r7     ]           ; [p0,q0,p0,q0] cb line 0,2
    mov        [%1 + 0 * %3 - 1], %9   ; store [p0,q0] cb line 0
    shr        %8, 16                  ; [p0,q0] cb line 2
    mov        [%1 + 2 * %3 - 1], %9   ; store [p0,q0] cb line 2
    mov        %8, [r7 +  4]           ; [p0,q0,p0,q0] cb line 4,6
    mov        [%1 + 4 * %3 - 1], %9   ; store [p0,q0] cb line 4
    shr        %8, 16                  ; [p0,q0] cb line 6
    mov        [%1 + 2 * %4 + 1], %9   ; store [p0,q0] cb line 6
    mov        %8, [r7 +  8]           ; [p0,q0,p0,q0] cr line 0,2
    mov        [%2 + 0 * %3 - 1], %9   ; store [p0,q0] cr line 0
    shr        %8, 16                  ; [p0,q0] cr line 2
    mov        [%2 + 2 * %3 - 1], %9   ; store [p0,q0] cr line 2
    mov        %8, [r7 + 12]           ; [p0,q0,p0,q0] cr line 4,6
    mov        [%2 + 4 * %3 - 1], %9   ; store [p0,q0] cr line 4
    shr        %8, 16                  ; [p0,q0] cr line 6
    mov        [%2 + 2 * %4 + 1], %9   ; store [p0,q0] cr line 6
    mov        r7, %7                  ; restore stack pointer
%endmacro

; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 pTC=%7 xmmclobber=%8,%9,%10 interleaveTC=%11
%macro SSSE3_DeblockChromaLt4 11
    movdqa     %8, %3
    SSE2_AbsDiffUB %8, %2, %9           ; |p0 - q0|
    SSE2_CmpgeUB %8, %5                 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
    movdqa     %9, %4
    SSE2_AbsDiffUB %9, %3, %5           ; |q1 - q0|
    movdqa     %10, %1
    SSE2_AbsDiffUB %10, %2, %5          ; |p1 - p0|
    pmaxub     %9, %10                  ; max(|q1 - q0|, |p1 - p0|)
    pxor       %10, %10
    movd       %5, %6
    pshufb     %5, %10                  ; iBeta
    SSE2_CmpgeUB %9, %5                 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
    por        %8, %9                   ; | !bDeltaP0Q0
    movd       %5, [%7]
%if %11
    punpckldq  %5, %5
    punpcklbw  %5, %5                   ; iTc
%else
    pshufd     %5, %5, 0                ; iTc
%endif
    pcmpeqw    %10, %10                 ; FFh
    movdqa     %9, %5
    pcmpgtb    %9, %10                  ; iTc > -1 ? FFh : 00h
    pandn      %8, %5                   ; iTc & bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
    pand       %8, %9                   ; &= (iTc > -1 ? FFh : 00h)
    SSE2_DeblockP0Q0_Lt4 %1, %2, %3, %4, %8, %10, %5, %9
%endmacro
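; Note: this chroma lt4 path reuses SSE2_DeblockP0Q0_Lt4 for the p0/q0 update,
; and the callers below store only p0 and q0, so chroma p1/q1 are never modified.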

; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 xmmclobber=%7,%8,%9
%macro SSSE3_DeblockChromaEq4 9
    movdqa   %7, %3
    SSE2_AbsDiffUB %7, %2, %8         ; |p0 - q0|
    SSE2_CmpgeUB %7, %5               ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
    movdqa   %8, %4
    SSE2_AbsDiffUB %8, %3, %5         ; |q1 - q0|
    movdqa   %9, %1
    SSE2_AbsDiffUB %9, %2, %5         ; |p1 - p0|
    pmaxub   %8, %9                   ; max(|q1 - q0|, |p1 - p0|)
    pxor     %9, %9
    movd     %5, %6
    pshufb   %5, %9                   ; iBeta
    SSE2_CmpgeUB %8, %5               ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
    por      %7, %8                   ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0
    WELS_DB1 %5
    movdqa   %8, %2
    SSE2_AvgbFloor1 %8, %4, %5, %9    ; (p0 + q1) >> 1
    pavgb    %8, %1                   ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
    movdqa   %9, %7
    SSE2_Blend %2, %8, %7             ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
    SSE2_AvgbFloor1 %1, %3, %5, %7    ; (q0 + p1) >> 1
    pavgb    %1, %4                   ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
    SSE2_Blend %3, %1, %9             ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0
%endmacro
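; Scalar sketch of the chroma eq4 filter above: the nested averages are an exact
; decomposition of the usual 4-term rounded averages,
;   p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1 == (2*p1 + p0 + q1 + 2) >> 2
;   q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1 == (2*q1 + q0 + p1 + 2) >> 2
; applied only where |p0 - q0| < iAlpha and max(|p1 - p0|, |q1 - q0|) < iBeta.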


;******************************************************************************
; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************

WELS_EXTERN DeblockChromaLt4V_ssse3
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    movd     xmm7, arg4d
    pxor     xmm0, xmm0
    pshufb   xmm7, xmm0                       ; iAlpha
    mov      r3, r2
    neg      r3                               ; -iStride

    movq     xmm0, [r0 + 0 * r2]              ; q0 cb
    movhps   xmm0, [r1 + 0 * r2]              ; q0 cr
    movq     xmm2, [r0 + 1 * r3]              ; p0 cb
    movhps   xmm2, [r1 + 1 * r3]              ; p0 cr
    movq     xmm1, [r0 + 1 * r2]              ; q1 cb
    movhps   xmm1, [r1 + 1 * r2]              ; q1 cr
    movq     xmm3, [r0 + 2 * r3]              ; p1 cb
    movhps   xmm3, [r1 + 2 * r3]              ; p1 cr

%ifidni arg6, r5
    SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, arg6, xmm4, xmm5, xmm6, 1
%else
    mov      r2, arg6
    SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, r2,   xmm4, xmm5, xmm6, 1
%endif

    movlps   [r0 + 1 * r3], xmm2              ; store p0 cb
    movhps   [r1 + 1 * r3], xmm2              ; store p0 cr
    movlps   [r0         ], xmm0              ; store q0 cb
    movhps   [r1         ], xmm0              ; store q0 cr

    POP_XMM
    LOAD_4_PARA_POP
    ret


;********************************************************************************
;  void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                             int32_t iAlpha, int32_t iBeta)
;********************************************************************************

WELS_EXTERN DeblockChromaEq4V_ssse3
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    movd     xmm7, arg4d
    pxor     xmm0, xmm0
    pshufb   xmm7, xmm0                       ; iAlpha
    mov      r3, r2
    neg      r3                               ; -iStride

    movq     xmm0, [r0 + 0 * r2]              ; q0 cb
    movhps   xmm0, [r1 + 0 * r2]              ; q0 cr
    movq     xmm2, [r0 + 1 * r3]              ; p0 cb
    movhps   xmm2, [r1 + 1 * r3]              ; p0 cr
    movq     xmm1, [r0 + 1 * r2]              ; q1 cb
    movhps   xmm1, [r1 + 1 * r2]              ; q1 cr
    movq     xmm3, [r0 + 2 * r3]              ; p1 cb
    movhps   xmm3, [r1 + 2 * r3]              ; p1 cr

    SSSE3_DeblockChromaEq4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, xmm4, xmm5, xmm6

    movlps   [r0 + 1 * r3], xmm2              ; store p0 cb
    movhps   [r1 + 1 * r3], xmm2              ; store p0 cr
    movlps   [r0 + 0 * r2], xmm0              ; store q0 cb
    movhps   [r1 + 0 * r2], xmm0              ; store q0 cr

    POP_XMM
    LOAD_4_PARA_POP
    ret


;*******************************************************************************
;    void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************

WELS_EXTERN DeblockChromaLt4H_ssse3
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    movd       xmm7, arg4d
    pxor       xmm0, xmm0
    pshufb     xmm7, xmm0                       ; iAlpha
    lea        r3, [3 * r2 - 1]                 ; 3 * iStride - 1

    SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
    SSSE3_DeblockChromaLt4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, r5, xmm2, xmm3, xmm6, 0
    SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0

    POP_XMM
    LOAD_6_PARA_POP
    ret


;***************************************************************************
;  void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;          int32_t iAlpha, int32_t iBeta)
;***************************************************************************

WELS_EXTERN DeblockChromaEq4H_ssse3
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    movd       xmm7, arg4d
    pxor       xmm0, xmm0
    pshufb     xmm7, xmm0                       ; iAlpha
    lea        r3, [3 * r2 - 1]                 ; 3 * iStride - 1

    SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
    SSSE3_DeblockChromaEq4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, xmm2, xmm3, xmm6
%ifdef X86_32
    push r4
    push r5
    SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
    pop r5
    pop r4
%else
    SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
%endif

    POP_XMM
    LOAD_4_PARA_POP
    ret


;********************************************************************************
;
;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
;********************************************************************************

WELS_EXTERN DeblockLumaTransposeH2V_sse2
    push     r3
    push     r4
    push     r5

%assign   push_num   3
    LOAD_3_PARA
    PUSH_XMM 8

    SIGN_EXTENSION   r1, r1d

    mov      r5,    r7
    mov      r3,    r7
    and      r3,    0Fh
    sub      r7,    r3
    sub      r7,    10h

    lea      r3,    [r0 + r1 * 8]
    lea      r4,    [r1 * 3]

    movq    xmm0,  [r0]
    movq    xmm7,  [r3]
    punpcklqdq   xmm0,  xmm7
    movq    xmm1,  [r0 + r1]
    movq    xmm7,  [r3 + r1]
    punpcklqdq   xmm1,  xmm7
    movq    xmm2,  [r0 + r1*2]
    movq    xmm7,  [r3 + r1*2]
    punpcklqdq   xmm2,  xmm7
    movq    xmm3,  [r0 + r4]
    movq    xmm7,  [r3 + r4]
    punpcklqdq   xmm3,  xmm7

    lea     r0,   [r0 + r1 * 4]
    lea     r3,   [r3 + r1 * 4]
    movq    xmm4,  [r0]
    movq    xmm7,  [r3]
    punpcklqdq   xmm4,  xmm7
    movq    xmm5,  [r0 + r1]
    movq    xmm7,  [r3 + r1]
    punpcklqdq   xmm5,  xmm7
    movq    xmm6,  [r0 + r1*2]
    movq    xmm7,  [r3 + r1*2]
    punpcklqdq   xmm6,  xmm7

    movdqa  [r7],   xmm0
    movq    xmm7,  [r0 + r4]
    movq    xmm0,  [r3 + r4]
    punpcklqdq   xmm7,  xmm0
    movdqa  xmm0,   [r7]

    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1

    movdqa  [r2],    xmm4
    movdqa  [r2 + 10h],  xmm2
    movdqa  [r2 + 20h],  xmm3
    movdqa  [r2 + 30h],  xmm7
    movdqa  [r2 + 40h],  xmm5
    movdqa  [r2 + 50h],  xmm1
    movdqa  [r2 + 60h],  xmm6
    movdqa  [r2 + 70h],  xmm0

    mov     r7,   r5
    POP_XMM
    pop     r5
    pop     r4
    pop     r3
    ret


;*******************************************************************************************
;
;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
;
;*******************************************************************************************

WELS_EXTERN DeblockLumaTransposeV2H_sse2
    push     r3
    push     r4

%assign  push_num 2
    LOAD_3_PARA
    PUSH_XMM 8

    SIGN_EXTENSION   r1, r1d

    mov      r4,    r7
    mov      r3,    r7
    and      r3,    0Fh
    sub      r7,    r3
    sub      r7,    10h

    movdqa   xmm0,   [r2]
    movdqa   xmm1,   [r2 + 10h]
    movdqa   xmm2,   [r2 + 20h]
    movdqa   xmm3,   [r2 + 30h]
    movdqa   xmm4,   [r2 + 40h]
    movdqa   xmm5,   [r2 + 50h]
    movdqa   xmm6,   [r2 + 60h]
    movdqa   xmm7,   [r2 + 70h]

    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1

    lea      r2,   [r1 * 3]

    movq     [r0],  xmm4
    movq     [r0 + r1],  xmm2
    movq     [r0 + r1*2],  xmm3
    movq     [r0 + r2],  xmm7

    lea      r0,   [r0 + r1*4]
    movq     [r0],  xmm5
    movq     [r0 + r1],  xmm1
    movq     [r0 + r1*2],  xmm6
    movq     [r0 + r2],  xmm0

    psrldq    xmm4,   8
    psrldq    xmm2,   8
    psrldq    xmm3,   8
    psrldq    xmm7,   8
    psrldq    xmm5,   8
    psrldq    xmm1,   8
    psrldq    xmm6,   8
    psrldq    xmm0,   8

    lea       r0,  [r0 + r1*4]
    movq     [r0],  xmm4
    movq     [r0 + r1],  xmm2
    movq     [r0 + r1*2],  xmm3
    movq     [r0 + r2],  xmm7

    lea      r0,   [r0 + r1*4]
    movq     [r0],  xmm5
    movq     [r0 + r1],  xmm1
    movq     [r0 + r1*2],  xmm6
    movq     [r0 + r2],  xmm0


    mov      r7,   r4
    POP_XMM
    pop      r4
    pop      r3
    ret

WELS_EXTERN WelsNonZeroCount_sse2
    %assign  push_num 0
    LOAD_1_PARA
    movdqu  xmm0, [r0]
    movq    xmm1, [r0+16]
    WELS_DB1 xmm2
    pminub  xmm0, xmm2
    pminub  xmm1, xmm2
    movdqu  [r0], xmm0
    movq    [r0+16], xmm1
    ret