ref: 75f242d49ab0f5da5f118a5aebc403f1eef74a9a
dir: /codec/common/x86/satd_sad.asm/
;*! ;* \copy ;* Copyright (c) 2009-2013, Cisco Systems ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without ;* modification, are permitted provided that the following conditions ;* are met: ;* ;* * Redistributions of source code must retain the above copyright ;* notice, this list of conditions and the following disclaimer. ;* ;* * Redistributions in binary form must reproduce the above copyright ;* notice, this list of conditions and the following disclaimer in ;* the documentation and/or other materials provided with the ;* distribution. ;* ;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ;* POSSIBILITY OF SUCH DAMAGE. 
;*
;*
;*  satd_sad.asm
;*
;*  Abstract
;*      WelsSampleSatd4x4_sse2
;*      WelsSampleSatd8x8_sse2
;*      WelsSampleSatd16x8_sse2
;*      WelsSampleSatd8x16_sse2
;*      WelsSampleSatd16x16_sse2
;*
;*      WelsSampleSad16x8_sse2
;*      WelsSampleSad16x16_sse2
;*
;*  History
;*      8/5/2009 Created
;*     24/9/2009 modified
;*
;*
;*************************************************************************/

; NASM syntax. Register aliases (r0..r6, retrd, argN) and the helper macros
; WELS_EXTERN, LOAD_*_PARA, PUSH_XMM/POP_XMM, SIGN_EXTENSION, SSE2_SumSub,
; SSE2_LoadDiff8P, SSE2_TransTwo4x4W, SSE2_XSawp, WELS_AbsW and WELSEMMS are
; not defined in this file; they are expected to come from asm_inc.asm.
%include "asm_inc.asm"

;***********************************************************************
; Data
;***********************************************************************
SECTION .rodata align=16

; Byte weights for pmaddubsw: the low 8 bytes (+1,+1) produce sums of
; adjacent byte pairs, the high 8 bytes (+1,-1) produce their differences.
align 16
HSumSubDB1:
    db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1

; Word weights for pmaddwd: (+1,-1) produces differences of adjacent words.
align 16
HSumSubDW1:
    dw 1,-1,1,-1,1,-1,1,-1

; Word weights for pmaddwd: (+1,+1) produces sums of adjacent words.
align 16
PDW1:
    dw 1,1,1,1,1,1,1,1

; As dwords this is {2,0,2,0}: the rounding term of the chroma "(sum+2)>>2".
align 16
PDQ2:
    dw 2,0,0,0,2,0,0,0

; Byte weights for pmaddubsw used by WelsSampleSatd4x4_sse41: per 8 bytes,
; four (+1,+1) weights followed by four (+1,-1) weights.
align 16
HSwapSumSubDB1:
    times 2 db 1, 1, 1, 1, 1, -1, 1, -1

;***********************************************************************
; Code
;***********************************************************************
SECTION .text

;***********************************************************************
;
;Pixel_satd_wxh_sse2 BEGIN
;
;***********************************************************************

; MMX_DW_1_2REG dst, tmp
;   dst <- eight words of 1 (computed as 0 - (-1)); tmp <- all bits set.
%macro MMX_DW_1_2REG 2
    pxor %1, %1
    pcmpeqw %2, %2
    psubw %1, %2
%endmacro

; SSE2_SumWHorizon1 acc, tmp
;   Folds the eight words of acc together with unsigned saturating adds;
;   the total ends up in the low word of acc. tmp is clobbered.
%macro SSE2_SumWHorizon1 2
    movdqa %2, %1
    psrldq %2, 8
    paddusw %1, %2
    movdqa %2, %1
    psrldq %2, 4
    paddusw %1, %2
    movdqa %2, %1
    psrldq %2, 2
    paddusw %1, %2
%endmacro

; SSE2_HDMTwo4x4 a, b, c, d, tmp
;   Four SSE2_SumSub butterflies over (a,b), (c,d), (b,d), (a,c).
%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
    SSE2_SumSub %1, %2, %5
    SSE2_SumSub %3, %4, %5
    SSE2_SumSub %2, %4, %5
    SSE2_SumSub %1, %3, %5
%endmacro

; SSE2_SumAbs4 a, b, tmp1, c, d, tmp2, acc
;   acc += |a| + |b| + |c| + |d| (word-wise, unsigned saturating).
;   a, b, c, d, tmp1 and tmp2 are clobbered.
%macro SSE2_SumAbs4 7
    WELS_AbsW %1, %3
    WELS_AbsW %2, %3
    WELS_AbsW %4, %6
    WELS_AbsW %5, %6
    paddusw %1, %2
    paddusw %4, %5
    paddusw %7, %1
    paddusw %7, %4
%endmacro

; SSE2_SumWHorizon acc, tmp, zero
;   Sums the eight words d0..d7 of acc into the low dword of acc.
;   The third operand is interleaved in as the high words, so callers
;   pass a zeroed register there.
%macro SSE2_SumWHorizon 3
    movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
    paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
    punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
    movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
    paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
    pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
    paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
%endmacro

; SSE2_GetSatd8x8
;   Processes eight rows, read from [r0] (stride r1) and [r2] (stride r3),
;   as two groups of four rows, and accumulates absolute transformed
;   values into xmm6. xmm7 is handed to SSE2_LoadDiff8P (callers zero it).
;   On exit r0 and r2 have advanced by six rows, so a caller that wants
;   the next eight rows still has to step both pointers by two rows.
;   Clobbers xmm0-xmm5.
%macro SSE2_GetSatd8x8 0
    ; rows 0..3
    SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
    SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
    SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]

    SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
    SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
    SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
    SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6

    ; rows 4..7
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
    SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
    SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]

    SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
    SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
    SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
    SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; In:  r0/r1 = first 4x4 block and its stride, r2/r3 = second 4x4 block
;      and its stride (strides are sign-extended from 32 bits).
; Out: retrd = (16-bit saturated sum of absolute transformed
;      differences) >> 1.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; gather the four rows of the first block: rows 0,2 -> xmm0, rows 1,3 -> xmm1
    movd xmm0, [r0]
    movd xmm1, [r0+r1]
    lea r0 , [r0+2*r1]
    movd xmm2, [r0]
    movd xmm3, [r0+r1]
    punpckldq xmm0, xmm2
    punpckldq xmm1, xmm3

    ; same layout for the second block in xmm4 / xmm5
    movd xmm4, [r2]
    movd xmm5, [r2+r3]
    lea r2 , [r2+2*r3]
    movd xmm6, [r2]
    movd xmm7, [r2+r3]
    punpckldq xmm4, xmm6
    punpckldq xmm5, xmm7

    ; widen bytes to words (xmm6 = 0; it is also the accumulator later on)
    pxor xmm6, xmm6
    punpcklbw xmm0, xmm6
    punpcklbw xmm1, xmm6
    punpcklbw xmm4, xmm6
    punpcklbw xmm5, xmm6

    ; word differences between the two blocks
    psubw xmm0, xmm4
    psubw xmm1, xmm5

    ; sum/difference butterflies with data swaps in between
    movdqa xmm2, xmm0
    paddw xmm0, xmm1
    psubw xmm2, xmm1
    SSE2_XSawp qdq, xmm0, xmm2, xmm3

    movdqa xmm4, xmm0
    paddw xmm0, xmm3
    psubw xmm4, xmm3

    movdqa xmm2, xmm0
    punpcklwd xmm0, xmm4
    punpckhwd xmm4, xmm2

    SSE2_XSawp dq, xmm0, xmm4, xmm3
    SSE2_XSawp qdq, xmm0, xmm3, xmm5

    movdqa xmm7, xmm0
    paddw xmm0, xmm5
    psubw xmm7, xmm5

    SSE2_XSawp qdq, xmm0, xmm7, xmm1

    movdqa xmm2, xmm0
    paddw xmm0, xmm1
    psubw xmm2, xmm1

    ; accumulate absolute values and reduce to a single 16-bit total
    WELS_AbsW xmm0, xmm3
    paddusw xmm6, xmm0
    WELS_AbsW xmm2, xmm4
    paddusw xmm6, xmm2
    SSE2_SumWHorizon1 xmm6, xmm4
    movd retrd, xmm6
    and retrd, 0xffff
    shr retrd, 1
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; In:  r0/r1 = first block and stride, r2/r3 = second block and stride.
; Out: retrd = sum of the per-column accumulators, each halved first.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor xmm6, xmm6 ; accumulator
    pxor xmm7, xmm7 ; zero register
    SSE2_GetSatd8x8
    psrlw xmm6, 1
    SSE2_SumWHorizon xmm6,xmm4,xmm7
    movd retrd, xmm6
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; Same contract as WelsSampleSatd8x8_sse2, over two vertically stacked
; 8x8 blocks.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor xmm6, xmm6 ; accumulator
    pxor xmm7, xmm7 ; zero register

    SSE2_GetSatd8x8
    ; the macro leaves the pointers at row 6; step to row 8
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    SSE2_GetSatd8x8

    psrlw xmm6, 1
    SSE2_SumWHorizon xmm6,xmm4,xmm7
    movd retrd, xmm6
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; Same contract as WelsSampleSatd8x8_sse2, over two horizontally adjacent
; 8x8 blocks (columns 0..7, then columns 8..15).
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    ; keep the block origins so the right half can restart from row 0
    push r0
    push r2
    pxor xmm6, xmm6 ; accumulator
    pxor xmm7, xmm7 ; zero register

    SSE2_GetSatd8x8

    pop r2
    pop r0
    add r0, 8
    add r2, 8
    SSE2_GetSatd8x8

    psrlw xmm6, 1
    SSE2_SumWHorizon xmm6,xmm4,xmm7
    movd retrd, xmm6
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd16x16_sse2
; In:  r0/r1 = first block and stride, r2/r3 = second block and stride.
; Out: retrd = sum of the per-column accumulators, each halved first.
; The 16x16 area is walked as four 8x8 blocks: left column top/bottom,
; then right column top/bottom.
WELS_EXTERN WelsSampleSatd16x16_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    ; keep the block origins so the right half can restart from row 0
    push r0
    push r2
    pxor xmm6, xmm6 ; accumulator
    pxor xmm7, xmm7 ; zero register

    SSE2_GetSatd8x8
    ; the macro leaves the pointers at row 6; step to row 8
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    SSE2_GetSatd8x8

    pop r2
    pop r0
    add r0, 8
    add r2, 8

    SSE2_GetSatd8x8
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    SSE2_GetSatd8x8

    ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
    psrlw xmm6, 1
    SSE2_SumWHorizon xmm6,xmm4,xmm7
    movd retrd, xmm6
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;Pixel_satd_wxh_sse2 END
;
;***********************************************************************

;***********************************************************************
;
;Pixel_satd_intra_sse2 BEGIN
;
;***********************************************************************

; SSE_DB_1_2REG dst, tmp
;   dst <- sixteen bytes of 1 (computed as 0 - (-1)); tmp <- all bits set.
%macro SSE_DB_1_2REG 2
    pxor %1, %1
    pcmpeqw %2, %2
    psubb %1, %2
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
;                                     uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
;
; Evaluates three 4x4 intra predictions of the block at pEnc (arg3/arg4),
; built from the row above and the column left of pDec (arg1/arg2):
;   V  - predicted from the top row
;   H  - predicted from the left column
;   DC - predicted from (top sum + left sum + 4) >> 3
; arg7, arg8 and arg9 are added to the DC, H and V costs respectively
; before they are compared (as 16-bit signed values).
; The winning 16-byte prediction is stored at arg5 with movdqa, so that
; buffer must be 16-byte aligned. *arg6 receives 2 (DC), 1 (H) or 0 (V).
; Returns the winning cost.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatdThree4x4_sse2

%ifdef X86_32
    push r3
    push r4
    push r5
    push r6
    %assign push_num 4
%else
    %assign push_num 0
%endif
    PUSH_XMM 8

    mov r2, arg3
    mov r3, arg4
    SIGN_EXTENSION r3, r3d

    ; load source 4x4 samples and Hadamard transform
    movd xmm0, [r2]
    movd xmm1, [r2+r3]
    lea r2 , [r2+2*r3]
    movd xmm2, [r2]
    movd xmm3, [r2+r3]
    punpckldq xmm0, xmm2
    punpckldq xmm1, xmm3

    pxor xmm6, xmm6
    punpcklbw xmm0, xmm6
    punpcklbw xmm1, xmm6

    movdqa xmm2, xmm0
    paddw xmm0, xmm1
    psubw xmm2, xmm1
    SSE2_XSawp qdq, xmm0, xmm2, xmm3

    movdqa xmm4, xmm0
    paddw xmm0, xmm3
    psubw xmm4, xmm3

    movdqa xmm2, xmm0
    punpcklwd xmm0, xmm4
    punpckhwd xmm4, xmm2

    SSE2_XSawp dq, xmm0, xmm4, xmm3
    SSE2_XSawp qdq, xmm0, xmm3, xmm5

    movdqa xmm7, xmm0
    paddw xmm0, xmm5
    psubw xmm7, xmm5

    SSE2_XSawp qdq, xmm0, xmm7, xmm1

    ; Hadamard transform results are saved in xmm0 and xmm2
    movdqa xmm2, xmm0
    paddw xmm0, xmm1
    psubw xmm2, xmm1

    ;load top boundary samples: [a b c d]
    mov r0, arg1
    mov r1, arg2
    SIGN_EXTENSION r1, r1d
    sub r0, r1
%ifdef UNIX64
    ; r4 and r5 are used as scratch below; keep their incoming values
    push r4
    push r5
%endif

    movzx r2d, byte [r0]
    movzx r3d, byte [r0+1]
    movzx r4d, byte [r0+2]
    movzx r5d, byte [r0+3]

    ; get the transform results of top boundary samples: [a b c d]
    add r3d, r2d ; r3d = a + b
    add r5d, r4d ; r5d = c + d
    add r2d, r2d ; r2d = a + a
    add r4d, r4d ; r4d = c + c
    sub r2d, r3d ; r2d = a + a - a - b = a - b
    sub r4d, r5d ; r4d = c + c - c - d = c - d
    add r5d, r3d ; r5d = (a + b) + (c + d)
    add r3d, r3d
    sub r3d, r5d ; r3d = (a + b) - (c + d)
    add r4d, r2d ; r4d = (a - b) + (c - d)
    add r2d, r2d
    sub r2d, r4d ; r2d = (a - b) - (c - d)
    ; [r5d r3d r2d r4d]

    movdqa xmm6, xmm0
    movdqa xmm7, xmm2
    movd xmm5, r5d ; keep the top-row sum (r5d) for the DC mode
    pxor xmm3, xmm3
    pxor xmm4, xmm4
    pinsrw xmm3, r5d, 0
    pinsrw xmm3, r4d, 4
    psllw xmm3, 2
    pinsrw xmm4, r3d, 0
    pinsrw xmm4, r2d, 4
    psllw xmm4, 2

    ; get the satd of V (prediction from the top row)
    psubw xmm0, xmm3
    psubw xmm2, xmm4

    WELS_AbsW xmm0, xmm1
    WELS_AbsW xmm2, xmm1
    paddusw xmm0, xmm2
    SSE2_SumWHorizon1 xmm0, xmm1 ; satd of V is stored in xmm0

    ;load left boundary samples: [a b c d]'
    add r0, r1

    movzx r2d, byte [r0-1]
    movzx r3d, byte [r0+r1-1]
    lea r0 , [r0+2*r1]
    movzx r4d, byte [r0-1]
    movzx r5d, byte [r0+r1-1]

    ; get the transform results of left boundary samples: [a b c d]'
    add r3d, r2d ; r3d = a + b
    add r5d, r4d ; r5d = c + d
    add r2d, r2d ; r2d = a + a
    add r4d, r4d ; r4d = c + c
    sub r2d, r3d ; r2d = a + a - a - b = a - b
    sub r4d, r5d ; r4d = c + c - c - d = c - d
    add r5d, r3d ; r5d = (a + b) + (c + d)
    add r3d, r3d
    sub r3d, r5d ; r3d = (a + b) - (c + d)
    add r4d, r2d ; r4d = (a - b) + (c - d)
    add r2d, r2d
    sub r2d, r4d ; r2d = (a - b) - (c - d)
    ; [r5d r3d r2d r4d]

    ; store the transform results in xmm3
    movd xmm3, r5d
    pinsrw xmm3, r3d, 1
    pinsrw xmm3, r2d, 2
    pinsrw xmm3, r4d, 3
    psllw xmm3, 2

    ; get the satd of H (prediction from the left column)
    movdqa xmm2, xmm6
    movdqa xmm4, xmm7
    psubw xmm2, xmm3
    WELS_AbsW xmm2, xmm1
    WELS_AbsW xmm4, xmm1
    paddusw xmm2, xmm4
    SSE2_SumWHorizon1 xmm2, xmm1 ; satd of H is stored in xmm2

    ; DC result is stored in xmm1
    add r5d, 4
    movd xmm1, r5d
    paddw xmm1, xmm5
    psrlw xmm1, 3
    movdqa xmm5, xmm1 ; xmm5 keeps the DC sample value for the output block
    psllw xmm1, 4

    ; get the satd of DC
    psubw xmm6, xmm1
    WELS_AbsW xmm6, xmm1
    WELS_AbsW xmm7, xmm1
    paddusw xmm6, xmm7
    SSE2_SumWHorizon1 xmm6, xmm1 ; satd of DC is stored in xmm6
%ifdef UNIX64
    pop r5
    pop r4
%endif
    ; comparing order: DC H V

    mov r4, arg5
    movd r2d, xmm6 ; DC
    movd r3d, xmm2 ; H
    movd r6d, xmm0 ; V

    and r2d, 0xffff
    shr r2d, 1
    and r3d, 0xffff
    shr r3d, 1
    and r6d, 0xffff
    shr r6d, 1
    add r2d, dword arg7
    add r3d, dword arg8
    add r6d, dword arg9
    cmp r2w, r3w
    jg near not_dc
    cmp r2w, r6w
    jg near not_dc_h

    ; for DC mode
    ; replicate the DC sample into all 16 bytes of the output block
    movd r3d, xmm5
    imul r3d, 0x01010101
    movd xmm5, r3d
    pshufd xmm5, xmm5, 0
    movdqa [r4], xmm5
    mov r5, arg6
    mov dword [r5], 0x02
    mov retrd, r2d
    POP_XMM
%ifdef X86_32
    pop r6
    pop r5
    pop r4
    pop r3
%endif
    ret

not_dc:
    cmp r3w, r6w
    jg near not_dc_h

    ; for H mode
    ; r0 is two rows below the block origin here; step back to row 0 and
    ; replicate each left-neighbour byte across its row (byte * 0x01010101)
    SSE_DB_1_2REG xmm6, xmm7
    sub r0, r1
    sub r0, r1
    movzx r6d, byte [r0-1]
    movd xmm0, r6d
    pmuludq xmm0, xmm6

    movzx r6d, byte [r0+r1-1]
    movd xmm1, r6d
    pmuludq xmm1, xmm6
    punpckldq xmm0, xmm1

    lea r0, [r0+r1*2]
    movzx r6d, byte [r0-1]
    movd xmm2, r6d
    pmuludq xmm2, xmm6

    movzx r6d, byte [r0+r1-1]
    movd xmm3, r6d
    pmuludq xmm3, xmm6
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2

    movdqa [r4],xmm0

    mov retrd, r3d
    mov r5, arg6
    mov dword [r5], 0x01
    POP_XMM
%ifdef X86_32
    pop r6
    pop r5
    pop r4
    pop r3
%endif
    ret

not_dc_h:
    ; for V mode
    ; step back to the row above the block and replicate its 4 bytes
    sub r0, r1
    sub r0, r1
    sub r0, r1
    movd xmm0, [r0]
    pshufd xmm0, xmm0, 0
    movdqa [r4],xmm0
    mov retrd, r6d
    mov r5, arg6
    mov dword [r5], 0x00
    POP_XMM
%ifdef X86_32
    pop r6
    pop r5
    pop r4
    pop r3
%endif
    ret


; SSE41_I16x16Get8WSumSub io, tmp1, tmp2
;   Expects xmm5 = HSumSubDB1, xmm6 = HSumSubDW1, xmm7 = PDW1.
;   Transforms the bytes in io into words (result << 2 left in io) and
;   adds intermediate dword sums into xmm4 for the DC value.
%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
    pmaddubsw %1, xmm5
    movdqa %2, %1
    pmaddwd %1, xmm7
    pmaddwd %2, xmm6
    movdqa %3, %1
    punpckldq %1, %2
    punpckhdq %2, %3
    movdqa %3, %1
    punpcklqdq %1, %2
    punpckhqdq %3, %2
    paddd xmm4, %1 ;for dc
    paddd xmm4, %3 ;for dc
    packssdw %1, %3
    psllw %1, 2
%endmacro

; SSE41_ChromaGet8WSumSub io, tmp1, tmp2, sums
;   Same transform as SSE41_I16x16Get8WSumSub, but instead of adding into
;   xmm4 it leaves the intermediate dword sums in the fourth operand.
%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 temp sse2 register
    pmaddubsw %1, xmm5
    movdqa %2, %1
    pmaddwd %1, xmm7
    pmaddwd %2, xmm6
    movdqa %3, %1
    punpckldq %1, %2
    punpckhdq %2, %3
    movdqa %3, %1
    punpcklqdq %1, %2
    punpckhqdq %3, %2
;    paddd xmm4, %1 ;for dc
;    paddd xmm4, %3 ;for dc
    movdqa %4, %1
    punpcklqdq %4, %3
    packssdw %1, %3
    psllw %1, 2
%endmacro

; SSE41_GetX38x4SatdDec
;   Loads four 8-byte rows from [r2] (stride r3), advances r2 by four
;   rows, widens to words and transforms them.
;   Results are left in xmm7, xmm1, xmm3, xmm2.
%macro SSE41_GetX38x4SatdDec 0
    pxor xmm7, xmm7
    movq xmm0, [r2]
    movq xmm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq xmm2, [r2]
    movq xmm3, [r2+r3]
    lea r2, [r2+2*r3]
    punpcklbw xmm0, xmm7
    punpcklbw xmm1, xmm7
    punpcklbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
    SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
    SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
    ;doesn't need another transpose
%endmacro

; SSE41_GetX38x4SatdV unused, offset
;   V cost: takes the stored top-row transform words at [r6+offset ...]
;   (one word into lanes 0 and 4 each time), subtracts the block
;   transform rows and accumulates the absolute values into xmm4.
;   The first macro operand is not referenced.
%macro SSE41_GetX38x4SatdV 2
    pxor xmm0, xmm0
    pinsrw xmm0, word[r6+%2], 0
    pinsrw xmm0, word[r6+%2+8], 4
    psubsw xmm0, xmm7
    pabsw xmm0, xmm0
    paddw xmm4, xmm0
    pxor xmm0, xmm0
    pinsrw xmm0, word[r6+%2+2], 0
    pinsrw xmm0, word[r6+%2+10], 4
    psubsw xmm0, xmm1
    pabsw xmm0, xmm0
    paddw xmm4, xmm0
    pxor xmm0, xmm0
    pinsrw xmm0, word[r6+%2+4], 0
    pinsrw xmm0, word[r6+%2+12], 4
    psubsw xmm0, xmm3
    pabsw xmm0, xmm0
    paddw xmm4, xmm0
    pxor xmm0, xmm0
    pinsrw xmm0, word[r6+%2+6], 0
    pinsrw xmm0, word[r6+%2+14], 4
    psubsw xmm0, xmm2
    pabsw xmm0, xmm0
    paddw xmm4, xmm0
%endmacro

; SSE41_GetX38x4SatdH rowgroup, unused, base
;   H cost: compares xmm7 against the 8 stored bytes at
;   [r6+base+8*rowgroup] and accumulates into xmm5. Also replaces
;   xmm1..xmm3 by their absolute values and leaves their sum in xmm2,
;   which the DC macros reuse. The second macro operand is not referenced.
%macro SSE41_GetX38x4SatdH 3
    movq xmm0, [r6+%3+8*%1]
    punpcklqdq xmm0, xmm0
    psubsw xmm0, xmm7
    pabsw xmm0, xmm0
    paddw xmm5, xmm0
    pabsw xmm1, xmm1
    pabsw xmm2, xmm2
    pabsw xmm3, xmm3
    paddw xmm2, xmm1;for DC
    paddw xmm2, xmm3;for DC
    paddw xmm5, xmm2
%endmacro

; SSE41_I16X16GetX38x4SatdDC
;   DC cost: compares xmm7 against the DC term held in mm4 and
;   accumulates, together with the shared sum in xmm2, into xmm6.
%macro SSE41_I16X16GetX38x4SatdDC 0
    pxor xmm0, xmm0
    movq2dq xmm0, mm4
    punpcklqdq xmm0, xmm0
    psubsw xmm0, xmm7
    pabsw xmm0, xmm0
    paddw xmm6, xmm0
    paddw xmm6, xmm2
%endmacro

; SSE41_ChromaGetX38x4SatdDC idx
;   DC cost for chroma: the DC terms are read from [r6+32+16*idx].
;   NOTE: the operand register itself is shifted left by 4 (modified).
%macro SSE41_ChromaGetX38x4SatdDC 1
    shl %1, 4
    movdqa xmm0, [r6+32+%1]
    psubsw xmm0, xmm7
    pabsw xmm0, xmm0
    paddw xmm6, xmm0
    paddw xmm6, xmm2
%endmacro

; One 8x4 step of the 16x16 search: V -> xmm4, H -> xmm5, DC -> xmm6.
%macro SSE41_I16x16GetX38x4Satd 2
    SSE41_GetX38x4SatdDec
    SSE41_GetX38x4SatdV %1, %2
    SSE41_GetX38x4SatdH %1, %2, 32
    SSE41_I16X16GetX38x4SatdDC
%endmacro

; One 8x4 step of the chroma search: V -> xmm4, H -> xmm5, DC -> xmm6.
%macro SSE41_ChromaGetX38x4Satd 2
    SSE41_GetX38x4SatdDec
    SSE41_GetX38x4SatdV %1, %2
    SSE41_GetX38x4SatdH %1, %2, 16
    SSE41_ChromaGetX38x4SatdDC %1
%endmacro

; SSE41_HSum8W acc, ones, tmp
;   Sums the eight words of acc into its low dword; `ones` must hold
;   eight words of 1 (used as the pmaddwd multiplier).
%macro SSE41_HSum8W 3
    pmaddwd %1, %2
    movhlps %3, %1
    paddd %1, %3
    pshuflw %3, %1,0Eh
    paddd %1, %3
%endmacro

;***********************************************************************
; WelsIntra16x16Combined3Satd_sse41 (seven arguments, loaded by LOAD_7_PARA)
;   r0/r1 : block whose top row / left column supply the boundary
;           samples, and its stride
;   r2/r3 : 16x16 source block and its stride
;   arg5  : int32 destination for the chosen mode: 2 = DC, 1 = H, 0 = V
;   r5    : cost bias; twice its value is added to the H and DC costs
;   r6    : scratch buffer, 64 bytes written with movdqa (16-byte aligned)
; Returns the smallest of the three costs.
; Uses mm4 and executes WELSEMMS before returning.
;***********************************************************************
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
    %assign push_num 0
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    ; remember the source pointer for the right 8 columns
    push r12
    mov r12, r2
%endif

    pxor xmm4, xmm4 ; DC accumulator used by SSE41_I16x16Get8WSumSub
%ifdef X86_32_PICASM
    ; build HSumSubDB1 / HSumSubDW1 / PDW1 on the stack instead of
    ; referencing .rodata
    push 0xff01ff01
    push 0xff01ff01
    push 0x01010101
    push 0x01010101
    movdqu xmm5, [esp]
    push 0xffff0001
    push 0xffff0001
    push 0xffff0001
    push 0xffff0001
    movdqu xmm6, [esp]
    push 0x00010001
    push 0x00010001
    push 0x00010001
    push 0x00010001
    movdqu xmm7, [esp]
    add esp, 48
%else
    movdqa xmm5, [HSumSubDB1]
    movdqa xmm6, [HSumSubDW1]
    movdqa xmm7, [PDW1]
%endif

    ; top row (16 bytes above the block)
    sub r0, r1
    movdqu xmm0, [r0]
    movhlps xmm1, xmm0
    punpcklqdq xmm0, xmm0
    punpcklqdq xmm1, xmm1
    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
    movdqa [r6], xmm0 ;V
    movdqa [r6+16], xmm1

    ; left column (16 bytes, one per row)
    add r0, r1
    pinsrb xmm0, byte[r0-1], 0
    pinsrb xmm0, byte[r0+r1-1], 1
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 2
    pinsrb xmm0, byte[r0+r1-1], 3
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 4
    pinsrb xmm0, byte[r0+r1-1], 5
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 6
    pinsrb xmm0, byte[r0+r1-1], 7
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 8
    pinsrb xmm0, byte[r0+r1-1], 9
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 10
    pinsrb xmm0, byte[r0+r1-1], 11
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 12
    pinsrb xmm0, byte[r0+r1-1], 13
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 14
    pinsrb xmm0, byte[r0+r1-1], 15
    movhlps xmm1, xmm0
    punpcklqdq xmm0, xmm0
    punpcklqdq xmm1, xmm1
    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
    movdqa [r6+32], xmm0 ;H
    movdqa [r6+48], xmm1

    movd r0d, xmm4 ;dc
    add r0d, 16 ;(sum+16)
    shr r0d, 5 ;((sum+16)>>5)
    shl r0d, 4 ;
    movd mm4, r0d ; mm4 copy DC
    pxor xmm4, xmm4 ;V
    pxor xmm5, xmm5 ;H
    pxor xmm6, xmm6 ;DC
%ifdef UNIX64
    push r4
%endif
    mov r0, 0 ; r0 = 8x4 row-group index (0..3)
    mov r4, 0 ; r4 = offset into the stored top-row data (0, then 16)

.loop16x16_get_satd:
.loopStart1:
    SSE41_I16x16GetX38x4Satd r0, r4
    inc r0
    cmp r0, 4
    jl .loopStart1
    cmp r4, 16
    je .loop16x16_get_satd_end
    ; second pass: right 8 columns of the source block
%ifdef X86_32
    mov r2, arg3
%else
    mov r2, r12
%endif
    add r2, 8
    mov r0, 0
    add r4, 16
    jmp .loop16x16_get_satd
.loop16x16_get_satd_end:
    MMX_DW_1_2REG xmm0, xmm1
    psrlw xmm4, 1 ;/2
    psrlw xmm5, 1 ;/2
    psrlw xmm6, 1 ;/2
    SSE41_HSum8W xmm4, xmm0, xmm1
    SSE41_HSum8W xmm5, xmm0, xmm1
    SSE41_HSum8W xmm6, xmm0, xmm1

%ifdef UNIX64
    pop r4
%endif
    ; comparing order: DC H V
    movd r3d, xmm6 ;DC
    movd r1d, xmm5 ;H
    movd r0d, xmm4 ;V
%ifndef X86_32
    pop r12
%endif
    shl r5d, 1
    add r1d, r5d
    add r3d, r5d
    mov r4, arg5
    cmp r3d, r1d
    jge near not_dc_16x16
    cmp r3d, r0d
    jge near not_dc_h_16x16

    ; for DC mode
    mov dword[r4], 2;I16_PRED_DC
    mov retrd, r3d
    jmp near return_satd_intra_16x16_x3
not_dc_16x16:
    ; for H mode
    cmp r1d, r0d
    jge near not_dc_h_16x16
    mov dword[r4], 1;I16_PRED_H
    mov retrd, r1d
    jmp near return_satd_intra_16x16_x3
not_dc_h_16x16:
    ; for V mode
    mov dword[r4], 0;I16_PRED_V
    mov retrd, r0d
return_satd_intra_16x16_x3:
    WELSEMMS
    POP_XMM
    LOAD_7_PARA_POP
    ret

; SSE41_ChromaGetX38x8Satd
;   One 8x8 chroma plane: boundary samples come from around [r0]
;   (stride r1), the source block from [r2] (stride r3), and r6 points at
;   a 64-byte scratch buffer written with movdqa.
;   Leaves the V / H / DC accumulators in xmm4 / xmm5 / xmm6.
;   Clobbers r0, r2 and xmm0-xmm7.
%macro SSE41_ChromaGetX38x8Satd 0
    movdqa xmm5, [HSumSubDB1]
    movdqa xmm6, [HSumSubDW1]
    movdqa xmm7, [PDW1]
    ; top row
    sub r0, r1
    movq xmm0, [r0]
    punpcklqdq xmm0, xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
    movdqa [r6], xmm0 ;V
    ; left column
    add r0, r1
    pinsrb xmm0, byte[r0-1], 0
    pinsrb xmm0, byte[r0+r1-1], 1
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 2
    pinsrb xmm0, byte[r0+r1-1], 3
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 4
    pinsrb xmm0, byte[r0+r1-1], 5
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 6
    pinsrb xmm0, byte[r0+r1-1], 7
    punpcklqdq xmm0, xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
    movdqa [r6+16], xmm0 ;H
;(sum+2)>>2
    movdqa xmm6, [PDQ2]
    movdqa xmm5, xmm4
    punpckhqdq xmm5, xmm1
    paddd xmm5, xmm6
    psrld xmm5, 2
;(sum1+sum2+4)>>3
    paddd xmm6, xmm6
    paddd xmm4, xmm1
    paddd xmm4, xmm6
    psrld xmm4, 3
;satd *16
    pslld xmm5, 4
    pslld xmm4, 4
;temp satd
    movdqa xmm6, xmm4
    punpcklqdq xmm4, xmm5
    psllq xmm4, 32
    psrlq xmm4, 32
    movdqa [r6+32], xmm4
    punpckhqdq xmm5, xmm6
    psllq xmm5, 32
    psrlq xmm5, 32
    movdqa [r6+48], xmm5

    pxor xmm4, xmm4 ;V
    pxor xmm5, xmm5 ;H
    pxor xmm6, xmm6 ;DC
    mov r0, 0
    SSE41_ChromaGetX38x4Satd r0, 0
    inc r0
    SSE41_ChromaGetX38x4Satd r0, 0
%endmacro

; SSEReg2MMX src, mmLo, mmHi
;   Copies the low / high qword of src into two MMX registers
;   (src ends up holding its high qword in both halves).
%macro SSEReg2MMX 3
    movdq2q %2, %1
    movhlps %1, %1
    movdq2q %3, %1
%endmacro

; MMXReg2SSE dst, tmp, mmLo, mmHi
;   Rebuilds a 128-bit value in dst from two MMX registers.
%macro MMXReg2SSE 4
    movq2dq %1, %3
    movq2dq %2, %4
    punpcklqdq %1, %2
%endmacro

;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41

;***********************************************************************
; WelsIntraChroma8x8Combined3Satd_sse41 (nine arguments)
;   r0/r1 : first-plane block supplying the boundary samples, and stride
;   r2/r3 : first-plane 8x8 source block, and stride
;   r4    : int32 destination for the chosen mode: 0 = DC, 1 = H, 2 = V
;   r5    : cost bias; twice its value is added to the H and V costs
;   r6    : scratch buffer, 64 bytes written with movdqa (16-byte aligned)
;   arg8  : second-plane block supplying the boundary samples
;   arg9  : second-plane 8x8 source block
;   The second plane reuses the strides in r1 and r3.
; Returns the smallest of the three costs summed over both planes.
; The first plane's sums are parked in mm0-mm3, mm5, mm6 while the second
; plane is processed; WELSEMMS is executed before returning.
;***********************************************************************
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
    %assign push_num 0
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
loop_chroma_satdx3:
%ifdef X86_32_PICASM
    ; PIC build: materialise the constants on an aligned stack area, then
    ; run an inline copy of SSE41_ChromaGetX38x8Satd
    mov r0, esp
    and esp, 0xfffffff0
    push 0xff01ff01
    push 0xff01ff01
    push 0x01010101
    push 0x01010101
    movdqa xmm5, [esp]
    push 0xffff0001
    push 0xffff0001
    push 0xffff0001
    push 0xffff0001
    movdqa xmm6, [esp]
    push 0x00010001
    push 0x00010001
    push 0x00010001
    push 0x00010001
    movdqa xmm7, [esp]
    mov esp, r0
    mov r0, [esp + push_num*4 + 4] ; reload the first argument

    sub r0, r1
    movq xmm0, [r0]
    punpcklqdq xmm0, xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
    movdqa [r6], xmm0 ;V
    add r0, r1
    pinsrb xmm0, byte[r0-1], 0
    pinsrb xmm0, byte[r0+r1-1], 1
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 2
    pinsrb xmm0, byte[r0+r1-1], 3
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 4
    pinsrb xmm0, byte[r0+r1-1], 5
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 6
    pinsrb xmm0, byte[r0+r1-1], 7
    punpcklqdq xmm0, xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
    ;movdqa [r6+16], xmm0 ;H
;(sum+2)>>2
    ; build the PDQ2 constant {2,0,2,0} in the scratch buffer
    mov DWORD [r6+16], 0x0002
    mov DWORD [r6+20], 0x0000
    mov DWORD [r6+24], 0x0002
    mov DWORD [r6+28], 0x0000
    movdqa xmm6, [r6+16]
    movdqa [r6+16], xmm0 ;H

    movdqa xmm5, xmm4
    punpckhqdq xmm5, xmm1
    paddd xmm5, xmm6
    psrld xmm5, 2
;(sum1+sum2+4)>>3
    paddd xmm6, xmm6
    paddd xmm4, xmm1
    paddd xmm4, xmm6
    psrld xmm4, 3
;satd *16
    pslld xmm5, 4
    pslld xmm4, 4
;temp satd
    movdqa xmm6, xmm4
    punpcklqdq xmm4, xmm5
    psllq xmm4, 32
    psrlq xmm4, 32
    movdqa [r6+32], xmm4
    punpckhqdq xmm5, xmm6
    psllq xmm5, 32
    psrlq xmm5, 32
    movdqa [r6+48], xmm5

    pxor xmm4, xmm4 ;V
    pxor xmm5, xmm5 ;H
    pxor xmm6, xmm6 ;DC
    mov r0, 0
    SSE41_ChromaGetX38x4Satd r0, 0
    inc r0
    SSE41_ChromaGetX38x4Satd r0, 0
%else
    SSE41_ChromaGetX38x8Satd
%endif
    ; park the first plane's V / H / DC sums in MMX registers
    SSEReg2MMX xmm4, mm0,mm1
    SSEReg2MMX xmm5, mm2,mm3
    SSEReg2MMX xmm6, mm5,mm6

    ; second plane
    mov r0, arg8
    mov r2, arg9

%ifdef X86_32_PICASM
    mov r0, esp
    and esp, 0xfffffff0
    push 0xff01ff01
    push 0xff01ff01
    push 0x01010101
    push 0x01010101
    movdqa xmm5, [esp]
    push 0xffff0001
    push 0xffff0001
    push 0xffff0001
    push 0xffff0001
    movdqa xmm6, [esp]
    push 0x00010001
    push 0x00010001
    push 0x00010001
    push 0x00010001
    movdqa xmm7, [esp]
    mov esp, r0
    mov r0, arg8

    sub r0, r1
    movq xmm0, [r0]
    punpcklqdq xmm0, xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
    movdqa [r6], xmm0 ;V
    add r0, r1
    pinsrb xmm0, byte[r0-1], 0
    pinsrb xmm0, byte[r0+r1-1], 1
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 2
    pinsrb xmm0, byte[r0+r1-1], 3
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 4
    pinsrb xmm0, byte[r0+r1-1], 5
    lea r0, [r0+2*r1]
    pinsrb xmm0, byte[r0-1], 6
    pinsrb xmm0, byte[r0+r1-1], 7
    punpcklqdq xmm0, xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
    ;movdqa [r6+16], xmm0 ;H
;(sum+2)>>2
    mov DWORD [r6+16], 0x0002
    mov DWORD [r6+20], 0x0000
    mov DWORD [r6+24], 0x0002
    mov DWORD [r6+28], 0x0000
    movdqa xmm6, [r6+16]
    movdqa [r6+16], xmm0 ;H

    movdqa xmm5, xmm4
    punpckhqdq xmm5, xmm1
    paddd xmm5, xmm6
    psrld xmm5, 2
;(sum1+sum2+4)>>3
    paddd xmm6, xmm6
    paddd xmm4, xmm1
    paddd xmm4, xmm6
    psrld xmm4, 3
;satd *16
    pslld xmm5, 4
    pslld xmm4, 4
;temp satd
    movdqa xmm6, xmm4
    punpcklqdq xmm4, xmm5
    psllq xmm4, 32
    psrlq xmm4, 32
    movdqa [r6+32], xmm4
    punpckhqdq xmm5, xmm6
    psllq xmm5, 32
    psrlq xmm5, 32
    movdqa [r6+48], xmm5

    pxor xmm4, xmm4 ;V
    pxor xmm5, xmm5 ;H
    pxor xmm6, xmm6 ;DC
    mov r0, 0
    SSE41_ChromaGetX38x4Satd r0, 0
    inc r0
    SSE41_ChromaGetX38x4Satd r0, 0
%else
    SSE41_ChromaGetX38x8Satd
%endif

    ; add the first plane's sums back in
    MMXReg2SSE xmm0, xmm3, mm0, mm1
    MMXReg2SSE xmm1, xmm3, mm2, mm3
    MMXReg2SSE xmm2, xmm3, mm5, mm6

    paddw xmm4, xmm0
    paddw xmm5, xmm1
    paddw xmm6, xmm2

    MMX_DW_1_2REG xmm0, xmm1
    psrlw xmm4, 1 ;/2
    psrlw xmm5, 1 ;/2
    psrlw xmm6, 1 ;/2
    SSE41_HSum8W xmm4, xmm0, xmm1
    SSE41_HSum8W xmm5, xmm0, xmm1
    SSE41_HSum8W xmm6, xmm0, xmm1
    ; comparing order: DC H V
    movd r3d, xmm6 ;DC
    movd r1d, xmm5 ;H
    movd r0d, xmm4 ;V
    shl r5d, 1
    add r1d, r5d
    add r0d, r5d
    cmp r3d, r1d
    jge near not_dc_8x8
    cmp r3d, r0d
    jge near not_dc_h_8x8

    ; for DC mode
    mov dword[r4], 0;I8_PRED_DC
    mov retrd, r3d
    jmp near return_satd_intra_8x8_x3
not_dc_8x8:
    ; for H mode
    cmp r1d, r0d
    jge near not_dc_h_8x8
    mov dword[r4], 1;I8_PRED_H
    mov retrd, r1d
    jmp near return_satd_intra_8x8_x3
not_dc_h_8x8:
    ; for V mode
    mov dword[r4], 2;I8_PRED_V
    mov retrd, r0d
return_satd_intra_8x8_x3:
    WELSEMMS
    POP_XMM
    LOAD_7_PARA_POP
    ret


;***********************************************************************
;
;Pixel_satd_intra_sse2 END
;
;***********************************************************************

; SSSE3_Get16BSadHVDC predRow, srcRow
;   Expects xmm1 = 0 (pshufb mask that broadcasts byte 0), xmm5 = top row
;   (V predictor) and xmm7 = DC predictor.
;   Broadcasts the byte stored at predRow into a full 16-byte row, writes
;   that row back to predRow (this is the H prediction), and accumulates
;   the SADs of srcRow against DC / V / H into xmm4 / xmm2 / xmm3.
%macro SSSE3_Get16BSadHVDC 2
    movd xmm6,%1
    pshufb xmm6,xmm1
    movdqa %1, xmm6
    movdqa xmm0,%2
    psadbw xmm0,xmm7
    paddw xmm4,xmm0
    movdqa xmm0,%2
    psadbw xmm0,xmm5
    paddw xmm2,xmm0
    psadbw xmm6,%2
    paddw xmm3,xmm6
%endmacro

; WelsAddDCValue srcByte, tmp, dst, acc
;   tmp <- zero-extended byte at srcByte; dst <- tmp; acc += tmp.
%macro WelsAddDCValue 4
    movzx %2, byte %1
    mov %3, %2
    add %4, %2
%endmacro

;***********************************************************************
;
;Pixel_sad_intra_ssse3 BEGIN
;
;***********************************************************************

;***********************************************************************
; WelsIntra16x16Combined3Sad_ssse3 (seven arguments, loaded by LOAD_7_PARA)
;   r0/r1 : block supplying the boundary samples, and its stride; the row
;           above it is read with movdqa, so it must be 16-byte aligned
;   r2/r3 : 16x16 source block and its stride; rows are used as aligned
;           memory operands
;   r4    : int32 destination for the chosen mode: 2 = DC, 1 = H, 0 = V
;   r5    : cost bias; twice its value is added to the H and DC costs
;   r6    : 256-byte, 16-byte aligned buffer that receives the chosen
;           16x16 prediction
; Returns the smallest of the three SAD costs.
;***********************************************************************
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
    %assign push_num 0
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d

    ; r3, r4 and r5 are reused as scratch while the boundary is gathered
    push r5
    push r4
    push r3

    ; top row -> xmm5, and its byte sum -> r5d
    sub r0, r1
    movdqa xmm5,[r0]
    pxor xmm0,xmm0
    psadbw xmm0,xmm5
    movhlps xmm1,xmm0
    paddw xmm0,xmm1
    movd r5d, xmm0

    ; left column: store each byte at the start of its output row and add
    ; it to the DC sum
    add r0,r1
    lea r3,[r1+2*r1] ; r3 = 3 * stride
    WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d ; r4d = scratch, r5d = running sum
    WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
    WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
    lea r0, [r0+4*r1]
    add r6, 64
    WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
    WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
    WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
    lea r0, [r0+4*r1]
    add r6, 64
    WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
    WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
    WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
    lea r0, [r0+4*r1]
    add r6, 64
    WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
    WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
    WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
    sub r6, 192

    ; DC = (sum + 16) >> 5, broadcast to all 16 bytes of xmm7
    add r5d,10h
    shr r5d,5
    movd xmm7,r5d
    pxor xmm1,xmm1
    pshufb xmm7,xmm1
    pxor xmm4,xmm4 ; DC SAD
    pxor xmm3,xmm3 ; H SAD
    pxor xmm2,xmm2 ; V SAD
    ;sad begin
    pop r3
    lea r4, [r3+2*r3] ; r4 = 3 * source stride
    SSSE3_Get16BSadHVDC [r6], [r2]
    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
    add r6, 64
    lea r2, [r2+4*r3]
    SSSE3_Get16BSadHVDC [r6], [r2]
    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
    add r6, 64
    lea r2, [r2+4*r3]
    SSSE3_Get16BSadHVDC [r6], [r2]
    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
    add r6, 64
    lea r2, [r2+4*r3]
    SSSE3_Get16BSadHVDC [r6], [r2]
    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]

    pop r4
    pop r5
    ; pack the H sums above the V sums, then fold the two 64-bit halves
    pslldq xmm3,4
    por xmm3,xmm2
    movhlps xmm1,xmm3
    paddw xmm3,xmm1
    movhlps xmm0,xmm4
    paddw xmm4,xmm0
    ; comparing order: DC H V
    movd r1d, xmm4 ;DC
    movd r0d, xmm3 ;V
    psrldq xmm3, 4
    movd r2d, xmm3 ;H

    ; r5 = lambda
    shl r5d, 1
    add r2d, r5d
    add r1d, r5d
    ; r4 = best-mode pointer
    cmp r1d, r2d
    jge near not_dc_16x16_sad
    cmp r1d, r0d
    jge near not_dc_h_16x16_sad
    ; for DC mode
    mov dword[r4], 2;I16_PRED_DC
    mov retrd, r1d
    ; overwrite the buffer with 16 rows of the DC value
    sub r6, 192
%assign x 0
%rep 16
    movdqa [r6+16*x], xmm7
%assign x x+1
%endrep
    jmp near return_sad_intra_16x16_x3
not_dc_16x16_sad:
    ; for H mode
    ; the buffer already holds the H prediction written during the SAD pass
    cmp r2d, r0d
    jge near not_dc_h_16x16_sad
    mov dword[r4], 1;I16_PRED_H
    mov retrd, r2d
    jmp near return_sad_intra_16x16_x3
not_dc_h_16x16_sad:
    ; for V mode
    mov dword[r4], 0;I16_PRED_V
    mov retrd, r0d
    ; overwrite the buffer with 16 copies of the top row
    sub r6, 192
%assign x 0
%rep 16
    movdqa [r6+16*x], xmm5
%assign x x+1
%endrep
return_sad_intra_16x16_x3:
    POP_XMM
    LOAD_7_PARA_POP
    ret

;***********************************************************************
;
;Pixel_sad_intra_ssse3 END
;
;***********************************************************************
;***********************************************************************
;
;Pixel_satd_wxh_sse41 BEGIN
;
;***********************************************************************

;SSE4.1
; SSE41_GetSatd8x4
;   Processes four 8-byte rows from [r0] (stride r1, r4 = 3*r1) and [r2]
;   (stride r3, r5 = 3*r3). xmm7 must hold HSumSubDB1.
;   Accumulates into xmm6; clobbers xmm0-xmm5. r0 and r2 are not advanced.
%macro SSE41_GetSatd8x4 0
    movq xmm0, [r0]
    punpcklqdq xmm0, xmm0
    pmaddubsw xmm0, xmm7
    movq xmm1, [r0+r1]
    punpcklqdq xmm1, xmm1
    pmaddubsw xmm1, xmm7
    movq xmm2, [r2]
    punpcklqdq xmm2, xmm2
    pmaddubsw xmm2, xmm7
    movq xmm3, [r2+r3]
    punpcklqdq xmm3, xmm3
    pmaddubsw xmm3, xmm7
    psubsw xmm0, xmm2
    psubsw xmm1, xmm3
    movq xmm2, [r0+2*r1]
    punpcklqdq xmm2, xmm2
    pmaddubsw xmm2, xmm7
    movq xmm3, [r0+r4]
    punpcklqdq xmm3, xmm3
    pmaddubsw xmm3, xmm7
    movq xmm4, [r2+2*r3]
    punpcklqdq xmm4, xmm4
    pmaddubsw xmm4, xmm7
    movq xmm5, [r2+r5]
    punpcklqdq xmm5, xmm5
    pmaddubsw xmm5, xmm7
    psubsw xmm2, xmm4
    psubsw xmm3, xmm5
    SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
    pabsw xmm0, xmm0
    pabsw xmm2, xmm2
    pabsw xmm1, xmm1
    pabsw xmm3, xmm3
    ; pair up neighbouring words of two registers and keep the larger one
    movdqa xmm4, xmm3
    pblendw xmm3, xmm1, 0xAA
    pslld xmm1, 16
    psrld xmm4, 16
    por xmm1, xmm4
    pmaxuw xmm1, xmm3
    paddw xmm6, xmm1
    movdqa xmm4, xmm0
    pblendw xmm0, xmm2, 0xAA
    pslld xmm2, 16
    psrld xmm4, 16
    por xmm2, xmm4
    pmaxuw xmm0, xmm2
    paddw xmm6, xmm0
%endmacro

; SSSE3_SumWHorizon dst32, src, tmp1, tmp2
;   dst32 <- sum of the eight words of src. src, tmp1, tmp2 are clobbered.
%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
    MMX_DW_1_2REG %3, %4
    pmaddwd %2, %3
    movhlps %4, %2
    paddd %2, %4
    pshuflw %4, %2,0Eh
    paddd %2, %4
    movd %1, %2
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
; In:  r0/r1 = first 4x4 block and stride, r2/r3 = second block and stride.
; Out: retrd = accumulated cost.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse41
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
%ifdef X86_32_PICASM
    ; build HSwapSumSubDB1 on the stack
    push 0xff01ff01
    push 0x01010101
    push 0xff01ff01
    push 0x01010101
    movdqu xmm4, [esp]
    add esp, 16
%else
    movdqa xmm4,[HSwapSumSubDB1]
%endif
    ; each 4-byte row is broadcast into one 64-bit half of a register
    movd xmm2,[r2]
    movd xmm5,[r2+r3]
    shufps xmm2,xmm5,0
    movd xmm3,[r2+r3*2]
    lea r2, [r3*2+r2]
    movd xmm5,[r2+r3]
    shufps xmm3,xmm5,0
    movd xmm0,[r0]
    movd xmm5,[r0+r1]
    shufps xmm0,xmm5,0
    movd xmm1,[r0+r1*2]
    lea r0, [r1*2+r0]
    movd xmm5,[r0+r1]
    shufps xmm1,xmm5,0
    pmaddubsw xmm0,xmm4
    pmaddubsw xmm1,xmm4
    pmaddubsw xmm2,xmm4
    pmaddubsw xmm3,xmm4
    psubw xmm0,xmm2
    psubw xmm1,xmm3
    movdqa xmm2,xmm0
    paddw xmm0,xmm1
    psubw xmm1,xmm2
    movdqa xmm2,xmm0
    punpcklqdq xmm0,xmm1
    punpckhqdq xmm2,xmm1
    movdqa xmm1,xmm0
    paddw xmm0,xmm2
    psubw xmm2,xmm1
    movdqa xmm1,xmm0
    pblendw xmm0,xmm2,0AAh
    pslld xmm2,16
    psrld xmm1,16
    por xmm2,xmm1
    pabsw xmm0,xmm0
    pabsw xmm2,xmm2
    pmaxsw xmm0,xmm2
    SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
; In:  r0/r1 = first block and stride, r2/r3 = second block and stride.
; Out: retrd = accumulated cost over two 8x4 steps.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse41
%ifdef X86_32
    push r4
    push r5
%endif
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
%ifdef X86_32_PICASM
    ; build HSumSubDB1 on the stack
    push 0xff01ff01
    push 0xff01ff01
    push 0x01010101
    push 0x01010101
    movdqu xmm7, [esp]
    add esp, 16
%else
    movdqa xmm7, [HSumSubDB1]
%endif
    lea r4, [r1+r1*2] ; r4 = 3 * stride of the first block
    lea r5, [r3+r3*2] ; r5 = 3 * stride of the second block
    pxor xmm6, xmm6 ; accumulator
    SSE41_GetSatd8x4
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
    SSE41_GetSatd8x4
    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop r5
    pop r4
%endif
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
; Same contract as WelsSampleSatd8x8_sse41, over four 8x4 steps.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse41
%ifdef X86_32
    push r4
    push r5
    push r6
%endif
    %assign push_num 3
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
%ifdef X86_32_PICASM
    ; build HSumSubDB1 on the stack
    push 0xff01ff01
    push 0xff01ff01
    push 0x01010101
    push 0x01010101
    movdqu xmm7, [esp]
    add esp, 16
%else
    movdqa xmm7, [HSumSubDB1]
%endif
    lea r4, [r1+r1*2] ; r4 = 3 * stride of the first block
    lea r5, [r3+r3*2] ; r5 = 3 * stride of the second block
    pxor xmm6, xmm6 ; accumulator
    mov r6, 0 ; r6 = 8x4 step counter (0..3)
loop_get_satd_8x16:
    SSE41_GetSatd8x4
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
    inc r6
    cmp r6, 4
    jl loop_get_satd_8x16
    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop r6
    pop r5
    pop r4
%endif
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
; Same contract as WelsSampleSatd8x8_sse41, over the left 8 columns and
; then the right 8 columns (two 8x4 steps each).
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse41
%ifdef X86_32
    push r4
    push r5
%endif
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    ; keep the block origins so the right half can restart from row 0
    push r0
    push r2

%ifdef X86_32_PICASM
    ; build HSumSubDB1 on the stack
    push 0xff01ff01
    push 0xff01ff01
    push 0x01010101
    push 0x01010101
    movdqu xmm7, [esp]
    add esp, 16
%else
    movdqa xmm7, [HSumSubDB1]
%endif
    lea r4, [r1+r1*2] ; r4 = 3 * stride of the first block
    lea r5, [r3+r3*2] ; r5 = 3 * stride of the second block
    pxor xmm6, xmm6 ; accumulator
    SSE41_GetSatd8x4
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
    SSE41_GetSatd8x4

    pop r2
    pop r0
    add r0, 8
    add r2, 8
    SSE41_GetSatd8x4
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
    SSE41_GetSatd8x4
    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop r5
    pop r4
%endif
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t,
uint8_t *, int32_t, );
;
; Left 8 columns are processed in four SSE41_GetSatd8x4 iterations, then the
; saved row pointers are restored, moved 8 bytes right, and the right 8
; columns are processed the same way. xmm6 is summed into retrd at the end.
; NOTE(review): SSE41_GetSatd8x4 is defined outside this chunk; it presumably
; accumulates into xmm6 - confirm.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x16_sse41
%ifdef X86_32
    push        r4
    push        r5
    push        r6
%endif
    %assign push_num 3
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    push        r0                  ; keep the start-of-block pointers for the right half
    push        r2

    ; xmm7 = eight bytes of 1 followed by 1,-1 x4 (same bytes as HSumSubDB1).
%ifdef X86_32_PICASM
    push        0xff01ff01
    push        0xff01ff01
    push        0x01010101
    push        0x01010101
    movdqu      xmm7, [esp]
    add         esp, 16
%else
    movdqa      xmm7, [HSumSubDB1]
%endif
    lea         r4, [r1+r1*2]       ; r4 = 3 * r1
    lea         r5, [r3+r3*2]       ; r5 = 3 * r3
    pxor        xmm6, xmm6
    mov         r6, 0               ; r6 = iteration counter, runs 0..3
loop_get_satd_16x16_left:
    SSE41_GetSatd8x4
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    inc         r6
    cmp         r6, 4
    jl          loop_get_satd_16x16_left

    ; back to row 0, eight bytes to the right
    pop         r2
    pop         r0
    add         r0, 8
    add         r2, 8
    mov         r6, 0
loop_get_satd_16x16_right:
    SSE41_GetSatd8x4
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    inc         r6
    cmp         r6, 4
    jl          loop_get_satd_16x16_right
    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop         r6
    pop         r5
    pop         r4
%endif
    ret

;***********************************************************************
;
;Pixel_satd_wxh_sse41 END
;
;***********************************************************************

;***********************************************************************
;
;Pixel_satd_wxh_avx2 BEGIN
;
;***********************************************************************

%ifdef HAVE_AVX2
; Loads one 16-pixel row from each source into both 128-bit lanes, applies the
; +1/-1 byte pattern in %4 and subtracts the two results.
; out=%1 pSrcA=%2 pSrcB=%3 HSumSubDB1_256=%4 ymm_clobber=%5
%macro AVX2_LoadDiffSatd16x1 5
    vbroadcasti128 %1, [%2]
    vpmaddubsw  %1, %1, %4          ; hadamard neighboring horizontal sums and differences
    vbroadcasti128 %5, [%3]
    vpmaddubsw  %5, %5, %4          ; hadamard neighboring horizontal sums and differences
    vpsubw      %1, %1, %5          ; diff srcA srcB
%endmacro

; Loads two 8-pixel rows per source (the second from the "+4*iStride" address),
; one row per 128-bit lane, each broadcast to both qwords of its lane.
; out=%1 pSrcA=%2 pSrcA+4*iStride=%3 pSrcB=%4 pSrcB+4*iStride=%5 HSumSubDB1_128x2=%6 ymm_clobber=%7,%8
%macro AVX2_LoadDiffSatd8x2 8
    vpbroadcastq %1, [%2]
    vpbroadcastq %7, [%3]
    vpblendd    %1, %1, %7, 11110000b   ; low lane from [%2], high lane from [%3]
    vpmaddubsw  %1, %1, %6          ; hadamard neighboring horizontal sums and differences
    vpbroadcastq %7, [%4]
    vpbroadcastq %8, [%5]
    vpblendd    %7, %7, %8, 11110000b   ; low lane from [%4], high lane from [%5]
    vpmaddubsw  %7, %7, %6          ; hadamard neighboring horizontal sums and differences
    vpsubw      %1, %1, %7          ; diff srcA srcB
%endmacro

; Two-stage add/subtract butterfly across the four input registers.
; in/out=%1,%2,%3,%4 clobber=%5
%macro AVX2_HDMFour4x4 5
    vpsubw      %5, %1, %4          ; s3 = x0 - x3
    vpaddw      %1, %1, %4          ; s0 = x0 + x3
    vpsubw      %4, %2, %3          ; s2 = x1 - x2
    vpaddw      %2, %2, %3          ; s1 = x1 + x2
    vpsubw      %3, %1, %2          ; y2 = s0 - s1
    vpaddw      %1, %1, %2          ; y0 = s0 + s1
    vpaddw      %2, %5, %4          ; y1 = s3 + s2
    vpsubw      %4, %5, %4          ; y3 = s3 - s2
%endmacro

; Butterfly, absolute values, then the max-based final stage; the per-word
; results of both register pairs are added into %1.
; out=%1 in=%1,%2,%3,%4 clobber=%5
%macro AVX2_SatdFour4x4 5
    AVX2_HDMFour4x4 %1, %2, %3, %4, %5
    vpabsw      %1, %1
    vpabsw      %2, %2
    vpabsw      %3, %3
    vpabsw      %4, %4
    ; second stage of horizontal hadamard.
    ; utilizes that |a + b| + |a - b| = 2 * max(|a|, |b|)
    vpblendw    %5, %1, %2, 10101010b
    vpslld      %2, %2, 16
    vpsrld      %1, %1, 16
    vpor        %2, %2, %1
    vpmaxuw     %2, %2, %5
    vpblendw    %5, %3, %4, 10101010b
    vpslld      %4, %4, 16
    vpsrld      %3, %3, 16
    vpor        %4, %4, %3
    vpmaxuw     %3, %5, %4
    vpaddw      %1, %2, %3
%endmacro

; Four 16-pixel rows (offsets 0, 1, 2 strides and the 3*stride argument).
; out=%1 pSrcA=%2 iStrideA=%3 3*iStrideA=%4 pSrcB=%5 iStrideB=%6 3*iStrideB=%7 HSumSubDB1_256=%8 ymm_clobber=%9,%10,%11,%12
%macro AVX2_GetSatd16x4 12
    AVX2_LoadDiffSatd16x1 %1,  %2 + 0 * %3, %5 + 0 * %6, %8, %12
    AVX2_LoadDiffSatd16x1 %9,  %2 + 1 * %3, %5 + 1 * %6, %8, %12
    AVX2_LoadDiffSatd16x1 %10, %2 + 2 * %3, %5 + 2 * %6, %8, %12
    AVX2_LoadDiffSatd16x1 %11, %2 + 1 * %4, %5 + 1 * %7, %8, %12
    AVX2_SatdFour4x4 %1, %9, %10, %11, %12
%endmacro

; Eight 8-pixel rows. Rows n and n+4 share a register (one per lane).
; Side effect: %2 and %5 are each advanced by one stride ("add" below).
; out=%1 pSrcA=%2 iStrideA=%3 3*iStrideA=%4 pSrcB=%5 iStrideB=%6 3*iStrideB=%7 HSumSubDB1_128x2=%8 ymm_clobber=%9,%10,%11,%12,%13
%macro AVX2_GetSatd8x8 13
    AVX2_LoadDiffSatd8x2 %1,  %2 + 0 * %3, %2 + 4 * %3, %5 + 0 * %6, %5 + 4 * %6, %8, %12, %13
    AVX2_LoadDiffSatd8x2 %10, %2 + 2 * %3, %2 + 2 * %4, %5 + 2 * %6, %5 + 2 * %7, %8, %12, %13
    add         %2, %3              ; step one row so the same offsets reach the odd rows
    add         %5, %6
    AVX2_LoadDiffSatd8x2 %9,  %2 + 0 * %3, %2 + 4 * %3, %5 + 0 * %6, %5 + 4 * %6, %8, %12, %13
    AVX2_LoadDiffSatd8x2 %11, %2 + 2 * %3, %2 + 2 * %4, %5 + 2 * %6, %5 + 2 * %7, %8, %12, %13
    AVX2_SatdFour4x4 %1, %9, %10, %11, %12
%endmacro

; Reduces register %2 to a single dword in %1. %2/%3 are register names without
; the x/y prefix (e.g. mm6), so both the xmm and ymm views can be addressed.
; NOTE(review): WELS_DW1_VEX is defined outside this chunk; presumably it fills
; its operand with 16-bit ones - confirm.
; d_out=%1 mm_in=%2 mm_clobber=%3
%macro AVX2_SumWHorizon 3
    WELS_DW1_VEX y%3
    vpmaddwd    y%2, y%2, y%3
    vextracti128 x%3, y%2, 1
    vpaddd      x%2, x%2, x%3       ; high lane + low lane
    vpunpckhqdq x%3, x%2, x%2
    vpaddd      x%2, x%2, x%3       ; upper qword + lower qword
    vpsrldq     x%3, x%2, 4
    vpaddd      x%2, x%2, x%3       ; dword 1 + dword 0
    vmovd       %1, x%2
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatd8x16_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; Entry point only: sets the loop count r4 = 2 and jumps into the shared
; body WelsSampleSatd8x8N_avx2.
;
;***********************************************************************

WELS_EXTERN WelsSampleSatd8x16_avx2
    %assign push_num 0
%ifdef X86_32
    push        r4
    %assign push_num 1
%endif
    mov         r4, 2               ; loop cnt
    jmp         WelsSampleSatd8x8N_avx2

;***********************************************************************
;
;int32_t WelsSampleSatd8x8_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; Sets the loop count r4 = 1 and falls through into the shared body, which
; runs AVX2_GetSatd8x8 r4 times and sums the accumulator ymm6 into retrd.
;
;***********************************************************************

WELS_EXTERN WelsSampleSatd8x8_avx2
    %assign push_num 0
%ifdef X86_32
    push        r4
    %assign push_num 1
%endif
    mov         r4, 1               ; loop cnt
    ; fall through
WelsSampleSatd8x8N_avx2:
%ifdef X86_32
    push        r5
    push        r6
    %assign push_num push_num+2
%endif
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; ymm7 = HSumSubDB1 byte pattern in both 128-bit lanes.
%ifdef X86_32_PICASM
    mov         r1, esp             ; r1 is borrowed to remember esp
    and         esp, 0xfffffff0     ; align the temporary constant to 16 bytes
    push        0xff01ff01
    push        0xff01ff01
    push        0x01010101
    push        0x01010101
    vbroadcasti128 ymm7, [esp]
    mov         esp, r1
    mov         r1, [esp + push_num*4 + 8]  ; reload r1 from its stack argument slot
%else
    vbroadcasti128 ymm7, [HSumSubDB1]
%endif
    lea         r5, [3 * r1]
    lea         r6, [3 * r3]
    vpxor       ymm6, ymm6, ymm6    ; ymm6 = running 16-bit accumulator
.loop:
    AVX2_GetSatd8x8 ymm0, r0, r1, r5, r2, r3, r6, ymm7, ymm1, ymm2, ymm3, ymm4, ymm5
    vpaddw      ymm6, ymm6, ymm0
    sub         r4, 1
    jbe         .loop_end
    ; The macro already advanced r0/r2 by one row; 3 + 4 more rows gives 8.
    add         r0, r5
    add         r2, r6
    lea         r0, [r0 + 4 * r1]
    lea         r2, [r2 + 4 * r3]
    jmp         .loop
.loop_end:
    AVX2_SumWHorizon retrd, mm6, mm5
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop         r6
    pop         r5
    pop         r4
%endif
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x16_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; Entry point only: sets the loop count r4 = 4 and jumps into the shared
; body WelsSampleSatd16x4N_avx2.
;
;***********************************************************************

WELS_EXTERN WelsSampleSatd16x16_avx2
    %assign push_num 0
%ifdef X86_32
    push        r4
    %assign push_num 1
%endif
    mov         r4, 4               ; loop cnt
    jmp         WelsSampleSatd16x4N_avx2

;***********************************************************************
;
;int32_t WelsSampleSatd16x8_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; Sets the loop count r4 = 2 and falls through into the shared body, which
; runs AVX2_GetSatd16x4 r4 times (four rows each) and sums ymm5 into retrd.
;
;***********************************************************************

WELS_EXTERN WelsSampleSatd16x8_avx2
    %assign push_num 0
%ifdef X86_32
    push        r4
    %assign push_num 1
%endif
    mov         r4, 2               ; loop cnt
    ; fall through
WelsSampleSatd16x4N_avx2:
%ifdef X86_32
    push        r5
    push        r6
    %assign push_num push_num+2
%endif
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; ymm6: low lane = first 8 bytes of the HSumSubDB1 pattern (all +1),
    ;       high lane = last 8 bytes (+1,-1 alternating), each repeated twice.
%ifdef X86_32_PICASM
    mov         r0, esp             ; r0 is borrowed to remember esp
    and         esp, 0xfffffff0
    push        0xff01ff01
    push        0xff01ff01
    push        0x01010101
    push        0x01010101
    vpbroadcastq xmm0, [esp]
    vpbroadcastq ymm6, [esp + 8]
    mov         esp, r0
    mov         r0, [esp + push_num*4 + 4]  ; reload r0 from its stack argument slot
%else
    vpbroadcastq xmm0, [HSumSubDB1]
    vpbroadcastq ymm6, [HSumSubDB1 + 8]
%endif
    vpblendd    ymm6, ymm0, ymm6, 11110000b
    lea         r5, [3 * r1]
    lea         r6, [3 * r3]
    vpxor       ymm5, ymm5, ymm5    ; ymm5 = running 16-bit accumulator
.loop:
    AVX2_GetSatd16x4 ymm0, r0, r1, r5, r2, r3, r6, ymm6, ymm1, ymm2, ymm3, ymm4
    vpaddw      ymm5, ymm5, ymm0
    lea         r0, [r0 + 4 * r1]
    lea         r2, [r2 + 4 * r3]
    sub         r4, 1
    ja          .loop
    AVX2_SumWHorizon retrd, mm5, mm0
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop         r6
    pop         r5
    pop         r4
%endif
    ret

%endif

;***********************************************************************
;
;Pixel_satd_wxh_avx2 END
;
;***********************************************************************

;***********************************************************************
;
;Pixel_sad_wxh_sse2 BEGIN
;
;***********************************************************************

; Advances r0/r2 by two rows, then adds the SAD of the next two 16-byte rows
; to xmm0. Clobbers xmm1, xmm2.
; NOTE(review): MOVDQ is defined outside this chunk; the original remark says
; the r0 side must be 16-byte aligned - confirm.
%macro SSE2_GetSad2x16 0
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqu      xmm1, [r2]
    MOVDQ       xmm2, [r0]          ; original note: "[eax] must aligned 16" (eax is now r0)
    psadbw      xmm1, xmm2
    paddw       xmm0, xmm1
    movdqu      xmm1, [r2+r3]
    MOVDQ       xmm2, [r0+r1]
    psadbw      xmm1, xmm2
    paddw       xmm0, xmm1
%endmacro


; Adds the SAD of four 16-byte rows to xmm7. Expects r4 and r5 to hold the
; row-3 offsets for r0 and r2 (callers set them to 3*r1 and 3*r3).
; Does not move r0/r2. Clobbers xmm0, xmm1, xmm2.
%macro SSE2_GetSad4x16 0
    movdqu      xmm0, [r2]
    MOVDQ       xmm2, [r0]
    psadbw      xmm0, xmm2
    paddw       xmm7, xmm0
    movdqu      xmm1, [r2+r3]
    MOVDQ       xmm2, [r0+r1]
    psadbw      xmm1, xmm2
    paddw       xmm7, xmm1
    movdqu      xmm1, [r2+2*r3]
    MOVDQ       xmm2, [r0+2*r1]     ; original note: "[eax] must aligned 16" (eax is now r0)
    psadbw      xmm1, xmm2
    paddw       xmm7, xmm1
    movdqu      xmm1, [r2+r5]
    MOVDQ       xmm2, [r0+r4]
    psadbw      xmm1, xmm2
    paddw       xmm7, xmm1
%endmacro


; Adds the SAD of four 8-byte rows to xmm6, packing two rows per register.
; Side effect: r0 and r2 are each advanced by two rows.
; Clobbers xmm0..xmm3.
%macro SSE2_GetSad8x4 0
    movq        xmm0, [r0]
    movq        xmm1, [r0+r1]
    lea         r0, [r0+2*r1]
    movhps      xmm0, [r0]
    movhps      xmm1, [r0+r1]

    movq        xmm2, [r2]
    movq        xmm3, [r2+r3]
    lea         r2, [r2+2*r3]
    movhps      xmm2, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm2
    psadbw      xmm1, xmm3
    paddw       xmm6, xmm0
    paddw       xmm6, xmm1
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
;The first pointer may be 16-byte aligned;
;in wels, the third parameter cannot be assumed to be 16-byte aligned.
;
; Four SSE2_GetSad4x16 passes accumulate into xmm7; the two qword partial
; sums left by psadbw are then added and returned in retrd.
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x16_sse2
%ifdef X86_32
    push        r4
    push        r5
%endif

    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    lea         r4, [3*r1]
    lea         r5, [3*r3]

    pxor        xmm7, xmm7
    SSE2_GetSad4x16
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    SSE2_GetSad4x16
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    SSE2_GetSad4x16
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    SSE2_GetSad4x16
    movhlps     xmm0, xmm7          ; bring the upper-qword partial sum down
    paddw       xmm0, xmm7
    movd        retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop         r5
    pop         r4
%endif
    ret

;***********************************************************************
;
;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
;First parameter can align to 16 bytes,
;In wels, the third parameter can't align to 16 bytes.
;
;***********************************************************************
; The first two rows are handled inline, the remaining six by three
; SSE2_GetSad2x16 expansions; the sum in xmm0 is returned in retrd.
WELS_EXTERN WelsSampleSad16x8_sse2
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    movdqu      xmm0, [r2]
    MOVDQ       xmm2, [r0]
    psadbw      xmm0, xmm2
    movdqu      xmm1, [r2+r3]
    MOVDQ       xmm2, [r0+r1]
    psadbw      xmm1, xmm2
    paddw       xmm0, xmm1

    SSE2_GetSad2x16
    SSE2_GetSad2x16
    SSE2_GetSad2x16

    movhlps     xmm1, xmm0          ; add the two qword partial sums
    paddw       xmm0, xmm1
    movd        retrd, xmm0
    LOAD_4_PARA_POP
    ret



; SAD over 16 rows of 8 bytes: four SSE2_GetSad8x4 passes accumulate into
; xmm6. Each pass moves r0/r2 by two rows itself, so two more rows are added
; between passes. Result in retrd.
WELS_EXTERN WelsSampleSad8x16_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm6, xmm6

    SSE2_GetSad8x4
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    SSE2_GetSad8x4
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    SSE2_GetSad8x4
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    SSE2_GetSad8x4

    movhlps     xmm0, xmm6
    paddw       xmm0, xmm6
    movd        retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
    ret


; Masks %1 (destructively) and compares it against the last offset at which a
; %2-byte access still fits; the caller branches on the resulting flags.
; With width 8 and cacheline 64 this is: (addr & 0x3f) compared with 56.
%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
    and         %1, 0x1f|(%3>>1)
    cmp         %1, (32-%2)|(%3>>1)
%endmacro

; SAD over 8 rows of 8 bytes with two code paths chosen by CACHE_SPLIT_CHECK
; on the third argument:
;  - offset within 64 bytes <= 56: plain path (.pixel_sad_8x8_nsplit);
;  - otherwise: each row of the third-argument block is rebuilt from two
;    8-byte-aligned qword loads that are shifted and OR-ed together.
; Result in retrd.
WELS_EXTERN WelsSampleSad8x8_sse21
    %assign push_num 0
    mov         r2, arg3
    push        r2                  ; CACHE_SPLIT_CHECK destroys r2; keep a copy
    CACHE_SPLIT_CHECK r2, 8, 64
    jle near    .pixel_sad_8x8_nsplit
    pop         r2
%ifdef X86_32
    push        r3
    push        r4
    push        r5
%endif
    %assign push_num 3
    PUSH_XMM 8
    mov         r0, arg1
    mov         r1, arg2
    SIGN_EXTENSION r1, r1d
    pxor        xmm7, xmm7

    ;ecx r2, edx r4, edi r5

    mov         r5, r2
    and         r5, 0x07            ; r5 = misalignment of r2 within 8 bytes
    sub         r2, r5              ; r2 rounded down to a multiple of 8
    mov         r4, 8
    sub         r4, r5              ; r4 = 8 - misalignment

    shl         r5, 3
    shl         r4, 3
    movd        xmm5, r5d           ; xmm5 = right-shift count in bits
    movd        xmm6, r4d           ; xmm6 = left-shift count in bits
    mov         r5, 8
    add         r5, r2              ; r5 = address of the following aligned qword
    mov         r3, arg4
    SIGN_EXTENSION r3, r3d

    ; rows 0-1
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]

    movq        xmm1, [r2]
    movq        xmm2, [r5]
    movhps      xmm1, [r2+r3]
    movhps      xmm2, [r5+r3]
    psrlq       xmm1, xmm5
    psllq       xmm2, xmm6
    por         xmm1, xmm2          ; xmm1 = the two unaligned 8-byte rows

    psadbw      xmm0, xmm1
    paddw       xmm7, xmm0

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    lea         r5, [r5+2*r3]

    ; rows 2-3
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]

    movq        xmm1, [r2]
    movq        xmm2, [r5]
    movhps      xmm1, [r2+r3]
    movhps      xmm2, [r5+r3]
    psrlq       xmm1, xmm5
    psllq       xmm2, xmm6
    por         xmm1, xmm2

    psadbw      xmm0, xmm1
    paddw       xmm7, xmm0

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    lea         r5, [r5+2*r3]

    ; rows 4-5
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]

    movq        xmm1, [r2]
    movq        xmm2, [r5]
    movhps      xmm1, [r2+r3]
    movhps      xmm2, [r5+r3]
    psrlq       xmm1, xmm5
    psllq       xmm2, xmm6
    por         xmm1, xmm2

    psadbw      xmm0, xmm1
    paddw       xmm7, xmm0

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    lea         r5, [r5+2*r3]

    ; rows 6-7
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]

    movq        xmm1, [r2]
    movq        xmm2, [r5]
    movhps      xmm1, [r2+r3]
    movhps      xmm2, [r5+r3]
    psrlq       xmm1, xmm5
    psllq       xmm2, xmm6
    por         xmm1, xmm2

    psadbw      xmm0, xmm1
    paddw       xmm7, xmm0

    movhlps     xmm0, xmm7
    paddw       xmm0, xmm7
    movd        retrd, xmm0
    POP_XMM
%ifdef X86_32
    pop         r5
    pop         r4
    pop         r3
%endif
    jmp         .return

.pixel_sad_8x8_nsplit:

    pop         r2                  ; balance the push made before the check
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm6, xmm6
    SSE2_GetSad8x4
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    SSE2_GetSad8x4
    movhlps     xmm0, xmm6
    paddw       xmm0, xmm6
    movd        retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
.return:
    ret


;***********************************************************************
;
;Pixel_sad_wxh_sse2 END
;
;***********************************************************************


;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 BEGIN
;
;***********************************************************************


; One 16-byte reference row (%4, located at address %5) contributes to all
; four accumulators:
;   xmm5 += SAD(%1, row)        source row above   -> "ref + stride" sum
;   xmm4 += SAD(row, %3)        source row below   -> "ref - stride" sum
;   xmm6 += SAD([%5-1], %2)     same source row    -> "ref - 1" sum
;   xmm7 += SAD([%5+1], %2)     same source row    -> "ref + 1" sum
; %1 and %4 are destroyed.
%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
    psadbw      %1, %4
    paddw       xmm5, %1
    psadbw      %4, %3
    paddw       xmm4, %4
    movdqu      %4, [%5-1]
    psadbw      %4, %2
    paddw       xmm6, %4
    movdqu      %4, [%5+1]
    psadbw      %4, %2
    paddw       xmm7, %4
%endmacro

; Computes four 16x16 SADs of the block at r0 (stride r1, loaded with movdqa,
; so rows must be 16-byte aligned) against the block at r2 (stride r3)
; displaced by -stride, +stride, -1 and +1. The four 32-bit results are
; stored, in that order, to [r4] with movdqa (16-byte aligned destination).
; xmm0/xmm1/xmm2 rotate as "previous / current / next" source row.
WELS_EXTERN WelsSampleSadFour16x16_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm4, xmm4          ;sad pRefMb-i_stride_ref
    pxor        xmm5, xmm5          ;sad pRefMb+i_stride_ref
    pxor        xmm6, xmm6          ;sad pRefMb-1
    pxor        xmm7, xmm7          ;sad pRefMb+1

    ; source rows 0-1 against reference rows -1 and 0
    movdqa      xmm0, [r0]
    sub         r2, r3              ; r2 -> reference row -1
    movdqu      xmm3, [r2]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    movdqa      xmm1, [r0+r1]
    movdqu      xmm3, [r2+r3]
    psadbw      xmm3, xmm1
    paddw       xmm4, xmm3

    movdqu      xmm2, [r2+r3-1]
    psadbw      xmm2, xmm0
    paddw       xmm6, xmm2

    movdqu      xmm3, [r2+r3+1]
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    ; source rows 2-3
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm2, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
    movdqa      xmm0, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3

    ; source rows 4-5
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm1, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
    movdqa      xmm2, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3

    ; source rows 6-7
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm0, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
    movdqa      xmm1, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3

    ; source rows 8-9
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm2, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
    movdqa      xmm0, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3

    ; source rows 10-11
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm1, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
    movdqa      xmm2, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3

    ; source rows 12-13
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm0, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
    movdqa      xmm1, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3

    ; source rows 14-15
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm2, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
    movdqa      xmm0, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3

    ; tail: the last two reference rows (xmm2 = source row 14, xmm0 = row 15)
    lea         r2, [r2+2*r3]
    movdqu      xmm3, [r2]
    psadbw      xmm2, xmm3
    paddw       xmm5, xmm2

    movdqu      xmm2, [r2-1]
    psadbw      xmm2, xmm0
    paddw       xmm6, xmm2

    movdqu      xmm3, [r2+1]
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movdqu      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    ; add the two qword partial sums of each accumulator, then pack the four
    ; totals into one register: xmm4 | xmm5 | xmm6 | xmm7
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0
    punpckldq   xmm4, xmm5
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm4, xmm6
    movdqa      [r4],xmm4
    POP_XMM
    LOAD_5_PARA_POP
    ret


; 16x8 variant of WelsSampleSadFour16x16_sse2: same scheme and output layout,
; with eight source rows (two inline, six via SSE2_Get4LW16Sad).
WELS_EXTERN WelsSampleSadFour16x8_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm4, xmm4          ;sad pRefMb-i_stride_ref
    pxor        xmm5, xmm5          ;sad pRefMb+i_stride_ref
    pxor        xmm6, xmm6          ;sad pRefMb-1
    pxor        xmm7, xmm7          ;sad pRefMb+1

    ; source rows 0-1 against reference rows -1 and 0
    movdqa      xmm0, [r0]
    sub         r2, r3              ; r2 -> reference row -1
    movdqu      xmm3, [r2]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    movdqa      xmm1, [r0+r1]
    movdqu      xmm3, [r2+r3]
    psadbw      xmm3, xmm1
    paddw       xmm4, xmm3

    movdqu      xmm2, [r2+r3-1]
    psadbw      xmm2, xmm0
    paddw       xmm6, xmm2

    movdqu      xmm3, [r2+r3+1]
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    ; source rows 2-3
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm2, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
    movdqa      xmm0, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3

    ; source rows 4-5
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm1, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
    movdqa      xmm2, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3

    ; source rows 6-7
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movdqa      xmm0, [r0]
    movdqu      xmm3, [r2]
    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
    movdqa      xmm1, [r0+r1]
    movdqu      xmm3, [r2+r3]
    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3

    ; tail: the last two reference rows (xmm0 = source row 6, xmm1 = row 7)
    lea         r2, [r2+2*r3]
    movdqu      xmm3, [r2]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    movdqu      xmm0, [r2-1]
    psadbw      xmm0, xmm1
    paddw       xmm6, xmm0

    movdqu      xmm3, [r2+1]
    psadbw      xmm3, xmm1
    paddw       xmm7, xmm3

    movdqu      xmm3, [r2+r3]
    psadbw      xmm1, xmm3
    paddw       xmm5, xmm1

    ; reduce and pack: xmm4 | xmm5 | xmm6 | xmm7
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0
    punpckldq   xmm4, xmm5
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm4, xmm6
    movdqa      [r4],xmm4
    POP_XMM
    LOAD_5_PARA_POP
    ret



; Four 8x16 SADs (reference displaced by -stride, +stride, -1, +1), stored in
; that order as four dwords to [r4] (movdqa: 16-byte aligned destination).
; Two 8-byte rows are packed per register. For each source row pair in xmm0:
;   xmm6/xmm7 get the -1/+1 SADs, xmm5 the +stride SAD; after the next source
;   pair is loaded, the reference pair still in xmm3 feeds the -stride SAD (xmm4).
WELS_EXTERN WelsSampleSadFour8x16_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm4, xmm4          ;sad pRefMb-i_stride_ref
    pxor        xmm5, xmm5          ;sad pRefMb+i_stride_ref
    pxor        xmm6, xmm6          ;sad pRefMb-1
    pxor        xmm7, xmm7          ;sad pRefMb+1

    ; source rows 0-1, -stride
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    sub         r2, r3              ; r2 -> reference row -1
    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; source rows 0-1: -1, +1, +stride
    movq        xmm1, [r2+r3-1]
    movq        xmm3, [r2+r3+1]

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movhps      xmm1, [r2-1]
    movhps      xmm3, [r2+1]
    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    ; source rows 2-3, -stride
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; source rows 2-3: -1, +1, +stride
    movq        xmm1, [r2+r3-1]
    movq        xmm3, [r2+r3+1]

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movhps      xmm1, [r2-1]
    movhps      xmm3, [r2+1]

    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    ; source rows 4-5, -stride
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; source rows 4-5: -1, +1, +stride
    movq        xmm1, [r2+r3-1]
    movq        xmm3, [r2+r3+1]

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movhps      xmm1, [r2-1]
    movhps      xmm3, [r2+1]

    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    ; source rows 6-7, -stride
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; source rows 6-7: -1, +1, +stride
    movq        xmm1, [r2+r3-1]
    movq        xmm3, [r2+r3+1]

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movhps      xmm1, [r2-1]
    movhps      xmm3, [r2+1]

    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    ; source rows 8-9, -stride
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; source rows 8-9: -1, +1, +stride
    movq        xmm1, [r2+r3-1]
    movq        xmm3, [r2+r3+1]

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movhps      xmm1, [r2-1]
    movhps      xmm3, [r2+1]

    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    ; source rows 10-11, -stride
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; source rows 10-11: -1, +1, +stride
    movq        xmm1, [r2+r3-1]
    movq        xmm3, [r2+r3+1]

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movhps      xmm1, [r2-1]
    movhps      xmm3, [r2+1]

    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    ; source rows 12-13, -stride
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; source rows 12-13: -1, +1, +stride
    movq        xmm1, [r2+r3-1]
    movq        xmm3, [r2+r3+1]

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movhps      xmm1, [r2-1]
    movhps      xmm3, [r2+1]

    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    ; source rows 14-15, -stride
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; source rows 14-15: -1, +1, +stride
    movq        xmm1, [r2+r3-1]
    movq        xmm3, [r2+r3+1]

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movhps      xmm1, [r2-1]
    movhps      xmm3, [r2+1]

    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    ; reduce and pack: xmm4 | xmm5 | xmm6 | xmm7
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0
    punpckldq   xmm4, xmm5
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm4, xmm6
    movdqa      [r4],xmm4
    POP_XMM
    LOAD_5_PARA_POP
    ret


; 8x8 variant of WelsSampleSadFour8x16_sse2: same scheme and output layout,
; with four source row pairs instead of eight.
WELS_EXTERN WelsSampleSadFour8x8_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm4, xmm4          ;sad pRefMb-i_stride_ref
    pxor        xmm5, xmm5          ;sad pRefMb+i_stride_ref
    pxor        xmm6, xmm6          ;sad pRefMb-1
    pxor        xmm7, xmm7          ;sad pRefMb+1

    ; source rows 0-1, -stride
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    sub         r2, r3              ; r2 -> reference row -1
    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; source rows 0-1: -1, +1, +stride
    movq        xmm1, [r2+r3-1]
    movq        xmm3, [r2+r3+1]

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movhps      xmm1, [r2-1]
    movhps      xmm3, [r2+1]
    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    ; source rows 2-3, -stride
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; source rows 2-3: -1, +1, +stride
    movq        xmm1, [r2+r3-1]
    movq        xmm3, [r2+r3+1]

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movhps      xmm1, [r2-1]
    movhps      xmm3, [r2+1]

    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    ; source rows 4-5, -stride
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; source rows 4-5: -1, +1, +stride
    movq        xmm1, [r2+r3-1]
    movq        xmm3, [r2+r3+1]

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movhps      xmm1, [r2-1]
    movhps      xmm3, [r2+1]

    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    ; source rows 6-7, -stride
    movq        xmm0, [r0]
    movhps      xmm0, [r0+r1]
    psadbw      xmm3, xmm0
    paddw       xmm4, xmm3

    ; source rows 6-7: -1, +1, +stride
    movq        xmm1, [r2+r3-1]
    movq        xmm3, [r2+r3+1]

    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    movhps      xmm1, [r2-1]
    movhps      xmm3, [r2+1]

    psadbw      xmm1, xmm0
    paddw       xmm6, xmm1
    psadbw      xmm3, xmm0
    paddw       xmm7, xmm3

    movq        xmm3, [r2]
    movhps      xmm3, [r2+r3]
    psadbw      xmm0, xmm3
    paddw       xmm5, xmm0

    ; reduce and pack: xmm4 | xmm5 | xmm6 | xmm7
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0
    punpckldq   xmm4, xmm5
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm4, xmm6
    movdqa      [r4],xmm4
    POP_XMM
    LOAD_5_PARA_POP
    ret

; Four 4x4 SADs. The whole 4x4 source block fits in xmm0 (four dwords); four
; displaced reference blocks are gathered row by row into xmm1 (-stride),
; xmm2 (-1), xmm3 (+1) and xmm4 (+stride), so one psadbw per candidate is
; enough. Results are stored to [r4] in the order -stride, +stride, -1, +1
; (movdqa: 16-byte aligned destination).
WELS_EXTERN WelsSampleSadFour4x4_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; xmm0 = source rows 0..3
    movd        xmm0, [r0]
    movd        xmm1, [r0+r1]
    lea         r0, [r0+2*r1]
    movd        xmm2, [r0]
    movd        xmm3, [r0+r1]
    punpckldq   xmm0, xmm1
    punpckldq   xmm2, xmm3
    punpcklqdq  xmm0, xmm2

    sub         r2, r3              ; r2 -> reference row -1
    movd        xmm1, [r2]
    movd        xmm2, [r2+r3]
    punpckldq   xmm1, xmm2          ; reference rows -1, 0
    movd        xmm2, [r2+r3-1]
    movd        xmm3, [r2+r3+1]

    lea         r2, [r2+2*r3]       ; r2 -> reference row 1

    movd        xmm4, [r2]
    movd        xmm5, [r2-1]
    punpckldq   xmm2, xmm5
    movd        xmm5, [r2+1]
    punpckldq   xmm3, xmm5

    movd        xmm5, [r2+r3]
    punpckldq   xmm4, xmm5          ; reference rows 1, 2

    punpcklqdq  xmm1, xmm4          ;-L

    movd        xmm5, [r2+r3-1]
    movd        xmm6, [r2+r3+1]

    lea         r2, [r2+2*r3]       ; r2 -> reference row 3
    movd        xmm7, [r2-1]
    punpckldq   xmm5, xmm7
    punpcklqdq  xmm2, xmm5          ;-1
    movd        xmm7, [r2+1]
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm3, xmm6          ;+1


    movd        xmm6, [r2]
    movd        xmm7, [r2+r3]
    punpckldq   xmm6, xmm7          ; reference rows 3, 4
    punpcklqdq  xmm4, xmm6          ;+L

    psadbw      xmm1, xmm0
    psadbw      xmm2, xmm0
    psadbw      xmm3, xmm0
    psadbw      xmm4, xmm0

    ; add the two qword partial sums of each result
    movhlps     xmm0, xmm1
    paddw       xmm1, xmm0
    movhlps     xmm0, xmm2
    paddw       xmm2, xmm0
    movhlps     xmm0, xmm3
    paddw       xmm3, xmm0
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0

    ; pack as -L | +L | -1 | +1
    punpckldq   xmm1, xmm4
    punpckldq   xmm2, xmm3
    punpcklqdq  xmm1, xmm2
    movdqa      [r4],xmm1
    POP_XMM
    LOAD_5_PARA_POP
    ret

;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 END
;
;***********************************************************************
;*********************************************************************** ; int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t ) ;*********************************************************************** WELS_EXTERN WelsSampleSad4x4_mmx %assign push_num 0 LOAD_4_PARA SIGN_EXTENSION r1, r1d SIGN_EXTENSION r3, r3d movd mm0, [r0] movd mm1, [r0+r1] punpckldq mm0, mm1 movd mm3, [r2] movd mm4, [r2+r3] punpckldq mm3, mm4 psadbw mm0, mm3 lea r0, [r0+2*r1] lea r2, [r2+2*r3] movd mm1, [r0] movd mm2, [r0+r1] punpckldq mm1, mm2 movd mm3, [r2] movd mm4, [r2+r3] punpckldq mm3, mm4 psadbw mm1, mm3 paddw mm0, mm1 movd retrd, mm0 WELSEMMS LOAD_4_PARA_POP ret