ref: 691c2bc1e780f0891dc4d86d998a753092dbcf20
dir: /codec/encoder/core/x86/sample_sc.asm/
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*************************************************************************/
%include "asm_inc.asm"
SECTION .text
;**********************************************************************************************************************************
;
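;	uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
;	\note:
;		src must be 16-byte aligned; ref may be unaligned.
;		base_cost is loaded with movdqa, so it must be 16-byte aligned as well.
;	\return value:
;		returns the minimal cost (SAD plus base_cost) over the 8 horizontal candidates;
;		the index of that candidate is written to *index_min_cost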
;**********************************************************************************************************************************
; try 8 mv via offset
; xmm7 store sad costs
%macro   SAD_16x16_LINE_SSE41  4	; src, ref, stride_src, stride_ref
    ; Process one 16-pixel row and step to the next one.
    ;   %1 = src row pointer, %2 = ref row pointer,
    ;   %3 = src stride,      %4 = ref stride
    ; The row itself is handled by SAD_16x16_LINE_SSE41E, which adds the
    ; eight per-offset SADs of this row to the word lanes of xmm7 and
    ; clobbers xmm0-xmm4. Afterwards both pointers are advanced by their
    ; strides (this modifies the flags).
    SAD_16x16_LINE_SSE41E	%1, %2, %3, %4
    add			%1, %3			; src += stride_src
    add			%2, %4			; ref += stride_ref
%endmacro	; end of SAD_16x16_LINE_SSE41
%macro   SAD_16x16_LINE_SSE41E  4	; src, ref, stride_src, stride_ref
    ; Process one 16-pixel row WITHOUT advancing the pointers (used for the
    ; last row of a block; %3 and %4 are accepted but not used).
    ;   %1 = src row pointer (read with movdqa, so it must be 16-byte aligned)
    ;   %2 = ref row pointer (read with movdqu; 24 bytes starting at %2 are touched)
    ; Output : xmm7 word lane i += SAD of the 16 src bytes against ref+i, i = 0..7
    ; Clobber: xmm0-xmm4

    ; --- loads -----------------------------------------------------------
    movdqa		xmm0, [%1]		; src[0..15]
    movdqu		xmm1, [%2]		; ref[0..15]
    movdqu		xmm2, [%2+8]		; ref[8..23]
    movdqa		xmm3, xmm1		; second copy of ref[0..15]
    movdqa		xmm4, xmm2		; second copy of ref[8..23]

    ; --- four partial SADs, one per 4-byte group of src --------------------
    ; imm bits 1:0 pick the src dword, imm bit 2 picks byte offset 0 or 4
    ; inside the ref register.
    mpsadbw		xmm1, xmm0, 000b	; src[0..3]   vs ref[0+i..]
    mpsadbw		xmm3, xmm0, 101b	; src[4..7]   vs ref[4+i..]
    mpsadbw		xmm2, xmm0, 010b	; src[8..11]  vs ref[8+i..]
    mpsadbw		xmm4, xmm0, 111b	; src[12..15] vs ref[12+i..]

    ; --- accumulate into the running cost vector ---------------------------
    paddw		xmm7, xmm1
    paddw		xmm7, xmm3
    paddw		xmm7, xmm2
    paddw		xmm7, xmm4
%endmacro	; end of SAD_16x16_LINE_SSE41E
WELS_EXTERN SampleSad16x16Hor8_sse41
    ; Register roles after LOAD_6_PARA:
    ;   r0 = src, r1 = stride_src, r2 = ref, r3 = stride_ref,
    ;   r4 = base_cost, r5 = index_min_cost
    ; xmm7 accumulates the eight 16-bit SADs (one per horizontal offset 0..7).
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION	r1, r1d
    SIGN_EXTENSION	r3, r3d

    pxor		xmm7, xmm7		; SAD accumulators = 0

    ; rows 0..14: accumulate and advance; row 15: accumulate only
%rep 15
    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
%endrep
    SAD_16x16_LINE_SSE41E	r0, r2, r1, r3

    ; Widen SADs and base costs to 32 bits and add them:
    ;   xmm4 = cost[0..3], xmm5 = cost[4..7]
    pxor		xmm0, xmm0
    movdqa		xmm5, [r4]		; base_cost[0..7] (aligned load)
    pmovzxwd	xmm6, xmm7		; SAD[0..3] as dwords
    punpckhwd	xmm7, xmm0		; SAD[4..7] as dwords
    pmovzxwd	xmm4, xmm5		; base_cost[0..3] as dwords
    punpckhwd	xmm5, xmm0		; base_cost[4..7] as dwords
    paddd		xmm4, xmm6
    paddd		xmm5, xmm7

    ; Horizontal unsigned minimum of the eight costs, broadcast to every
    ; lane of xmm2.
    movdqa		xmm3, xmm4
    pminud		xmm3, xmm5
    pshufd		xmm2, xmm3, 0x4E	; swap 64-bit halves
    pminud		xmm2, xmm3
    pshufd		xmm3, xmm2, 0xB1	; swap dwords within each half
    pminud		xmm2, xmm3
    movd		retrd, xmm2		; return value = minimal cost

    ; Index of the first candidate whose cost equals the minimum.
    ; Bits 0..3 of the mask come from cost[0..3], bits 4..7 from cost[4..7];
    ; at least one bit is set because the minimum is one of the eight costs.
    pcmpeqd		xmm4, xmm2
    pcmpeqd		xmm5, xmm2
    movmskps	r1d, xmm4
    movmskps	r2d, xmm5
    shl			r2d, 4
    or			r1d, r2d
    bsf			r1d, r1d
    mov			[r5], r1d		; *index_min_cost = index

    POP_XMM
    LOAD_6_PARA_POP
    ret
;**********************************************************************************************************************************
;
;	uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
;	\note:
;		src and ref do not need to be 16-byte aligned (inter 8x8 blocks).
;		base_cost is loaded with movdqa, so it must be 16-byte aligned.
;	\return value:
;		returns the minimal cost (SAD plus base_cost) over the 8 horizontal candidates;
;		the index of that candidate is written to *index_min_cost
;
;**********************************************************************************************************************************
; try 8 mv via offset
; xmm7 store sad costs
%macro   SAD_8x8_LINE_SSE41  4	; src, ref, stride_src, stride_ref
    ; Process one 8-pixel row and step to the next one.
    ;   %1 = src row pointer, %2 = ref row pointer,
    ;   %3 = src stride,      %4 = ref stride
    ; The row itself is handled by SAD_8x8_LINE_SSE41E, which adds the
    ; eight per-offset SADs of this row to the word lanes of xmm7 and
    ; clobbers xmm0-xmm2. Afterwards both pointers are advanced by their
    ; strides (this modifies the flags).
    SAD_8x8_LINE_SSE41E	%1, %2, %3, %4
    add			%1, %3			; src += stride_src
    add			%2, %4			; ref += stride_ref
%endmacro	; end of SAD_8x8_LINE_SSE41
%macro   SAD_8x8_LINE_SSE41E  4	; src, ref, stride_src, stride_ref
    ; Process one 8-pixel row WITHOUT advancing the pointers (used for the
    ; last row of a block; %3 and %4 are accepted but not used).
    ;   %1 = src row pointer, %2 = ref row pointer
    ;        (both read with movdqu, 16 bytes each, no alignment needed)
    ; Output : xmm7 word lane i += SAD of src[0..7] against ref+i, i = 0..7
    ; Clobber: xmm0-xmm2

    ; --- loads -----------------------------------------------------------
    movdqu		xmm0, [%1]		; src row (only bytes 0..7 take part)
    movdqu		xmm1, [%2]		; ref[0..15]
    movdqa		xmm2, xmm1		; second copy of ref[0..15]

    ; --- two partial SADs, one per 4-byte group of src ---------------------
    ; imm bits 1:0 pick the src dword, imm bit 2 picks byte offset 0 or 4
    ; inside the ref register.
    mpsadbw		xmm1, xmm0, 000b	; src[0..3] vs ref[0+i..]
    mpsadbw		xmm2, xmm0, 101b	; src[4..7] vs ref[4+i..]

    ; --- accumulate into the running cost vector ---------------------------
    paddw		xmm7, xmm1
    paddw		xmm7, xmm2
%endmacro	; end of SAD_8x8_LINE_SSE41E
WELS_EXTERN SampleSad8x8Hor8_sse41
    ; Register roles after LOAD_6_PARA:
    ;   r0 = src, r1 = stride_src, r2 = ref, r3 = stride_ref,
    ;   r4 = base_cost, r5 = index_min_cost
    ; xmm7 holds the eight 16-bit running costs (one per horizontal offset 0..7).
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION	r1, r1d
    SIGN_EXTENSION	r3, r3d

    movdqa		xmm7, [r4]		; start from base_cost[0..7] (aligned load)

    ; rows 0..6: accumulate and advance; row 7: accumulate only
%rep 7
    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
%endrep
    SAD_8x8_LINE_SSE41E	r0, r2, r1, r3

    ; phminposuw: word 0 = minimal cost, word 1 = its index, remaining bits zero
    phminposuw	xmm0, xmm7
    pextrw		retrd, xmm0, 0		; return value = minimal cost (zero-extended)
    pextrw		r1d, xmm0, 1		; index of the minimal cost
    mov			[r5], r1d		; *index_min_cost = index

    POP_XMM
    LOAD_6_PARA_POP
    ret