ref: 783e6fbd124f0d5ef4f993c24ab67abd80706cdb
parent: 7a9f15cb8a1371556242aa2de79d09ae42c57675
author: Guangwei Wang <guangwwa@cisco.com>
date: Fri Oct 28 12:20:30 EDT 2016
avoid text relocations on x86 32-bit builds
--- a/codec/common/x86/dct.asm
+++ b/codec/common/x86/dct.asm
@@ -393,12 +393,38 @@
; out=%1 in=%1 clobber=%2
%macro SSE2_DCT_HORIZONTAL 2
pshuflw %2, %1, 1bh ; [x[3],x[2],x[1],x[0]] low qw
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0xffff0001 ;wels_p1m1p1m1w_128
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ push 0x0001ffff ;wels_p1m1m1p1w_128
+ push 0xffff0001
+ push 0x0001ffff
+ push 0xffff0001
+ push 0x00020001 ;wels_p1p2p1p2w_128
+ push 0x00020001
+ push 0x00020001
+ push 0x00020001
+ pmullw %1, [esp+32] ; [x[0],-x[1],x[2],-x[3], ...]
+%else
pmullw %1, [wels_p1m1p1m1w_128] ; [x[0],-x[1],x[2],-x[3], ...]
+%endif
pshufhw %2, %2, 1bh ; [x[3],x[2],x[1],x[0]] high qw
paddw %1, %2 ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
pshufd %2, %1, 0b1h ; [s[2],s[3],s[0],s[1], ...]
+%ifdef X86_32_PICASM
+ pmullw %1, [esp+16] ; [s[0],-s[1],-s[2],s[3], ...]
+ pmullw %2, [esp] ; [s[2],2*s[3],s[0],2*s[1], ...]]
+ mov esp, r0
+ pop r0
+%else
pmullw %1, [wels_p1m1m1p1w_128] ; [s[0],-s[1],-s[2],s[3], ...]
pmullw %2, [wels_p1p2p1p2w_128] ; [s[2],2*s[3],s[0],2*s[1], ...]]
+%endif
paddw %1, %2 ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
%endmacro
@@ -410,7 +436,22 @@
;
; out=%1 in=%1 wels_p1m1m1p1w_128=%2 clobber=%3,%4
%macro SSE2_IDCT_HORIZONTAL 4
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x80000000 ;wels_p0m8000p0m8000w_128
+ push 0x80000000
+ push 0x80000000
+ push 0x80000000
+ push 0xffffffff ;wels_p1p1m1m1w_128
+ push 0x00010001
+ push 0xffffffff
+ push 0x00010001
+ movdqa %3, [esp+16]
+%else
movdqa %3, [wels_p0m8000p0m8000w_128]
+%endif
pmulhw %3, %1 ; x[0:7] * [0,-8000h,0,-8000h, ...] >> 16
pshufd %4, %1, 0b1h ; [x[2],x[3],x[0],x[1], ...]
pmullw %4, %2 ; [x[2],-x[3],-x[0],x[1], ...]
@@ -417,7 +458,13 @@
paddw %1, %3 ; [x[0]+0,x[1]+(-x[1]>>1),x[2]+0,x[3]+(-x[3]>>1), ...]
paddw %1, %4 ; s = [x[0]+x[2],(x[1]>>1)-x[3],x[2]-x[0],(x[3]>>1)+x[1], ...]
pshuflw %3, %1, 1bh ; [s[3],s[2],s[1],s[0]] low qw
+%ifdef X86_32_PICASM
+ pmullw %1, [esp] ; [s[0],s[1],-s[2],-s[3], ...]
+ mov esp, r0
+ pop r0
+%else
pmullw %1, [wels_p1p1m1m1w_128] ; [s[0],s[1],-s[2],-s[3], ...]
+%endif
pshufhw %3, %3, 1bh ; [s[3],s[2],s[1],s[0]] high qw
pmullw %3, %2 ; [s[3],-s[2],-s[1],s[0], ...]
paddw %1, %3 ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...]
@@ -434,9 +481,24 @@
punpckhqdq %2, %1 ; s03 = [x0+x3,x0-x3]
punpcklqdq %3, %1 ; s12 = [x1+x2,x1-x2]
movdqa %1, %2
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x00020002 ;wels_4xp1w_4xp2w
+ push 0x00020002
+ push 0x00010001
+ push 0x00010001
+ pmullw %1, [esp] ; [s03[0],2*s03[1]]
+ paddw %1, %3 ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]]
+ pmullw %3, [esp] ; [s12[0],2*s12[1]]
+ mov esp, r0
+ pop r0
+%else
pmullw %1, [wels_4xp1w_4xp2w] ; [s03[0],2*s03[1]]
paddw %1, %3 ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]]
pmullw %3, [wels_4xp1w_4xp2w] ; [s12[0],2*s12[1]]
+%endif
psubw %2, %3 ; [y2,y3] = [s03[0]-s12[0],s03[1]-2*s12[1]]
%endmacro
@@ -444,7 +506,20 @@
; Output is scrambled to save a negation.
; [y1,y0]=%1 [y2,y3]=%2 [x0,x1]=%1 [x2,x3]=%2 clobber=%3,%4
%macro SSE2_IDCT_4x4P 4
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x80008000 ;wels_4xp0w_4xm8000w
+ push 0x80008000
+ push 0x00000000
+ push 0x00000000
+ movdqa %4, [esp]
+ mov esp, r0
+ pop r0
+%else
movdqa %4, [wels_4xp0w_4xm8000w]
+%endif
movdqa %3, %1
pmulhw %3, %4 ; x[0:1] * [0,-8000h] >> 16
pmulhw %4, %2 ; x[2:3] * [0,-8000h] >> 16
@@ -521,7 +596,18 @@
;Load 4x8
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x0001ffff ;wels_p1m1m1p1w_128
+ push 0xffff0001
+ push 0x0001ffff
+ push 0xffff0001
+ movdqa xmm7, [esp]
+%else
movdqa xmm7, [wels_p1m1m1p1w_128]
+%endif
SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
@@ -540,7 +626,13 @@
lea r2, [r2 + 2 * r3]
SSE2_Load4x8p r4+64, xmm0, xmm1, xmm4, xmm2, xmm5
+%ifdef X86_32_PICASM
+ movdqa xmm7, [esp]
+ mov esp, r5
+ pop r5
+%else
movdqa xmm7, [wels_p1m1m1p1w_128]
+%endif
SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
@@ -604,7 +696,20 @@
SSE2_Load2x4P xmm0, r4
SSE2_Load2x4P xmm1, r4+16
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xfffffff0
+ push 0x0001ffff ;wels_p1m1m1p1w_128
+ push 0xffff0001
+ push 0x0001ffff
+ push 0xffff0001
+ movdqa xmm4, [esp]
+ mov esp, r5
+ pop r5
+%else
movdqa xmm4, [wels_p1m1m1p1w_128]
+%endif
SSE2_IDCT_HORIZONTAL xmm0, xmm4, xmm2, xmm3
SSE2_IDCT_HORIZONTAL xmm1, xmm4, xmm2, xmm3
SSE2_IDCT_4x4P xmm0, xmm1, xmm2, xmm3
@@ -710,7 +815,20 @@
vpshufb y%9, y%9, y%8
vpaddsw y%4, y%4, y%9
vpackuswb y%3, y%3, y%4
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xffffffe0
+ push 0x0d0f0e0c ;wels_shufb0231_128
+ push 0x090b0a08
+ push 0x05070604
+ push 0x01030200
+ vbroadcasti128 y%4, [esp]
+ mov esp, r0
+ pop r0
+%else
vbroadcasti128 y%4, [wels_shufb0231_128]
+%endif
vpshufb y%3, y%3, y%4
vextracti128 x%4, y%3, 1
vmovlps [%1 ], x%3
@@ -788,7 +906,20 @@
AVX2_Loadzx4x4P %8, %4, %5, y%7, %9, %10
vpaddsw y%3, y%3, y%8
vpackuswb y%3, y%3, y%3
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xffffffe0
+ push 0x0d0f0e0c ;wels_shufb0231_128
+ push 0x090b0a08
+ push 0x05070604
+ push 0x01030200
+ vbroadcasti128 y%8, [esp]
+ mov esp, r0
+ pop r0
+%else
vbroadcasti128 y%8, [wels_shufb0231_128]
+%endif
vpshufb y%3, y%3, y%8
vextracti128 x%8, y%3, 1
vmovd [%1 ], x%3
@@ -834,10 +965,39 @@
; Uses scrambled input to save a negation.
; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 wels_shufb2301=%2 clobber=%3
%macro AVX2_DCT_HORIZONTAL 3
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xffffffe0
+ push 0xffff0001 ;wels_p1m1p1m1w_256
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ push 0xfffeffff ;wels_p1p2m1m2w_256
+ push 0x00020001
+ push 0xfffeffff
+ push 0x00020001
+ push 0xfffeffff
+ push 0x00020001
+ push 0xfffeffff
+ push 0x00020001
+ vpsignw %3, %1, [esp+32] ; [x0,-x3,x1,-x2]
+%else
vpsignw %3, %1, [wels_p1m1p1m1w_256] ; [x0,-x3,x1,-x2]
+%endif
vpshufb %1, %1, %2 ; [x3,x0,x2,x1]
vpaddw %1, %1, %3 ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
+%ifdef X86_32_PICASM
+ vpmullw %3, %1, [esp] ; [s[0],2*s[1],-s[2],-2*s[3], ...]
+ mov esp, r0
+ pop r0
+%else
vpmullw %3, %1, [wels_p1p2m1m2w_256] ; [s[0],2*s[1],-s[2],-2*s[3], ...]
+%endif
vpshufd %1, %1, 0b1h ; [s[2],s[3],s[0],s[1], ...]
vpaddw %1, %1, %3 ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1], ...]
%endmacro
@@ -848,11 +1008,40 @@
%macro AVX2_IDCT_HORIZONTAL 3
vpsraw %3, %1, 1 ; [x0>>1,x1>>1,x2>>1,x3>>1]
vpblendw %3, %1, %3, 10101010b ; [x0,x1>>1,x2,x3>>1]
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xffffffe0
+ push 0xffffffff ;wels_p1p1m1m1w_256
+ push 0x00010001
+ push 0xffffffff
+ push 0x00010001
+ push 0xffffffff
+ push 0x00010001
+ push 0xffffffff
+ push 0x00010001
+ push 0xffff0001 ;wels_p1m1p1m1w_256
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ vpsignw %1, %1, [esp+32] ; [x0,x1,-x2,-x3]
+%else
vpsignw %1, %1, [wels_p1p1m1m1w_256] ; [x0,x1,-x2,-x3]
+%endif
vpshufd %3, %3, 0b1h ; [x2,x3>>1,x0,x1>>1]
vpaddw %1, %3, %1 ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
vpshufb %3, %1, %2 ; [s[1],s[0],s[3],s[2], ...]
+%ifdef X86_32_PICASM
+ vpsignw %1, %1, [esp] ; [s[0],-s[1],s[2],-s[3], ...]
+ mov esp, r0
+ pop r0
+%else
vpsignw %1, %1, [wels_p1m1p1m1w_256] ; [s[0],-s[1],s[2],-s[3], ...]
+%endif
vpaddw %1, %1, %3 ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2], ...]
%endmacro
@@ -860,10 +1049,39 @@
; Uses scrambled input to save a negation.
; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 clobber=%2
%macro AVX2_DCT_4x4P 2
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xffffffe0
+ push 0xffffffff ;wels_4xp1w_4xm1w_256
+ push 0xffffffff
+ push 0x00010001
+ push 0x00010001
+ push 0xffffffff
+ push 0xffffffff
+ push 0x00010001
+ push 0x00010001
+ push 0xfffefffe ;wels_4xp1w_4xp2w_4xm1w_4xm2w
+ push 0xfffefffe
+ push 0xffffffff
+ push 0xffffffff
+ push 0x00020002
+ push 0x00020002
+ push 0x00010001
+ push 0x00010001
+ vpsignw %2, %1, [esp+32] ; [x0,-x3,x1,-x2]
+%else
vpsignw %2, %1, [wels_4xp1w_4xm1w_256] ; [x0,-x3,x1,-x2]
+%endif
vpshufd %1, %1, 4eh ; [x3,x0,x2,x1]
vpaddw %1, %1, %2 ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
+%ifdef X86_32_PICASM
+ vpmullw %2, %1, [esp] ; [s[0],2*s[1],-s[2],-2*s[3]]
+ mov esp, r0
+ pop r0
+%else
vpmullw %2, %1, [wels_4xp1w_4xp2w_4xm1w_4xm2w] ; [s[0],2*s[1],-s[2],-2*s[3]]
+%endif
vpermq %1, %1, 4eh ; [s[2],s[3],s[0],s[1]]
vpaddw %1, %1, %2 ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1]]
%endmacro
@@ -874,11 +1092,40 @@
%macro AVX2_IDCT_4x4P 2
vpsraw %2, %1, 1 ; [x0>>1,x1>>1,x2>>1,x3>>1]
vpblendw %2, %1, %2, 11110000b ; [x0,x1>>1,x2,x3>>1]
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xffffffe0
+ push 0xffffffff ;wels_8xp1w_8xm1w
+ push 0xffffffff
+ push 0xffffffff
+ push 0xffffffff
+ push 0x00010001
+ push 0x00010001
+ push 0x00010001
+ push 0x00010001
+ push 0xffffffff ;wels_4xp1w_4xm1w_256
+ push 0xffffffff
+ push 0x00010001
+ push 0x00010001
+ push 0xffffffff
+ push 0xffffffff
+ push 0x00010001
+ push 0x00010001
+ vpsignw %1, %1, [esp+32] ; [x0,x1,-x2,-x3]
+%else
vpsignw %1, %1, [wels_8xp1w_8xm1w] ; [x0,x1,-x2,-x3]
+%endif
vpermq %2, %2, 4eh ; [x2,x3>>1,x0,x1>>1]
vpaddw %1, %2, %1 ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
vpshufd %2, %1, 4eh ; [s[1],s[0],s[3],s[2]]
+%ifdef X86_32_PICASM
+ vpmullw %1, %1, [esp] ; [s[0],-s[1],s[2],-s[3], ...]
+ mov esp, r0
+ pop r0
+%else
vpmullw %1, %1, [wels_4xp1w_4xm1w_256] ; [s[0],-s[1],s[2],-s[3], ...]
+%endif
vpaddw %1, %1, %2 ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2]]
%endmacro
@@ -892,7 +1139,22 @@
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r4, r4d
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x80068005 ;wels_shufb0312_movzxw_128
+ push 0x80078004
+ push 0x80028001
+ push 0x80038000
+ push 0x0d0c0f0e ;wels_shufb2301_128
+ push 0x09080b0a
+ push 0x05040706
+ push 0x01000302
+ vbroadcasti128 ymm6, [esp+16]
+%else
vbroadcasti128 ymm6, [wels_shufb0312_movzxw_128]
+%endif
;Load 4x16
AVX2_LoadDiff16P mm0, r1, r2, r3, r4, mm6, mm4, mm5
@@ -907,7 +1169,13 @@
AVX2_LoadDiff16P mm3, r1, r2, r3, r4, mm6, mm4, mm5
AVX2_DCT ymm0, ymm1, ymm2, ymm3, ymm5
+%ifdef X86_32_PICASM
+ vbroadcasti128 ymm6, [esp]
+ mov esp, r5
+ pop r5
+%else
vbroadcasti128 ymm6, [wels_shufb2301_128]
+%endif
AVX2_DCT_HORIZONTAL ymm0, ymm6, ymm5
AVX2_DCT_HORIZONTAL ymm1, ymm6, ymm5
AVX2_DCT_HORIZONTAL ymm2, ymm6, ymm5
@@ -940,7 +1208,26 @@
SIGN_EXTENSION r3, r3d
AVX2_Load4x16P mm0, mm1, mm2, mm3, r4, mm5
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x0d0c0f0e ;wels_shufb2301_128
+ push 0x09080b0a
+ push 0x05040706
+ push 0x01000302
+ push 0x80068005 ;wels_shufb0312_movzxw_128
+ push 0x80078004
+ push 0x80028001
+ push 0x80038000
+ push 0x00200020 ;wels_dw32_128
+ push 0x00200020
+ push 0x00200020
+ push 0x00200020
+ vbroadcasti128 ymm6, [esp+32]
+%else
vbroadcasti128 ymm6, [wels_shufb2301_128]
+%endif
AVX2_IDCT_HORIZONTAL ymm0, ymm6, ymm5
AVX2_IDCT_HORIZONTAL ymm1, ymm6, ymm5
AVX2_IDCT_HORIZONTAL ymm2, ymm6, ymm5
@@ -947,8 +1234,15 @@
AVX2_IDCT_HORIZONTAL ymm3, ymm6, ymm5
AVX2_IDCT ymm0, ymm1, ymm2, ymm3, ymm5
+%ifdef X86_32_PICASM
+ vbroadcasti128 ymm6, [esp+16]
+ vbroadcasti128 ymm7, [esp]
+ mov esp, r5
+ pop r5
+%else
vbroadcasti128 ymm6, [wels_shufb0312_movzxw_128]
vbroadcasti128 ymm7, [wels_dw32_128]
+%endif
AVX2_StoreDiff32P r0, r1, mm0, mm1, r2, r3, mm7, mm6, mm5, mm4
add r2, r3
add r0, r1
@@ -969,10 +1263,31 @@
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r4, r4d
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x80068005 ;wels_shufb0312_movzxw_128
+ push 0x80078004
+ push 0x80028001
+ push 0x80038000
+ push 0x0d0c0f0e ;wels_shufb2301_128
+ push 0x09080b0a
+ push 0x05040706
+ push 0x01000302
+ vbroadcasti128 ymm1, [esp+16]
+%else
vbroadcasti128 ymm1, [wels_shufb0312_movzxw_128]
+%endif
AVX2_LoadDiff4x4P mm0, r1, r2, r3, r4, mm1, mm2, mm3, mm4
AVX2_DCT_4x4P ymm0, ymm2
+%ifdef X86_32_PICASM
+ vbroadcasti128 ymm1, [esp]
+ mov esp, r5
+ pop r5
+%else
vbroadcasti128 ymm1, [wels_shufb2301_128]
+%endif
AVX2_DCT_HORIZONTAL ymm0, ymm1, ymm2
AVX2_Store4x4P r0, mm0
vzeroupper
@@ -1001,11 +1316,37 @@
SIGN_EXTENSION r3, r3d
AVX2_Load4x4P mm0, r4
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x0d0c0f0e ;wels_shufb2301_128
+ push 0x09080b0a
+ push 0x05040706
+ push 0x01000302
+ push 0x80068005 ;wels_shufb0312_movzxw_128
+ push 0x80078004
+ push 0x80028001
+ push 0x80038000
+ push 0x00200020 ;wels_dw32_128
+ push 0x00200020
+ push 0x00200020
+ push 0x00200020
+ vbroadcasti128 ymm4, [esp+32]
+%else
vbroadcasti128 ymm4, [wels_shufb2301_128]
+%endif
AVX2_IDCT_HORIZONTAL ymm0, ymm4, ymm1
AVX2_IDCT_4x4P ymm0, ymm1
+%ifdef X86_32_PICASM
+ vbroadcasti128 ymm4, [esp+16]
+ vbroadcasti128 ymm5, [esp]
+ mov esp, r5
+ pop r5
+%else
vbroadcasti128 ymm4, [wels_shufb0312_movzxw_128]
vbroadcasti128 ymm5, [wels_dw32_128]
+%endif
AVX2_StoreDiff4x4P r0, r1, mm0, r2, r3, mm5, mm4, mm1, mm2, mm3
vzeroupper
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -157,9 +157,25 @@
; Unbias and split into a non-negative and a non-positive part.
; Clip each part to iTc via minub.
; Add/subtract each part to/from p0/q0 and clip.
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ sub esp, 16
+ and esp, -16
+ push 0x60606060 ;WELS_DB96_16
+ push 0x60606060
+ push 0x60606060
+ push 0x60606060
+ movdqa %6, [esp]
+ psubusb %6, %8
+ psubusb %8, [esp]
+ mov esp, r0
+ pop r0
+%else
movdqa %6, [WELS_DB96_16]
psubusb %6, %8
psubusb %8, [WELS_DB96_16]
+%endif
pminub %6, %5
pminub %8, %5
psubusb %2, %6
@@ -182,8 +198,21 @@
movd xmm1, arg3d
movd xmm2, arg4d
pxor xmm3, xmm3
+%ifdef X86_32_PICASM
+ push r4
+ mov r4, esp
+ sub esp, 16
+ and esp, -16
+ push 0x7f7f7f7f
+ push 0x7f7f7f7f
+ push 0x7f7f7f7f
+ push 0x7f7f7f7f
+ pxor xmm1, [esp]
+ pxor xmm2, [esp]
+%else
pxor xmm1, [WELS_DB127_16]
pxor xmm2, [WELS_DB127_16]
+%endif
pshufb xmm1, xmm3 ; iAlpha ^ 0x7f
pshufb xmm2, xmm3 ; iBeta ^ 0x7f
mov r2, r1 ; iStride
@@ -196,22 +225,40 @@
MOVDQ xmm0, [r0 + 0 * r2] ; q0
movdqa xmm4, xmm6
SSE2_AbsDiffUB xmm6, xmm0, xmm3 ; |p0 - q0|
+%ifdef X86_32_PICASM
+ SSE2_CmpltUB xmm6, xmm1, [esp] ; bDeltaP0Q0 = |p0 - q0| < iAlpha
+%else
SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16] ; bDeltaP0Q0 = |p0 - q0| < iAlpha
+%endif
MOVDQ xmm1, [r0 + 1 * r2] ; q1
SSE2_AbsDiffUB xmm7, xmm4, xmm3 ; |p1 - p0|
SSE2_AbsDiffUB xmm0, xmm1, xmm3 ; |q1 - q0|
pmaxub xmm7, xmm0 ; max(|p1 - p0|, |q1 - q0|)
+%ifdef X86_32_PICASM
+ SSE2_CmpltUB xmm7, xmm2, [esp] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
+%else
SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
+%endif
pand xmm6, xmm7 ; bDeltaP0Q0P1P0Q1Q0 = bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
MOVDQ xmm7, [r3 + 2 * r1] ; p2
movdqa xmm0, xmm7
SSE2_AbsDiffUB xmm7, xmm4, xmm3 ; |p2 - p0|
+%ifdef X86_32_PICASM
+ SSE2_CmpltUB xmm7, xmm2, [esp] ; bDeltaP2P0 = |p2 - p0| < iBeta
+%else
SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16] ; bDeltaP2P0 = |p2 - p0| < iBeta
+%endif
MOVDQ xmm5, [r0 + 2 * r2] ; q2
MOVDQ xmm3, [r0 + 0 * r2] ; q0
movdqa xmm1, xmm5
SSE2_AbsDiffUB xmm5, xmm3, xmm4 ; |q2 - q0|
+%ifdef X86_32_PICASM
+ SSE2_CmpltUB xmm5, xmm2, [esp] ; bDeltaQ2Q0 = |q2 - q0| < iBeta
+ mov esp, r4
+ pop r4
+%else
SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16] ; bDeltaQ2Q0 = |q2 - q0| < iBeta
+%endif
pavgb xmm3, [r3 + 0 * r1]
pcmpeqw xmm2, xmm2 ; FFh
@@ -226,7 +273,21 @@
pxor xmm1, xmm2
movd xmm3, [r4]
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ sub esp, 16
+ and esp, -16
+ push 0x03030303 ;WELS_SHUFB0000111122223333
+ push 0x02020202
+ push 0x01010101
+ push 0x00000000
+ pshufb xmm3, [esp] ; iTc
+ mov esp, r0
+ pop r0
+%else
pshufb xmm3, [WELS_SHUFB0000111122223333] ; iTc
+%endif
movdqa xmm4, xmm3 ; iTc0 = iTc
pcmpgtb xmm3, xmm2 ; iTc > -1 ? 0xff : 0x00
pand xmm6, xmm3 ; bDeltaP0Q0P1P0Q1Q0 &= iTc > -1
@@ -328,8 +389,21 @@
add r2, 1
movd xmm3, r2d
pxor xmm4, xmm4
+%ifdef X86_32_PICASM
+ push r4
+ mov r4, esp
+ sub esp, 16
+ and esp, -16
+ push 0x7f7f7f7f ;WELS_DB127_16
+ push 0x7f7f7f7f
+ push 0x7f7f7f7f
+ push 0x7f7f7f7f
+ pxor xmm1, [esp]
+ pxor xmm2, [esp]
+%else
pxor xmm1, [WELS_DB127_16]
pxor xmm2, [WELS_DB127_16]
+%endif
pshufb xmm1, xmm4 ; iAlpha ^ 0x7f
pshufb xmm2, xmm4 ; iBeta ^ 0x7f
pshufb xmm3, xmm4 ; (iAlpha >> 2) + 1
@@ -344,23 +418,41 @@
movdqa xmm4, xmm6
SSE2_AbsDiffUB xmm6, xmm0, xmm5 ; |p0 - q0|
SSE2_CmpgeUB xmm3, xmm6 ; |p0 - q0| < (iAlpha >> 2) + 2
+%ifdef X86_32_PICASM
+ SSE2_CmpltUB xmm6, xmm1, [esp] ; bDeltaP0Q0 = |p0 - q0| < iAlpha
+%else
SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16] ; bDeltaP0Q0 = |p0 - q0| < iAlpha
+%endif
MOVDQ xmm1, [r0 + 1 * r2] ; q1
SSE2_AbsDiffUB xmm7, xmm4, xmm5 ; |p1 - p0|
SSE2_AbsDiffUB xmm0, xmm1, xmm5 ; |q1 - q0|
pmaxub xmm7, xmm0 ; max(|p1 - p0|, |q1 - q0|)
+%ifdef X86_32_PICASM
+ SSE2_CmpltUB xmm7, xmm2, [esp] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
+%else
SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
+%endif
pand xmm6, xmm7 ; & bDeltaP0Q0
MOVDQ xmm7, [r3 + 2 * r1] ; p2
SSE2_AbsDiffUB xmm7, xmm4, xmm5 ; |p2 - p0|
+%ifdef X86_32_PICASM
+ SSE2_CmpltUB xmm7, xmm2, [esp] ; bDeltaP2P0 = |p2 - p0| < iBeta
+%else
SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16] ; bDeltaP2P0 = |p2 - p0| < iBeta
+%endif
pand xmm7, xmm3 ; &= |p0 - q0| < (iAlpha >> 2) + 2
MOVDQ xmm0, [r0 + 0 * r2] ; q0
MOVDQ xmm5, [r0 + 2 * r2] ; q2
SSE2_AbsDiffUB xmm5, xmm0, xmm4 ; |q2 - q0|
+%ifdef X86_32_PICASM
+ SSE2_CmpltUB xmm5, xmm2, [esp] ; bDeltaQ2Q0 = |q2 - q0| < iBeta
+ mov esp, r4
+ pop r4
+%else
SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16] ; bDeltaQ2Q0 = |q2 - q0| < iBeta
+%endif
pand xmm5, xmm3 ; &= |p0 - q0| < (iAlpha >> 2) + 2
%ifdef X86_32
@@ -369,12 +461,26 @@
mov r2, esp
sub esp, 16
and esp, -16
+%ifdef X86_32_PICASM
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ sub esp, 16
movdqa [esp], xmm5
+ SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [esp+16]
+ movdqa xmm5, [esp]
+ neg r1
+ SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [esp+16]
+ mov esp, r2
+%else
+ movdqa [esp], xmm5
SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [WELS_DB1_16]
movdqa xmm5, [esp]
mov esp, r2
neg r1
SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [WELS_DB1_16]
+%endif
%else
movdqa xmm9, [WELS_DB1_16]
SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm8, xmm4, 1, xmm9
--- a/codec/common/x86/mc_chroma.asm
+++ b/codec/common/x86/mc_chroma.asm
@@ -119,7 +119,14 @@
paddw mm0, mm1
movq mm1,mm7
+%ifdef X86_32_PICASM
+ pcmpeqw mm7, mm7
+ psrlw mm7, 15
+ psllw mm7, 5
+ paddw mm0, mm7
+%else
paddw mm0, [h264_d0x20_mmx]
+%endif
psrlw mm0, 6
WELS_Zero mm7
@@ -194,7 +201,14 @@
paddw xmm0, xmm1
movdqa xmm1,xmm7
+%ifdef X86_32_PICASM
+ pcmpeqw xmm7, xmm7
+ psrlw xmm7, 15
+ psllw xmm7, 5
+ paddw xmm0, xmm7
+%else
paddw xmm0, [h264_d0x20_sse2]
+%endif
psrlw xmm0, 6
WELS_Zero xmm7
@@ -243,7 +257,13 @@
sub r2, r3 ;sub esi, edi
sub r2, r3
+%ifdef X86_32_PICASM
+ pcmpeqw xmm7, xmm7
+ psrlw xmm7, 15
+ psllw xmm7, 5
+%else
movdqa xmm7, [h264_d0x20_sse2]
+%endif
movdqu xmm0, [r0]
movdqa xmm1, xmm0
--- a/codec/common/x86/mc_luma.asm
+++ b/codec/common/x86/mc_luma.asm
@@ -112,8 +112,22 @@
SECTION .text
+%ifdef X86_32_PICASM
+%macro MOVEIMM_DW16 1
+ pcmpeqw %1, %1
+ psrlw %1, 15
+ psllw %1, 4
+%endmacro
+%macro MOVEIMM_DW32 1
+ pcmpeqw %1, %1
+ psrlw %1, 15
+ psllw %1, 5
+%endmacro
+
+%endif
+
;*******************************************************************************
; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
; int iSrcStride,
@@ -130,7 +144,11 @@
sub r0, 2
WELS_Zero mm7
+%ifdef X86_32_PICASM
+ MOVEIMM_DW16 mm6
+%else
movq mm6, [h264_w0x10_1]
+%endif
.height_loop:
movd mm0, [r0]
punpcklbw mm0, mm7
@@ -179,9 +197,14 @@
%macro FILTER_HV_W8 9
paddw %1, %6
+%ifdef X86_32_PICASM
+ MOVEIMM_DW16 %8
+ paddw %1, %8
+%else
+ paddw %1, [h264_w0x10_1]
+%endif
movdqa %8, %3
movdqa %7, %2
- paddw %1, [h264_w0x10_1]
paddw %8, %4
paddw %7, %5
psllw %8, 2
@@ -198,9 +221,14 @@
%macro FILTER_HV_W4 9
paddw %1, %6
+%ifdef X86_32_PICASM
+MOVEIMM_DW16 %8
+paddw %1, %8
+%else
+paddw %1, [h264_w0x10_1]
+%endif
movdqa %8, %3
movdqa %7, %2
-paddw %1, [h264_w0x10_1]
paddw %8, %4
paddw %7, %5
psllw %8, 2
@@ -291,7 +319,11 @@
lea r0, [r0-2] ;pSrc -= 2;
pxor xmm7, xmm7
+%ifdef X86_32_PICASM
+ MOVEIMM_DW16 xmm6
+%else
movdqa xmm6, [h264_w0x10_1]
+%endif
.y_loop:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
@@ -347,7 +379,11 @@
lea r0, [r0-2] ;pSrc -= 2;
pxor xmm7, xmm7
+%ifdef X86_32_PICASM
+ MOVEIMM_DW16 xmm6
+%else
movdqa xmm6, [h264_w0x10_1]
+%endif
.y_loop:
movq xmm0, [r0]
@@ -819,7 +855,12 @@
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
+%ifdef X86_32_PICASM
+ MOVEIMM_DW16 xmm6
+ paddw xmm0, xmm6
+%else
paddw xmm0, [h264_w0x10_1]
+%endif
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2], xmm0
@@ -836,7 +877,11 @@
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
+%ifdef X86_32_PICASM
+ paddw xmm2, xmm6
+%else
paddw xmm2, [h264_w0x10_1]
+%endif
psraw xmm2, 5
packuswb xmm2, xmm2
movq [r2+1], xmm2
@@ -873,7 +918,12 @@
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
+%ifdef X86_32_PICASM
+ MOVEIMM_DW16 xmm6
+ paddw xmm0, xmm6
+%else
paddw xmm0, [h264_w0x10_1]
+%endif
psraw xmm0, 5
packuswb xmm0, xmm0
movq [r2], xmm0
@@ -901,7 +951,12 @@
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
+%ifdef X86_32_PICASM
+ MOVEIMM_DW16 xmm6
+ paddw xmm0, xmm6
+%else
paddw xmm0, [h264_w0x10_1]
+%endif
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2+8], xmm0
@@ -919,7 +974,11 @@
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
+%ifdef X86_32_PICASM
+ paddw xmm2, xmm6
+%else
paddw xmm2, [h264_w0x10_1]
+%endif
psraw xmm2, 5
packuswb xmm2, xmm2
movq [r2+9], xmm2
@@ -976,7 +1035,12 @@
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
+%ifdef X86_32_PICASM
+MOVEIMM_DW16 xmm6
+paddw xmm0, xmm6
+%else
paddw xmm0, [h264_w0x10_1]
+%endif
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2], xmm0
@@ -993,7 +1057,11 @@
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
+%ifdef X86_32_PICASM
+paddw xmm2, xmm6
+%else
paddw xmm2, [h264_w0x10_1]
+%endif
psraw xmm2, 5
packuswb xmm2, xmm2
movd [r2+1], xmm2
@@ -1170,7 +1238,12 @@
psubw %1, %7
psraw %1, 2
paddw %8, %1
+%ifdef X86_32_PICASM
+ MOVEIMM_DW32 %7
+ paddw %8, %7
+%else
paddw %8, [h264_mc_hc_32]
+%endif
psraw %8, 6
packuswb %8, %8
movq %9, %8
@@ -1522,7 +1595,12 @@
psubw %1, %7
psraw %1, 2
paddw %8, %1
+%ifdef X86_32_PICASM
+MOVEIMM_DW32 %7
+paddw %8, %7
+%else
paddw %8, [h264_mc_hc_32]
+%endif
psraw %8, 6
packuswb %8, %8
movd %9, %8
@@ -1801,7 +1879,12 @@
movdqa %7, %3
pmaddubsw %7, %6
paddw %1, %7
+%ifdef X86_32_PICASM
+ MOVEIMM_DW16 %7
+ paddw %1, %7
+%else
paddw %1, [h264_w0x10_1]
+%endif
psraw %1, 5
%endmacro
@@ -1818,7 +1901,12 @@
movdqa %7, %4
pmaddubsw %7, %6
paddw %1, %7
+%ifdef X86_32_PICASM
+ MOVEIMM_DW16 %7
+ paddw %1, %7
+%else
paddw %1, [h264_w0x10_1]
+%endif
psraw %1, 5
%endmacro
@@ -1828,7 +1916,20 @@
pshufb %1, %2
pshufb %5, %3
pshufd %6, %1, 10110001b
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x14141414 ;db20_128
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ pmaddubsw %1, [esp]
+ mov esp, r0
+ pop r0
+%else
pmaddubsw %1, [db20_128]
+%endif
pmaddubsw %5, %4
pmaddubsw %6, %4
paddw %1, %5
@@ -1838,7 +1939,12 @@
; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
%macro SSSE3_FilterHorizontal_8px 6
SSSE3_FilterHorizontalbw_8px %1, %2, %3, %4, %5, %6
+%ifdef X86_32_PICASM
+ MOVEIMM_DW16 %5
+ paddw %1, %5
+%else
paddw %1, [h264_w0x10_1]
+%endif
psraw %1, 5
%endmacro
@@ -1853,7 +1959,20 @@
pshufb %7, %4
punpcklqdq %6, %7
pshufd %7, %1, 10110001b
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x14141414 ;db20_128
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ pmaddubsw %1, [esp]
+ mov esp, r0
+ pop r0
+%else
pmaddubsw %1, [db20_128]
+%endif
pmaddubsw %6, %5
pmaddubsw %7, %5
paddw %1, %6
@@ -1863,13 +1982,31 @@
; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
%macro SSSE3_FilterHorizontal_2x4px 7
SSSE3_FilterHorizontalbw_2x4px %1, %2, %3, %4, %5, %6, %7
+%ifdef X86_32_PICASM
+ MOVEIMM_DW16 %6
+ paddw %1, %6
+%else
paddw %1, [h264_w0x10_1]
+%endif
psraw %1, 5
%endmacro
; pixels=%1 -32768>>scale=%2 tmp=%3
%macro SSSE3_FilterHorizontalbw_2px 3
+%ifdef X86_32_PICASM
+ push r1
+ mov r1, esp
+ and esp, 0xfffffff0
+ push 0x0000fe0a
+ push 0xd8d80afe
+ push 0x0000fe0a
+ push 0xd8d80afe
+ pmaddubsw %1, [esp]
+ mov esp, r1
+ pop r1
+%else
pmaddubsw %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_128]
+%endif
pmaddwd %1, %2
pshufd %3, %1, 10110001b
paddd %1, %3
@@ -1877,8 +2014,33 @@
; pixels=%1 tmp=%2
%macro SSSE3_FilterHorizontal_2px 2
+%ifdef X86_32_PICASM
+ push r1
+ mov r1, esp
+ and esp, 0xfffffff0
+ push 0x0000fe0a
+ push 0xd8d80afe
+ push 0x0000fe0a
+ push 0xd8d80afe
+ pmaddubsw %1, [esp]
+ push 0xfc00fc00
+ push 0xfc00fc00
+ push 0xfc00fc00
+ push 0xfc00fc00
+ pmaddwd %1, [esp]
+ pshufd %2, %1, 10110001b
+ paddd %1, %2
+ push 0x00008000
+ push 0x00008000
+ push 0x00008000
+ push 0x00008000
+ paddd %1, [esp]
+ mov esp, r1
+ pop r1
+%else
SSSE3_FilterHorizontalbw_2px %1, [dwm1024_128], %2
paddd %1, [dd32768_128]
+%endif
%endmacro
; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
@@ -1893,8 +2055,14 @@
paddw %7, %4
paddw %1, %7
psraw %1, 2
+%ifdef X86_32_PICASM
+ paddw %1, %7
+ MOVEIMM_DW32 %7
+ paddw %1, %7
+%else
paddw %7, [h264_mc_hc_32]
paddw %1, %7
+%endif
psraw %1, 6
%endmacro
@@ -1931,6 +2099,23 @@
lea i_srcstride3, [3 * i_srcstride]
cmp i_width, 4
jg .width8or16
+
+%ifdef X86_32_PICASM
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ movdqu xmm6, [esp]
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ movdqu xmm7, [esp]
+ push 0x14141414 ;db20_128
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+%endif
movd xmm0, [p_src]
movd xmm4, [p_src + i_srcstride]
punpcklbw xmm0, xmm4
@@ -1949,8 +2134,14 @@
movd xmm3, [p_src]
punpcklbw xmm4, xmm3
punpcklqdq xmm2, xmm4
+%ifdef X86_32_PICASM
+ movdqu xmm5, [esp]
+ SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, xmm6, xmm5, xmm7, xmm4
+ add esp, 48
+%else
movdqa xmm5, [db20_128]
SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+%endif
packuswb xmm0, xmm0
movd [p_dst], xmm0
psrlq xmm0, 32
@@ -1961,7 +2152,11 @@
movd xmm0, [p_src + 2 * i_srcstride]
punpcklbw xmm4, xmm0
punpcklqdq xmm3, xmm4
+%ifdef X86_32_PICASM
+ SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, xmm6, xmm5, xmm7, xmm4
+%else
SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+%endif
packuswb xmm1, xmm1
movd [p_dst], xmm1
psrlq xmm1, 32
@@ -1972,7 +2167,11 @@
movd xmm4, [p_src + i_srcstride3]
punpcklbw xmm0, xmm4
jg .width4_height_ge8
+%ifdef X86_32_PICASM
+ SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, xmm6, xmm5, xmm7, xmm4
+%else
SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+%endif
packuswb xmm2, xmm2
movd [p_dst], xmm2
.width4_height_le5_done:
@@ -1987,7 +2186,11 @@
movd xmm1, [p_src]
punpcklbw xmm4, xmm1
punpcklqdq xmm0, xmm4
+%ifdef X86_32_PICASM
+ SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, xmm6, xmm5, xmm7, xmm4
+%else
SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+%endif
packuswb xmm2, xmm2
movd [p_dst], xmm2
psrlq xmm2, 32
@@ -1998,7 +2201,11 @@
movd xmm2, [p_src + 2 * i_srcstride]
punpcklbw xmm4, xmm2
punpcklqdq xmm1, xmm4
+%ifdef X86_32_PICASM
+ SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, xmm6, xmm5, xmm7, xmm4
+%else
SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+%endif
packuswb xmm3, xmm3
movd [p_dst], xmm3
psrlq xmm3, 32
@@ -2008,7 +2215,11 @@
lea p_dst, [p_dst + 2 * i_dststride]
movd xmm4, [p_src + i_srcstride3]
punpcklbw xmm2, xmm4
+%ifdef X86_32_PICASM
+ SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, xmm6, xmm5, xmm7, xmm4
+%else
SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+%endif
packuswb xmm0, xmm0
movd [p_dst], xmm0
.width4_height_ge8_done:
@@ -2027,6 +2238,31 @@
.xloop:
push p_src
push p_dst
+%ifdef X86_32_PICASM
+ push i_width
+ mov i_width, esp
+ and esp, 0xfffffff0
+ push 0xfb01fb01 ;[esp+64]maddubsw_p1m5_128
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0x14141414 ;[esp+48]db20_128
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x01fb01fb ;[esp+32]maddubsw_m5p1_128
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x14fb14fb ;[esp+16]maddubsw_m5p20_128
+ push 0x14fb14fb
+ push 0x14fb14fb
+ push 0x14fb14fb
+ push 0xfb14fb14 ;[esp] maddubsw_p20m5_128
+ push 0xfb14fb14
+ push 0xfb14fb14
+ push 0xfb14fb14
+%endif
test i_ycnt, 1
jnz .yloop_begin_even
movq xmm0, [p_src]
@@ -2040,7 +2276,11 @@
movq xmm5, [p_src + i_srcstride]
lea p_src, [p_src + 2 * i_srcstride]
punpcklbw xmm4, xmm5
+%ifdef X86_32_PICASM
+ SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [esp+64], [esp+48], [esp+32], xmm7
+%else
SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm7
+%endif
packuswb xmm0, xmm0
movlps [p_dst], xmm0
add p_dst, i_dststride
@@ -2057,20 +2297,36 @@
punpcklbw xmm4, xmm5
.yloop:
movq xmm6, [p_src]
+%ifdef X86_32_PICASM
+ SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [esp+16], [esp], xmm0, xmm7
+%else
SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm0, xmm7
+%endif
movq xmm7, [p_src + i_srcstride]
punpcklbw xmm6, xmm7
+%ifdef X86_32_PICASM
+ SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [esp+64], [esp+48], [esp+32], xmm0
+%else
SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm0
+%endif
packuswb xmm1, xmm2
movlps [p_dst], xmm1
movhps [p_dst + i_dststride], xmm1
lea p_dst, [p_dst + 2 * i_dststride]
movq xmm0, [p_src + 2 * i_srcstride]
+%ifdef X86_32_PICASM
+ SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [esp+16], [esp], xmm2, xmm1
+%else
SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm2, xmm1
+%endif
movq xmm1, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
punpcklbw xmm0, xmm1
+%ifdef X86_32_PICASM
+ SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [esp+64], [esp+48], [esp+32], xmm2
+%else
SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm2
+%endif
packuswb xmm3, xmm4
movlps [p_dst], xmm3
movhps [p_dst + i_dststride], xmm3
@@ -2078,20 +2334,36 @@
jle .yloop_exit
lea p_dst, [p_dst + 2 * i_dststride]
movq xmm2, [p_src]
+%ifdef X86_32_PICASM
+ SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [esp+16], [esp], xmm4, xmm3
+%else
SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm4, xmm3
+%endif
movq xmm3, [p_src + i_srcstride]
punpcklbw xmm2, xmm3
+%ifdef X86_32_PICASM
+ SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [esp+64], [esp+48], [esp+32], xmm4
+%else
SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm4
+%endif
packuswb xmm5, xmm6
movlps [p_dst], xmm5
movhps [p_dst + i_dststride], xmm5
lea p_dst, [p_dst + 2 * i_dststride]
movq xmm4, [p_src + 2 * i_srcstride]
+%ifdef X86_32_PICASM
+ SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [esp+16], [esp], xmm6, xmm5
+%else
SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm6, xmm5
+%endif
movq xmm5, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
punpcklbw xmm4, xmm5
+%ifdef X86_32_PICASM
+ SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [esp+64], [esp+48], [esp+32], xmm6
+%else
SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm6
+%endif
packuswb xmm7, xmm0
movlps [p_dst], xmm7
movhps [p_dst + i_dststride], xmm7
@@ -2099,6 +2371,10 @@
sub i_ycnt, 8
jg .yloop
.yloop_exit:
+%ifdef X86_32_PICASM
+ mov esp, i_width
+ pop i_width
+%endif
pop p_dst
pop p_src
sub i_width, 8
@@ -2148,9 +2424,28 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+%ifdef X86_32_PICASM
+ push 0x090a0809 ;shufb_32435465768798A9
+ push 0x07080607
+ push 0x05060405
+ push 0x03040203
+ movdqu xmm4, [esp]
+ push 0x0c0b0b0a
+ push 0x06050504
+ push 0x08070706
+ push 0x02010100
+ movdqu xmm5, [esp]
+ push 0x01fb01fb
+ push 0xfb01fb01
+ push 0x01fb01fb
+ push 0xfb01fb01
+ movdqu xmm6, [esp]
+ add esp, 48
+%else
movdqa xmm4, [shufb_32435465768798A9]
movdqa xmm5, [shufb_011267784556ABBC]
movdqa xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
cmp i_width, 8
je .width8_yloop
jg .width16_yloop
@@ -2229,9 +2524,28 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+%ifdef X86_32_PICASM
+ push 0x090a0809 ;shufb_32435465768798A9
+ push 0x07080607
+ push 0x05060405
+ push 0x03040203
+ movdqu xmm5, [esp]
+ push 0x0c0b0b0a
+ push 0x06050504
+ push 0x08070706
+ push 0x02010100
+ movdqu xmm6, [esp]
+ push 0x01fb01fb
+ push 0xfb01fb01
+ push 0x01fb01fb
+ push 0xfb01fb01
+ movdqu xmm7, [esp]
+ add esp, 48
+%else
movdqa xmm5, [shufb_32435465768798A9]
movdqa xmm6, [shufb_011267784556ABBC]
movdqa xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
cmp i_width, 9
je .width9_yloop
jg .width17_yloop
@@ -2329,9 +2643,28 @@
SIGN_EXTENSION r3, r3d
sub p_src, i_srcstride
sub p_src, i_srcstride
+%ifdef X86_32_PICASM
+ push 0x090a0809 ;shufb_32435465768798A9
+ push 0x07080607
+ push 0x05060405
+ push 0x03040203
+ movdqu xmm4, [esp]
+ push 0x0c0b0b0a
+ push 0x06050504
+ push 0x08070706
+ push 0x02010100
+ movdqu xmm5, [esp]
+ push 0x01fb01fb
+ push 0xfb01fb01
+ push 0x01fb01fb
+ push 0xfb01fb01
+ movdqu xmm6, [esp]
+ add esp, 48
+%else
movdqa xmm4, [shufb_32435465768798A9]
movdqa xmm5, [shufb_011267784556ABBC]
movdqa xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
sub i_height, 1
.yloop:
movdqu xmm0, [p_src - 2]
@@ -2443,9 +2776,28 @@
SIGN_EXTENSION r4, r4d
sub p_src, i_srcstride
sub p_src, i_srcstride
+%ifdef X86_32_PICASM
+ push 0x090a0809 ;shufb_32435465768798A9
+ push 0x07080607
+ push 0x05060405
+ push 0x03040203
+ movdqu xmm4, [esp]
+ push 0x0c0b0b0a
+ push 0x06050504
+ push 0x08070706
+ push 0x02010100
+ movdqu xmm5, [esp]
+ push 0x01fb01fb
+ push 0xfb01fb01
+ push 0x01fb01fb
+ push 0xfb01fb01
+ movdqu xmm6, [esp]
+ add esp, 48
+%else
movdqa xmm4, [shufb_32435465768798A9]
movdqa xmm5, [shufb_011267784556ABBC]
movdqa xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
sub i_height, 1
.yloop:
movdqu xmm0, [p_src - 2]
@@ -2623,9 +2975,28 @@
sub p_src, i_srcstride
pcmpeqw xmm4, xmm4
psllw xmm4, 15 ; dw -32768
+%ifdef X86_32_PICASM
+ push 0x090a0809 ;shufb_32435465768798A9
+ push 0x07080607
+ push 0x05060405
+ push 0x03040203
+ movdqu xmm5, [esp]
+ push 0x0c0b0b0a
+ push 0x06050504
+ push 0x08070706
+ push 0x02010100
+ movdqu xmm6, [esp]
+ push 0x01fb01fb
+ push 0xfb01fb01
+ push 0x01fb01fb
+ push 0xfb01fb01
+ movdqu xmm7, [esp]
+ add esp, 48
+%else
movdqa xmm5, [shufb_32435465768798A9]
movdqa xmm6, [shufb_011267784556ABBC]
movdqa xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
cmp i_width, 9
jne .width17_yloop
@@ -2909,7 +3280,24 @@
vpshufb %5, %1, %3
vpshufb %1, %1, %2
vpshufd %6, %1, 10110001b
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xffffffe0
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ vpmaddubsw %1, %1, [esp]
+ mov esp, r0
+ pop r0
+%else
vpmaddubsw %1, %1, [db20_256]
+%endif
vpmaddubsw %5, %5, %4
vpmaddubsw %6, %6, %4
vpaddw %1, %1, %5
@@ -2919,7 +3307,14 @@
; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 db20=%4 tmp=%5,%6
%macro AVX2_FilterHorizontal_16px 6
AVX2_FilterHorizontalbw_16px %1, %2, %3, %4, %5, %6
+%ifdef X86_32_PICASM
+ vpcmpeqw %6, %6, %6
+ vpsrlw %6, %6, 15
+ vpsllw %6, %6, 4
+ vpaddw %1, %1, %6
+%else
vpaddw %1, %1, [h264_w0x10_256]
+%endif
vpsraw %1, %1, 5
%endmacro
@@ -2932,7 +3327,24 @@
vpunpcklqdq %1, %1, %2
vpunpcklqdq %6, %6, %7
vpshufd %7, %1, 10110001b
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xffffffe0
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ vpmaddubsw %1, %1, [esp]
+ mov esp, r0
+ pop r0
+%else
vpmaddubsw %1, %1, [db20_256]
+%endif
vpmaddubsw %6, %6, %5
vpmaddubsw %7, %7, %5
vpaddw %1, %1, %6
@@ -2942,7 +3354,14 @@
; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 db20=%5 tmp=%6,%7
%macro AVX2_FilterHorizontal_4x4px 7
AVX2_FilterHorizontalbw_4x4px %1, %2, %3, %4, %5, %6, %7
+%ifdef X86_32_PICASM
+ vpcmpeqw %7, %7, %7
+ vpsrlw %7, %7, 15
+ vpsllw %7, %7, 4
+ vpaddw %1, %1, %7
+%else
vpaddw %1, %1, [h264_w0x10_256]
+%endif
vpsraw %1, %1, 5
%endmacro
@@ -2956,8 +3375,45 @@
; pixels=%1 tmp=%2
%macro AVX2_FilterHorizontal_4px 2
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xffffffe0
+ push 0x0000fe0a ;maddubsw_m2p10_m40m40_p10m2_p0p0_256
+ push 0xd8d80afe
+ push 0x0000fe0a
+ push 0xd8d80afe
+ push 0x0000fe0a
+ push 0xd8d80afe
+ push 0x0000fe0a
+ push 0xd8d80afe
+ push 0xfc00fc00 ;dwm1024_256
+ push 0xfc00fc00
+ push 0xfc00fc00
+ push 0xfc00fc00
+ push 0xfc00fc00
+ push 0xfc00fc00
+ push 0xfc00fc00
+ push 0xfc00fc00
+ push 0x00008000 ;dd32768_256
+ push 0x00008000
+ push 0x00008000
+ push 0x00008000
+ push 0x00008000
+ push 0x00008000
+ push 0x00008000
+ push 0x00008000
+ vpmaddubsw %1, %1, [esp+64]
+ vpmaddwd %1, %1, [esp+32]
+ vpshufd %2, %1, 10110001b
+ vpaddd %1, %1, %2
+ vpaddd %1, %1, [esp]
+ mov esp, r0
+ pop r0
+%else
AVX2_FilterHorizontalbw_4px %1, [dwm1024_256], %2
vpaddd %1, %1, [dd32768_256]
+%endif
%endmacro
; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7
@@ -2967,7 +3423,14 @@
vpaddw %1, %1, %7
vpmaddubsw %7, %3, %6
vpaddw %1, %1, %7
+%ifdef X86_32_PICASM
+ vpcmpeqw %7, %7, %7
+ vpsrlw %7, %7, 15
+ vpsllw %7, %7, 4
+ vpaddw %1, %1, %7
+%else
vpaddw %1, %1, [h264_w0x10_256]
+%endif
vpsraw %1, %1, 5
%endmacro
@@ -2981,7 +3444,14 @@
vpaddw %1, %1, %7
vpmaddubsw %7, %4, %6
vpaddw %1, %1, %7
+%ifdef X86_32_PICASM
+ vpcmpeqw %7, %7, %7
+ vpsrlw %7, %7, 15
+ vpsllw %7, %7, 4
+ vpaddw %1, %1, %7
+%else
vpaddw %1, %1, [h264_w0x10_256]
+%endif
vpsraw %1, %1, 5
%endmacro
@@ -2995,7 +3465,24 @@
vpaddw %7, %3, %4
vpaddw %1, %1, %7
vpsraw %1, %1, 2
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xffffffe0
+ push 0x00200020
+ push 0x00200020
+ push 0x00200020
+ push 0x00200020
+ push 0x00200020
+ push 0x00200020
+ push 0x00200020
+ push 0x00200020
+ vpaddw %7, %7, [esp]
+ mov esp, r0
+ pop r0
+%else
vpaddw %7, %7, [dw32_256]
+%endif
vpaddw %1, %1, %7
vpsraw %1, %1, 6
%endmacro
@@ -3035,6 +3522,32 @@
je .width8
jg .width16
; .width4:
+%ifdef X86_32_PICASM
+ push i_width
+ mov i_width, esp
+ and esp, 0xffffffe0
+ sub esp, 16
+ push 0x14141414 ;db20_128
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0xfb01fb01 ;maddubsw_p1m5_256
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0x01fb01fb ;maddubsw_m5p1_256
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+%endif
vmovd xmm0, [p_src]
vpbroadcastd xmm5, [p_src + i_srcstride]
vpunpcklbw xmm0, xmm0, xmm5
@@ -3061,8 +3574,13 @@
vpunpcklbw ymm5, ymm5, ymm4
vpblendd ymm3, ymm3, ymm5, 11001100b
vpblendd ymm2, ymm2, ymm3, 11110000b
+%ifdef X86_32_PICASM
+ vbroadcasti128 ymm6, [esp+64]
+ AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [esp+32], ymm6, [esp], ymm5
+%else
vbroadcasti128 ymm6, [db20_128]
AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5
+%endif
vpackuswb ymm0, ymm0, ymm0
vmovd [p_dst], xmm0
vpsrlq xmm5, xmm0, 32
@@ -3078,7 +3596,11 @@
vpbroadcastd ymm5, [p_src + i_srcstride3]
vpunpcklbw ymm4, ymm4, ymm5
jg .width4_height_ge8
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [esp+32], xmm6, [esp], xmm5
+%else
AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5
+%endif
vpackuswb xmm2, xmm2, xmm2
vmovd [p_dst], xmm2
jmp .width4_done
@@ -3094,7 +3616,11 @@
vpunpcklbw ymm5, ymm5, ymm0
vpblendd ymm1, ymm1, ymm5, 11001100b
vpblendd ymm4, ymm4, ymm1, 11110000b
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [esp+32], ymm6, [esp], ymm5
+%else
AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5
+%endif
vpackuswb ymm2, ymm2, ymm2
vmovd [p_dst], xmm2
vpsrlq xmm5, xmm2, 32
@@ -3109,10 +3635,18 @@
lea p_dst, [p_dst + 2 * i_dststride]
vmovd xmm5, [p_src + i_srcstride3]
vpunpcklbw xmm0, xmm0, xmm5
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [esp+32], xmm6, [esp], xmm5
+%else
AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5
+%endif
vpackuswb xmm4, xmm4, xmm4
vmovd [p_dst], xmm4
.width4_done:
+%ifdef X86_32_PICASM
+ mov esp, i_width
+ pop i_width
+%endif
vzeroupper
POP_XMM
LOAD_6_PARA_POP
@@ -3122,6 +3656,32 @@
ret
.width8:
+%ifdef X86_32_PICASM
+ push i_width
+ mov i_width, esp
+ and esp, 0xffffffe0
+ sub esp, 16
+ push 0x14141414 ;db20_128
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0xfb01fb01 ;maddubsw_p1m5_256
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0x01fb01fb ;maddubsw_m5p1_256
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+%endif
sub i_height, 1
vmovq xmm0, [p_src]
vmovq xmm4, [p_src + i_srcstride]
@@ -3141,8 +3701,13 @@
vmovq xmm3, [p_src + 2 * i_srcstride]
vpunpcklbw xmm4, xmm4, xmm3
vinserti128 ymm2, ymm2, xmm4, 1
+%ifdef X86_32_PICASM
+ vbroadcasti128 ymm5, [esp+64]
+ AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [esp+32], ymm5, [esp], ymm4
+%else
vbroadcasti128 ymm5, [db20_128]
AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
+%endif
vmovq xmm4, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vpunpcklbw xmm3, xmm3, xmm4
@@ -3149,7 +3714,11 @@
vmovq xmm6, [p_src]
vpunpcklbw xmm4, xmm4, xmm6
vinserti128 ymm3, ymm3, xmm4, 1
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [esp+32], ymm5, [esp], ymm4
+%else
AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
+%endif
vpackuswb ymm0, ymm0, ymm1
vmovlps [p_dst], xmm0
vextracti128 xmm1, ymm0, 1
@@ -3163,7 +3732,11 @@
vmovq xmm4, [p_src + i_srcstride]
vpunpcklbw xmm0, xmm6, xmm4
jg .width8_height_ge8
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [esp+32], xmm5, [esp], xmm4
+%else
AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4
+%endif
vpackuswb xmm2, xmm2, xmm2
vmovlps [p_dst], xmm2
jmp .width8_done
@@ -3171,7 +3744,11 @@
vmovq xmm1, [p_src + 2 * i_srcstride]
vpunpcklbw xmm4, xmm4, xmm1
vinserti128 ymm0, ymm0, xmm4, 1
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [esp+32], ymm5, [esp], ymm4
+%else
AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
+%endif
vmovq xmm4, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vpunpcklbw xmm1, xmm1, xmm4
@@ -3178,7 +3755,11 @@
vmovq xmm6, [p_src]
vpunpcklbw xmm4, xmm4, xmm6
vinserti128 ymm1, ymm1, xmm4, 1
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [esp+32], ymm5, [esp], ymm4
+%else
AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
+%endif
vpackuswb ymm2, ymm2, ymm3
vmovlps [p_dst], xmm2
vextracti128 xmm3, ymm2, 1
@@ -3192,10 +3773,18 @@
jl .width8_done
vmovq xmm4, [p_src + i_srcstride]
vpunpcklbw xmm2, xmm6, xmm4
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [esp+32], xmm5, [esp], xmm4
+%else
AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4
+%endif
vpackuswb xmm0, xmm0, xmm0
vmovlps [p_dst], xmm0
.width8_done:
+%ifdef X86_32_PICASM
+ mov esp, i_width
+ pop i_width
+%endif
vzeroupper
POP_XMM
LOAD_6_PARA_POP
@@ -3205,6 +3794,51 @@
ret
.width16:
+%ifdef X86_32_PICASM
+ push i_width
+ mov i_width, esp
+ and esp, 0xffffffe0
+ push 0x14141414 ;db20_128
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0x14141414
+ push 0xfb01fb01 ;maddubsw_p1m5_256
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0xfb01fb01
+ push 0x01fb01fb ;maddubsw_m5p1_256
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x01fb01fb
+ push 0x14fb14fb ;maddubsw_m5p20_256
+ push 0x14fb14fb
+ push 0x14fb14fb
+ push 0x14fb14fb
+ push 0x14fb14fb
+ push 0x14fb14fb
+ push 0x14fb14fb
+ push 0x14fb14fb
+ push 0xfb14fb14 ;maddubsw_p20m5_256
+ push 0xfb14fb14
+ push 0xfb14fb14
+ push 0xfb14fb14
+ push 0xfb14fb14
+ push 0xfb14fb14
+ push 0xfb14fb14
+ push 0xfb14fb14
+%endif
sub i_height, 1
test i_height, 1
jnz .width16_yloop_begin_even
@@ -3231,7 +3865,11 @@
lea p_src, [p_src + 2 * i_srcstride]
vpblendd ymm5, ymm5, ymm6, 11110000b
vpunpcklbw ymm4, ymm4, ymm5
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [esp+96], [esp+128], [esp+64], ymm7
+%else
AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm7
+%endif
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 1000b
vmovdqa [p_dst], xmm0
@@ -3261,12 +3899,20 @@
vmovq xmm6, [p_src]
vpbroadcastq ymm7, [p_src + 8]
vpblendd ymm6, ymm6, ymm7, 11110000b
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [esp+32], [esp], ymm0, ymm7
+%else
AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm0, ymm7
+%endif
vmovq xmm7, [p_src + i_srcstride]
vpbroadcastq ymm0, [p_src + i_srcstride + 8]
vpblendd ymm7, ymm7, ymm0, 11110000b
vpunpcklbw ymm6, ymm6, ymm7
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [esp+96], [esp+128], [esp+64], ymm0
+%else
AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm0
+%endif
vpackuswb ymm1, ymm1, ymm2
vpermq ymm1, ymm1, 11011000b
vmovdqa [p_dst], xmm1
@@ -3275,13 +3921,21 @@
vmovq xmm0, [p_src + 2 * i_srcstride]
vpbroadcastq ymm1, [p_src + 2 * i_srcstride + 8]
vpblendd ymm0, ymm0, ymm1, 11110000b
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [esp+32], [esp], ymm2, ymm1
+%else
AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm2, ymm1
+%endif
vmovq xmm1, [p_src + i_srcstride3]
vpbroadcastq ymm2, [p_src + i_srcstride3 + 8]
lea p_src, [p_src + 4 * i_srcstride]
vpblendd ymm1, ymm1, ymm2, 11110000b
vpunpcklbw ymm0, ymm0, ymm1
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [esp+96], [esp+128], [esp+64], ymm2
+%else
AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm2
+%endif
vpackuswb ymm3, ymm3, ymm4
vpermq ymm3, ymm3, 11011000b
vmovdqa [p_dst], xmm3
@@ -3290,12 +3944,20 @@
vmovq xmm2, [p_src]
vpbroadcastq ymm3, [p_src + 8]
vpblendd ymm2, ymm2, ymm3, 11110000b
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [esp+32], [esp], ymm4, ymm3
+%else
AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm4, ymm3
+%endif
vmovq xmm3, [p_src + i_srcstride]
vpbroadcastq ymm4, [p_src + i_srcstride + 8]
vpblendd ymm3, ymm3, ymm4, 11110000b
vpunpcklbw ymm2, ymm2, ymm3
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [esp+96], [esp+128], [esp+64], ymm4
+%else
AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm4
+%endif
vpackuswb ymm5, ymm5, ymm6
vpermq ymm5, ymm5, 11011000b
vmovdqa [p_dst], xmm5
@@ -3304,13 +3966,21 @@
vmovq xmm4, [p_src + 2 * i_srcstride]
vpbroadcastq ymm5, [p_src + 2 * i_srcstride + 8]
vpblendd ymm4, ymm4, ymm5, 11110000b
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [esp+32], [esp], ymm6, ymm5
+%else
AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm6, ymm5
+%endif
vmovq xmm5, [p_src + i_srcstride3]
vpbroadcastq ymm6, [p_src + i_srcstride3 + 8]
lea p_src, [p_src + 4 * i_srcstride]
vpblendd ymm5, ymm5, ymm6, 11110000b
vpunpcklbw ymm4, ymm4, ymm5
+%ifdef X86_32_PICASM
+ AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [esp+96], [esp+128], [esp+64], ymm6
+%else
AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm6
+%endif
vpackuswb ymm7, ymm7, ymm0
vpermq ymm7, ymm7, 11011000b
vmovdqa [p_dst], xmm7
@@ -3318,6 +3988,10 @@
lea p_dst, [p_dst + 2 * i_dststride]
sub i_height, 8
jg .width16_yloop
+%ifdef X86_32_PICASM
+ mov esp, i_width
+ pop i_width
+%endif
vzeroupper
POP_XMM
LOAD_6_PARA_POP
@@ -3358,9 +4032,32 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+%ifdef X86_32_PICASM
+ push r1
+ mov r1, esp
+ and esp, 0xfffffff0
+ push 0x090a0809 ;shufb_32435465768798A9
+ push 0x07080607
+ push 0x05060405
+ push 0x03040203
+ vbroadcasti128 ymm4, [esp]
+ push 0x0c0b0b0a
+ push 0x06050504
+ push 0x08070706
+ push 0x02010100
+ vbroadcasti128 ymm5, [esp]
+ push 0x01fb01fb
+ push 0xfb01fb01
+ push 0x01fb01fb
+ push 0xfb01fb01
+ vbroadcasti128 ymm6, [esp]
+ mov esp, r1
+ pop r1
+%else
vbroadcasti128 ymm4, [shufb_32435465768798A9]
vbroadcasti128 ymm5, [shufb_011267784556ABBC]
vbroadcasti128 ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
cmp i_width, 8
je .width8
jg .width16_yloop
@@ -3464,9 +4161,32 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
+%ifdef X86_32_PICASM
+ push r1
+ mov r1, esp
+ and esp, 0xfffffff0
+ push 0x090a0809 ;shufb_32435465768798A9
+ push 0x07080607
+ push 0x05060405
+ push 0x03040203
+ vbroadcasti128 ymm5, [esp]
+ push 0x0c0b0b0a
+ push 0x06050504
+ push 0x08070706
+ push 0x02010100
+ vbroadcasti128 ymm6, [esp]
+ push 0x01fb01fb
+ push 0xfb01fb01
+ push 0x01fb01fb
+ push 0xfb01fb01
+ vbroadcasti128 ymm7, [esp]
+ mov esp, r1
+ pop r1
+%else
vbroadcasti128 ymm5, [shufb_32435465768798A9]
vbroadcasti128 ymm6, [shufb_011267784556ABBC]
vbroadcasti128 ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
cmp i_width, 9
je .width9
jg .width17
@@ -3607,9 +4327,32 @@
sub p_src, i_srcstride
sub p_src, i_srcstride
lea i_srcstride3, [3 * i_srcstride]
+%ifdef X86_32_PICASM
+ push r1
+ mov r1, esp
+ and esp, 0xfffffff0
+ push 0x090a0809 ;shufb_32435465768798A9
+ push 0x07080607
+ push 0x05060405
+ push 0x03040203
+ vbroadcasti128 ymm4, [esp]
+ push 0x0c0b0b0a
+ push 0x06050504
+ push 0x08070706
+ push 0x02010100
+ vbroadcasti128 ymm5, [esp]
+ push 0x01fb01fb
+ push 0xfb01fb01
+ push 0x01fb01fb
+ push 0xfb01fb01
+ vbroadcasti128 ymm6, [esp]
+ mov esp, r1
+ pop r1
+%else
vbroadcasti128 ymm4, [shufb_32435465768798A9]
vbroadcasti128 ymm5, [shufb_011267784556ABBC]
vbroadcasti128 ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
sub i_height, 3
.yloop:
vmovdqu xmm0, [p_src - 2]
@@ -3732,9 +4475,32 @@
SIGN_EXTENSION r3, r3d
sub p_src, i_srcstride
sub p_src, i_srcstride
+%ifdef X86_32_PICASM
+ push r1
+ mov r1, esp
+ and esp, 0xfffffff0
+ push 0x090a0809 ;shufb_32435465768798A9
+ push 0x07080607
+ push 0x05060405
+ push 0x03040203
+ vbroadcasti128 ymm3, [esp]
+ push 0x0c0b0b0a
+ push 0x06050504
+ push 0x08070706
+ push 0x02010100
+ vbroadcasti128 ymm4, [esp]
+ push 0x01fb01fb
+ push 0xfb01fb01
+ push 0x01fb01fb
+ push 0xfb01fb01
+ vbroadcasti128 ymm5, [esp]
+ mov esp, r1
+ pop r1
+%else
vbroadcasti128 ymm3, [shufb_32435465768798A9]
vbroadcasti128 ymm4, [shufb_011267784556ABBC]
vbroadcasti128 ymm5, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
sub i_height, 1
.yloop:
vmovdqu xmm0, [p_src - 2]
@@ -3953,9 +4719,32 @@
SIGN_EXTENSION r3, r3d
sub p_src, i_srcstride
sub p_src, i_srcstride
+%ifdef X86_32_PICASM
+ push r1
+ mov r1, esp
+ and esp, 0xfffffff0
+ push 0x090a0809 ;shufb_32435465768798A9
+ push 0x07080607
+ push 0x05060405
+ push 0x03040203
+ vbroadcasti128 ymm4, [esp]
+ push 0x0c0b0b0a
+ push 0x06050504
+ push 0x08070706
+ push 0x02010100
+ vbroadcasti128 ymm5, [esp]
+ push 0x01fb01fb
+ push 0xfb01fb01
+ push 0x01fb01fb
+ push 0xfb01fb01
+ vbroadcasti128 ymm6, [esp]
+ mov esp, r1
+ pop r1
+%else
vbroadcasti128 ymm4, [shufb_32435465768798A9]
vbroadcasti128 ymm5, [shufb_011267784556ABBC]
vbroadcasti128 ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
sub i_height, 1
.yloop:
vmovdqu xmm0, [p_src - 2]
@@ -4114,9 +4903,47 @@
sub p_src, i_srcstride
sub p_src, i_srcstride
lea i_srcstride3, [3 * i_srcstride]
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x090a0809 ;shufb_32435465768798A9
+ push 0x07080607
+ push 0x05060405
+ push 0x03040203
+ vbroadcasti128 ymm5, [esp]
+ push 0x0c0b0b0a
+ push 0x06050504
+ push 0x08070706
+ push 0x02010100
+ vbroadcasti128 ymm6, [esp]
+ push 0x01fb01fb
+ push 0xfb01fb01
+ push 0x01fb01fb
+ push 0xfb01fb01
+ vbroadcasti128 ymm7, [esp]
+ sub esp, 16
+ push 0x0000fe0a ;maddubsw_m2p10_m40m40_p10m2_p0p0_256
+ push 0xd8d80afe
+ push 0x0000fe0a
+ push 0xd8d80afe
+ push 0x0000fe0a
+ push 0xd8d80afe
+ push 0x0000fe0a
+ push 0xd8d80afe
+ push 0x80008000 ;dwm32768_256
+ push 0x80008000
+ push 0x80008000
+ push 0x80008000
+ push 0x80008000
+ push 0x80008000
+ push 0x80008000
+ push 0x80008000
+%else
vbroadcasti128 ymm5, [shufb_32435465768798A9]
vbroadcasti128 ymm6, [shufb_011267784556ABBC]
vbroadcasti128 ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
sub i_height, 3
.yloop:
vmovdqu xmm0, [p_src - 2]
@@ -4134,7 +4961,14 @@
vinserti128 ymm0, ymm0, [p_src + i_srcstride3 + 6], 1
lea p_src, [p_src + 4 * i_srcstride]
vpunpckhqdq ymm4, ymm4, ymm0
+%ifdef X86_32_PICASM
+ vpmaddubsw ymm4, ymm4, [esp+32]
+ vpmaddwd ymm4, ymm4, [esp]
+ vpshufd ymm2, ymm4, 10110001b
+ vpaddd ymm4, ymm4, ymm2
+%else
AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2
+%endif
vmovlps [p_dst + 26], xmm4
vmovdqa [p_dst + 16], xmm3
vextracti128 xmm2, ymm4, 1
@@ -4157,7 +4991,16 @@
vmovdqu xmm3, [p_src + i_srcstride - 2]
vinserti128 ymm3, ymm3, [p_src + i_srcstride + 6], 1
vpunpckhqdq ymm4, ymm0, ymm3
+%ifdef X86_32_PICASM
+ vpmaddubsw ymm4, ymm4, [esp+32]
+ vpmaddwd ymm4, ymm4, [esp]
+ vpshufd ymm2, ymm4, 10110001b
+ vpaddd ymm4, ymm4, ymm2
+ mov esp, r5
+ pop r5
+%else
AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2
+%endif
AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
vextracti128 xmm4, ymm4, 1
--- a/codec/common/x86/satd_sad.asm
+++ b/codec/common/x86/satd_sad.asm
@@ -773,9 +773,28 @@
%endif
pxor xmm4, xmm4
+%ifdef X86_32_PICASM
+ push 0xff01ff01
+ push 0xff01ff01
+ push 0x01010101
+ push 0x01010101
+ movdqu xmm5, [esp]
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ movdqu xmm6, [esp]
+ push 0x00010001
+ push 0x00010001
+ push 0x00010001
+ push 0x00010001
+ movdqu xmm7, [esp]
+ add esp, 48
+%else
movdqa xmm5, [HSumSubDB1]
movdqa xmm6, [HSumSubDW1]
movdqa xmm7, [PDW1]
+%endif
sub r0, r1
movdqu xmm0, [r0]
movhlps xmm1, xmm0
@@ -974,7 +993,88 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
loop_chroma_satdx3:
+%ifdef X86_32_PICASM
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0xff01ff01
+ push 0xff01ff01
+ push 0x01010101
+ push 0x01010101
+ movdqa xmm5, [esp]
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ movdqa xmm6, [esp]
+ push 0x00010001
+ push 0x00010001
+ push 0x00010001
+ push 0x00010001
+ movdqa xmm7, [esp]
+ mov esp, r0
+ mov r0, [esp + push_num*4 + 4]
+
+ sub r0, r1
+ movq xmm0, [r0]
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+ movdqa [r6], xmm0 ;V
+ add r0, r1
+ pinsrb xmm0, byte[r0-1], 0
+ pinsrb xmm0, byte[r0+r1-1], 1
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 2
+ pinsrb xmm0, byte[r0+r1-1], 3
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 4
+ pinsrb xmm0, byte[r0+r1-1], 5
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 6
+ pinsrb xmm0, byte[r0+r1-1], 7
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+;movdqa [r6+16], xmm0 ;H
+;(sum+2)>>2
+ mov DWORD [r6+16], 0x0002
+ mov DWORD [r6+20], 0x0000
+ mov DWORD [r6+24], 0x0002
+ mov DWORD [r6+28], 0x0000
+ movdqa xmm6, [r6+16]
+ movdqa [r6+16], xmm0 ;H
+
+ movdqa xmm5, xmm4
+ punpckhqdq xmm5, xmm1
+ paddd xmm5, xmm6
+ psrld xmm5, 2
+;(sum1+sum2+4)>>3
+ paddd xmm6, xmm6
+ paddd xmm4, xmm1
+ paddd xmm4, xmm6
+ psrld xmm4, 3
+;satd *16
+ pslld xmm5, 4
+ pslld xmm4, 4
+;temp satd
+ movdqa xmm6, xmm4
+ punpcklqdq xmm4, xmm5
+ psllq xmm4, 32
+ psrlq xmm4, 32
+ movdqa [r6+32], xmm4
+ punpckhqdq xmm5, xmm6
+ psllq xmm5, 32
+ psrlq xmm5, 32
+ movdqa [r6+48], xmm5
+
+ pxor xmm4, xmm4 ;V
+ pxor xmm5, xmm5 ;H
+ pxor xmm6, xmm6 ;DC
+ mov r0, 0
+ SSE41_ChromaGetX38x4Satd r0, 0
+ inc r0
+ SSE41_ChromaGetX38x4Satd r0, 0
+%else
SSE41_ChromaGetX38x8Satd
+%endif
SSEReg2MMX xmm4, mm0,mm1
SSEReg2MMX xmm5, mm2,mm3
SSEReg2MMX xmm6, mm5,mm6
@@ -981,7 +1081,89 @@
mov r0, arg8
mov r2, arg9
+%ifdef X86_32_PICASM
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0xff01ff01
+ push 0xff01ff01
+ push 0x01010101
+ push 0x01010101
+ movdqa xmm5, [esp]
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ push 0xffff0001
+ movdqa xmm6, [esp]
+ push 0x00010001
+ push 0x00010001
+ push 0x00010001
+ push 0x00010001
+ movdqa xmm7, [esp]
+ mov esp, r0
+ mov r0, arg8
+
+ sub r0, r1
+ movq xmm0, [r0]
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+ movdqa [r6], xmm0 ;V
+ add r0, r1
+ pinsrb xmm0, byte[r0-1], 0
+ pinsrb xmm0, byte[r0+r1-1], 1
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 2
+ pinsrb xmm0, byte[r0+r1-1], 3
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 4
+ pinsrb xmm0, byte[r0+r1-1], 5
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 6
+ pinsrb xmm0, byte[r0+r1-1], 7
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+ ;movdqa [r6+16], xmm0 ;H
+;(sum+2)>>2
+
+ mov DWORD [r6+16], 0x0002
+ mov DWORD [r6+20], 0x0000
+ mov DWORD [r6+24], 0x0002
+ mov DWORD [r6+28], 0x0000
+ movdqa xmm6, [r6+16]
+ movdqa [r6+16], xmm0 ;H
+
+ movdqa xmm5, xmm4
+ punpckhqdq xmm5, xmm1
+ paddd xmm5, xmm6
+ psrld xmm5, 2
+;(sum1+sum2+4)>>3
+ paddd xmm6, xmm6
+ paddd xmm4, xmm1
+ paddd xmm4, xmm6
+ psrld xmm4, 3
+;satd *16
+ pslld xmm5, 4
+ pslld xmm4, 4
+;temp satd
+ movdqa xmm6, xmm4
+ punpcklqdq xmm4, xmm5
+ psllq xmm4, 32
+ psrlq xmm4, 32
+ movdqa [r6+32], xmm4
+ punpckhqdq xmm5, xmm6
+ psllq xmm5, 32
+ psrlq xmm5, 32
+ movdqa [r6+48], xmm5
+
+ pxor xmm4, xmm4 ;V
+ pxor xmm5, xmm5 ;H
+ pxor xmm6, xmm6 ;DC
+ mov r0, 0
+ SSE41_ChromaGetX38x4Satd r0, 0
+ inc r0
+ SSE41_ChromaGetX38x4Satd r0, 0
+%else
SSE41_ChromaGetX38x8Satd
+%endif
MMXReg2SSE xmm0, xmm3, mm0, mm1
MMXReg2SSE xmm1, xmm3, mm2, mm3
@@ -1279,7 +1461,16 @@
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
+%ifdef X86_32_PICASM
+ push 0xff01ff01
+ push 0x01010101
+ push 0xff01ff01
+ push 0x01010101
+ movdqu xmm4, [esp]
+ add esp, 16
+%else
movdqa xmm4,[HSwapSumSubDB1]
+%endif
movd xmm2,[r2]
movd xmm5,[r2+r3]
shufps xmm2,xmm5,0
@@ -1337,7 +1528,17 @@
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
+
+%ifdef X86_32_PICASM
+ push 0xff01ff01
+ push 0xff01ff01
+ push 0x01010101
+ push 0x01010101
+ movdqu xmm7, [esp]
+ add esp, 16
+%else
movdqa xmm7, [HSumSubDB1]
+%endif
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
pxor xmm6, xmm6
@@ -1370,7 +1571,17 @@
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
+
+%ifdef X86_32_PICASM
+ push 0xff01ff01
+ push 0xff01ff01
+ push 0x01010101
+ push 0x01010101
+ movdqu xmm7, [esp]
+ add esp, 16
+%else
movdqa xmm7, [HSumSubDB1]
+%endif
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
pxor xmm6, xmm6
@@ -1410,7 +1621,16 @@
push r0
push r2
+%ifdef X86_32_PICASM
+ push 0xff01ff01
+ push 0xff01ff01
+ push 0x01010101
+ push 0x01010101
+ movdqu xmm7, [esp]
+ add esp, 16
+%else
movdqa xmm7, [HSumSubDB1]
+%endif
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
pxor xmm6, xmm6
@@ -1457,7 +1677,16 @@
push r0
push r2
+%ifdef X86_32_PICASM
+ push 0xff01ff01
+ push 0xff01ff01
+ push 0x01010101
+ push 0x01010101
+ movdqu xmm7, [esp]
+ add esp, 16
+%else
movdqa xmm7, [HSumSubDB1]
+%endif
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
pxor xmm6, xmm6
@@ -1634,7 +1863,19 @@
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
+%ifdef X86_32_PICASM
+ mov r1, esp
+ and esp, 0xfffffff0
+ push 0xff01ff01
+ push 0xff01ff01
+ push 0x01010101
+ push 0x01010101
+ vbroadcasti128 ymm7, [esp]
+ mov esp, r1
+ mov r1, [esp + push_num*4 + 8]
+%else
vbroadcasti128 ymm7, [HSumSubDB1]
+%endif
lea r5, [3 * r1]
lea r6, [3 * r3]
vpxor ymm6, ymm6, ymm6
@@ -1700,8 +1941,21 @@
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
+%ifdef X86_32_PICASM
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0xff01ff01
+ push 0xff01ff01
+ push 0x01010101
+ push 0x01010101
+ vpbroadcastq xmm0, [esp]
+ vpbroadcastq ymm6, [esp + 8]
+ mov esp, r0
+ mov r0, [esp + push_num*4 + 4]
+%else
vpbroadcastq xmm0, [HSumSubDB1]
vpbroadcastq ymm6, [HSumSubDB1 + 8]
+%endif
vpblendd ymm6, ymm0, ymm6, 11110000b
lea r5, [3 * r1]
lea r6, [3 * r3]
--- a/codec/decoder/core/x86/intra_pred.asm
+++ b/codec/decoder/core/x86/intra_pred.asm
@@ -132,7 +132,20 @@
%macro COPY_16_TIMES 2
movdqa %2, [%1-16]
psrldq %2, 15
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xfffffff0
+ push 0x01010101 ;mmx_01bytes
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pmuludq %2, [esp]
+ mov esp, r5
+ pop r5
+%else
pmuludq %2, [mmx_01bytes]
+%endif
pshufd %2, %2, 0
%endmacro
@@ -139,7 +152,20 @@
%macro COPY_16_TIMESS 3
movdqa %2, [%1+%3-16]
psrldq %2, 15
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xfffffff0
+ push 0x01010101 ;mmx_01bytes
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pmuludq %2, [esp]
+ mov esp, r5
+ pop r5
+%else
pmuludq %2, [mmx_01bytes]
+%endif
pshufd %2, %2, 0
%endmacro
@@ -179,23 +205,50 @@
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
+%ifdef X86_32_PICASM
+ push r3
+ mov r3, esp
+ and esp, 0xfffffff0
+ push 0x01010101 ;mmx_01bytes
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+%endif
movzx r2, byte [r0-1]
movd xmm0, r2d
+%ifdef X86_32_PICASM
+ pmuludq xmm0, [esp]
+%else
pmuludq xmm0, [mmx_01bytes]
+%endif
movzx r2, byte [r0+r1-1]
movd xmm1, r2d
+%ifdef X86_32_PICASM
+ pmuludq xmm1, [esp]
+%else
pmuludq xmm1, [mmx_01bytes]
+%endif
lea r0, [r0+r1]
movzx r2, byte [r0+r1-1]
movd xmm2, r2d
+%ifdef X86_32_PICASM
+ pmuludq xmm2, [esp]
+%else
pmuludq xmm2, [mmx_01bytes]
+%endif
movzx r2, byte [r0+2*r1-1]
movd xmm3, r2d
+%ifdef X86_32_PICASM
+ pmuludq xmm3, [esp]
+ mov esp, r3
+ pop r3
+%else
pmuludq xmm3, [mmx_01bytes]
+%endif
sub r0, r1
movd [r0], xmm0
@@ -223,11 +276,37 @@
;for H
pxor xmm7, xmm7
movq xmm0, [r0]
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x00010002
+ push 0x00030004
+ push 0x00050006
+ push 0x00070008
+ movdqa xmm5, [esp]
+ mov esp, r0
+ pop r0
+%else
movdqa xmm5, [sse2_plane_dec]
+%endif
punpcklbw xmm0, xmm7
pmullw xmm0, xmm5
movq xmm1, [r0 + 9]
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x00080007 ;sse2_plane_inc
+ push 0x00060005
+ push 0x00040003
+ push 0x00020001
+ movdqa xmm6, [esp]
+ mov esp, r0
+ pop r0
+%else
movdqa xmm6, [sse2_plane_inc]
+%endif
punpcklbw xmm1, xmm7
pmullw xmm1, xmm6
psubw xmm1, xmm0
@@ -282,7 +361,19 @@
SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
xor r2, r2
+%ifdef X86_32_PICASM
+ mov r2, esp
+ and esp, 0xfffffff0
+ push 0x0000ffff ;sse2_plane_inc_minus
+ push 0xfffefffd
+ push 0xfffcfffb
+ push 0xfffafff9
+ movdqa xmm5, [esp]
+ mov esp, r2
+ xor r2, r2
+%else
movdqa xmm5, [sse2_plane_inc_minus]
+%endif
get_i16x16_luma_pred_plane_sse2_1:
movdqa xmm2, xmm1
@@ -395,11 +486,30 @@
pxor mm7, mm7
movq mm0, [r0]
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xfffffff0
+ push 0x00010002 ;sse2_plane_dec_c
+ push 0x00030004
+ push 0x00040003 ;sse2_plane_inc_c
+ push 0x00020001
+ push 0x00040003 ;sse2_plane_mul_b_c
+ push 0x00020001
+ push 0x0000ffff
+ push 0xfffefffd
+ movq mm5, [esp+24]
+%else
movq mm5, [sse2_plane_dec_c]
+%endif
punpcklbw mm0, mm7
pmullw mm0, mm5
movq mm1, [r0 + 5]
+%ifdef X86_32_PICASM
+ movq mm6, [esp+16]
+%else
movq mm6, [sse2_plane_inc_c]
+%endif
punpcklbw mm1, mm7
pmullw mm1, mm6
psubw mm1, mm0
@@ -451,7 +561,13 @@
SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
xor r2, r2
+%ifdef X86_32_PICASM
+ movdqa xmm5, [esp]
+ mov esp, r5
+ pop r5
+%else
movdqa xmm5, [sse2_plane_mul_b_c]
+%endif
get_i_chroma_pred_plane_sse2_1:
movdqa xmm2, xmm1
@@ -513,7 +629,20 @@
movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
pxor mm1,mm4 ;find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pand mm1,[esp] ;set the odd bit
+ mov esp, r0
+ pop r0
+%else
pand mm1,[mmx_01bytes] ;set the odd bit
+%endif
psubusb mm3,mm1 ;decrease 1 from odd bytes
pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
@@ -538,7 +667,20 @@
movq %1, [%3-8]
psrlq %1, 38h
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pmullw %1, [esp]
+ mov esp, r0
+ pop r0
+%else
pmullw %1, [mmx_01bytes]
+%endif
pshufw %1, %1, 0
movq [%4], %1
%endmacro
@@ -547,7 +689,20 @@
movq %1, [%3+r1-8]
psrlq %1, 38h
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pmullw %1, [esp]
+ mov esp, r0
+ pop r0
+%else
pmullw %1, [mmx_01bytes]
+%endif
pshufw %1, %1, 0
movq [%4], %1
%endmacro
@@ -561,7 +716,20 @@
movq mm0, [r2-8]
psrlq mm0, 38h
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pmullw mm0, [esp]
+ mov esp, r0
+ pop r0
+%else
pmullw mm0, [mmx_01bytes]
+%endif
pshufw mm0, mm0, 0
movq [r0], mm0
@@ -673,7 +841,18 @@
pavgb mm1, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ pand mm4, [esp]
+ mov esp, r0
+ pop r0
+%else
pand mm4, [mmx_01bytes] ; set the odd bit
+%endif
psubusb mm1, mm4 ; decrease 1 from odd bytes
pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
@@ -758,7 +937,18 @@
pavgb mm2, mm0
pxor mm5, mm0 ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ pand mm5, [esp] ; set the odd bit
+ mov esp, r0
+ pop r0
+%else
pand mm5, [mmx_01bytes] ; set the odd bit
+%endif
psubusb mm2, mm5 ; decrease 1 from odd bytes
pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
@@ -840,7 +1030,18 @@
pavgb mm2, mm0
pxor mm3, mm0 ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ pand mm3, [esp] ; set the odd bit
+ mov esp, r0
+ pop r0
+%else
pand mm3, [mmx_01bytes] ; set the odd bit
+%endif
psubusb mm2, mm3 ; decrease 1 from odd bytes
movq mm3, mm0
@@ -920,7 +1121,18 @@
movq mm3, mm1
pavgb mm1, mm2
pxor mm3, mm2 ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ pand mm3, [esp] ; set the odd bit
+ mov esp, r0
+ pop r0
+%else
pand mm3, [mmx_01bytes] ; set the odd bit
+%endif
psubusb mm1, mm3 ; decrease 1 from odd bytes
pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
@@ -987,7 +1199,18 @@
movq mm4, mm2
pavgb mm2, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ pand mm4, [esp] ; set the odd bit
+ mov esp, r0
+ pop r0
+%else
pand mm4, [mmx_01bytes] ; set the odd bit
+%endif
psubusb mm2, mm4 ; decrease 1 from odd bytes
pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
@@ -1052,7 +1275,18 @@
movq mm1, mm2
paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x00000000
+ push 0x00000002
+ movq mm4, [esp]
+ mov esp, r0
+ pop r0
+%else
movq mm4, [mmx_0x02]
+%endif
paddq mm0, mm4
psrlq mm0, 0x02
@@ -1068,13 +1302,30 @@
paddq mm1, mm4
psrlq mm1, 0x03
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ pmuludq mm0, [esp]
+ pmuludq mm3, [esp]
+%else
pmuludq mm0, [mmx_01bytes]
pmuludq mm3, [mmx_01bytes]
+%endif
psllq mm0, 0x20
pxor mm0, mm3 ; mm0 = m_up
+%ifdef X86_32_PICASM
+ pmuludq mm2, [esp]
+ pmuludq mm1, [esp]
+ mov esp, r5
+ pop r5
+%else
pmuludq mm2, [mmx_01bytes]
pmuludq mm1, [mmx_01bytes]
+%endif
psllq mm1, 0x20
pxor mm1, mm2 ; mm2 = m_down
@@ -1134,7 +1385,20 @@
movd xmm1, r2d
paddw xmm0, xmm1
psrld xmm0, 0x05
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pmuludq xmm0, [esp]
+ mov esp, r0
+ pop r0
+%else
pmuludq xmm0, [mmx_01bytes]
+%endif
pshufd xmm0, xmm0, 0
movdqa [r4], xmm0
@@ -1258,7 +1522,20 @@
SIGN_EXTENSION r1, r1d
lea r2, [2*r1+r1] ; 3*kiStride
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x80808080 ;sse2_dc_0x80
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ movdqa xmm0, [esp]
+ mov esp, r0
+ pop r0
+%else
movdqa xmm0, [sse2_dc_0x80]
+%endif
movdqa xmm1, xmm0
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
@@ -1375,7 +1652,13 @@
paddw xmm1, xmm3 ; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
punpckhqdq xmm1, xmm7
punpcklqdq xmm0, xmm1 ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
+%ifdef X86_32_PICASM
+ pcmpeqw xmm6, xmm6
+ psrlw xmm6, 15
+ psllw xmm6, 1
+%else
movdqa xmm6, [sse2_wd_0x02]
+%endif
paddw xmm0, xmm6
psraw xmm0, 02h
packuswb xmm0, xmm7
@@ -1400,7 +1683,18 @@
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
lea r2, [2*r1+r1]
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x80808080 ;sse2_dc_0x80
+ push 0x80808080
+ movq mm0, [esp]
+ mov esp, r0
+ pop r0
+%else
movq mm0, [sse2_dc_0x80]
+%endif
movq mm1, mm0
movq [r0], mm0
movq [r0+r1], mm1
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -75,7 +75,9 @@
#ifdef X86_ASM
+#ifndef X86_32_PICASM
int32_t WelsGetNoneZeroCount_sse2 (int16_t* pLevel);
+#endif
int32_t WelsGetNoneZeroCount_sse42 (int16_t* pLevel);
/****************************************************************************
@@ -84,7 +86,9 @@
void WelsScan4x4Ac_sse2 (int16_t* zig_value, int16_t* pDct);
void WelsScan4x4DcAc_ssse3 (int16_t* pLevel, int16_t* pDct);
void WelsScan4x4DcAc_sse2 (int16_t* pLevel, int16_t* pDct);
+#ifndef X86_32_PICASM
int32_t WelsCalculateSingleCtr4x4_sse2 (int16_t* pDct);
+#endif
/****************************************************************************
* DCT functions
--- a/codec/encoder/core/inc/set_mb_syn_cavlc.h
+++ b/codec/encoder/core/inc/set_mb_syn_cavlc.h
@@ -78,10 +78,12 @@
int32_t CavlcParamCal_c (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
int32_t iEndIdx);
#ifdef X86_ASM
+#ifndef X86_32_PICASM
int32_t CavlcParamCal_sse2 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
int32_t iEndIdx);
int32_t CavlcParamCal_sse42 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
int32_t iEndIdx);
+#endif
#endif
#if defined(__cplusplus)
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -500,7 +500,9 @@
pFuncList->pfCopy8x16Aligned = WelsCopy8x16_mmx;
}
if (uiCpuFlag & WELS_CPU_SSE2) {
+#ifndef X86_32_PICASM
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_sse2;
+#endif
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_sse2;
pFuncList->pfQuantization4x4 = WelsQuant4x4_sse2;
@@ -514,7 +516,9 @@
pFuncList->pfScan4x4 = WelsScan4x4DcAc_sse2;
pFuncList->pfScan4x4Ac = WelsScan4x4Ac_sse2;
+#ifndef X86_32_PICASM
pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_sse2;
+#endif
pFuncList->pfDctT4 = WelsDctT4_sse2;
pFuncList->pfDctFourT4 = WelsDctFourT4_sse2;
--- a/codec/encoder/core/src/set_mb_syn_cavlc.cpp
+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
@@ -280,14 +280,19 @@
pFuncList->pfCavlcParamCal = CavlcParamCal_c;
#if defined(X86_32_ASM)
+#ifndef X86_32_PICASM
if (uiCpuFlag & WELS_CPU_SSE2) {
pFuncList->pfCavlcParamCal = CavlcParamCal_sse2;
}
#endif
+#endif
+
#ifdef X86_ASM
+#ifndef X86_32_PICASM
if (uiCpuFlag & WELS_CPU_SSE42) {
pFuncList->pfCavlcParamCal = CavlcParamCal_sse42;
}
+#endif
#endif
if (iEntropyCodingModeFlag) {
pFuncList->pfStashMBStatus = StashMBStatusCabac;
--- a/codec/encoder/core/x86/coeff.asm
+++ b/codec/encoder/core/x86/coeff.asm
@@ -369,6 +369,7 @@
%ifdef X86_32
+%ifndef X86_32_PICASM
;***********************************************************************
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;***********************************************************************
@@ -506,8 +507,10 @@
pop edi
pop ebx
ret
-%endif
+%endif ;%ifndef X86_32_PICASM
+%endif ;%ifdef X86_32
+%ifndef X86_32_PICASM
;***********************************************************************
;int32_t CavlcParamCal_sse42(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;***********************************************************************
@@ -670,3 +673,5 @@
%undef r_tmp2d
%undef p_shufb_lut
%undef p_run_lut
+
+%endif ;ifndef X86_32_PICASM
--- a/codec/encoder/core/x86/intra_pred.asm
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -144,7 +144,20 @@
%macro COPY_16_TIMES 2
movdqa %2, [%1-16]
psrldq %2, 15
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pmuludq %2, [esp]
+ mov esp, r0
+ pop r0
+%else
pmuludq %2, [mmx_01bytes]
+%endif
pshufd %2, %2, 0
%endmacro
@@ -151,7 +164,20 @@
%macro COPY_16_TIMESS 3
movdqa %2, [%1+%3-16]
psrldq %2, 15
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pmuludq %2, [esp]
+ mov esp, r0
+ pop r0
+%else
pmuludq %2, [mmx_01bytes]
+%endif
pshufd %2, %2, 0
%endmacro
@@ -193,11 +219,26 @@
SIGN_EXTENSION r2, r2d
movzx r3, byte [r1-1]
movd xmm0, r3d
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pmuludq xmm0, [esp]
+%else
pmuludq xmm0, [mmx_01bytes]
+%endif
movzx r3, byte [r1+r2-1]
movd xmm1, r3d
+%ifdef X86_32_PICASM
+ pmuludq xmm1, [esp]
+%else
pmuludq xmm1, [mmx_01bytes]
+%endif
unpcklps xmm0, xmm1
@@ -204,11 +245,21 @@
lea r1, [r1+r2*2]
movzx r3, byte [r1-1]
movd xmm2, r3d
+%ifdef X86_32_PICASM
+ pmuludq xmm2, [esp]
+%else
pmuludq xmm2, [mmx_01bytes]
+%endif
movzx r3, byte [r1+r2-1]
movd xmm3, r3d
+%ifdef X86_32_PICASM
+ pmuludq xmm3, [esp]
+ mov esp, r0
+ pop r0
+%else
pmuludq xmm3, [mmx_01bytes]
+%endif
unpcklps xmm2, xmm3
unpcklpd xmm0, xmm2
@@ -233,11 +284,34 @@
;for H
pxor xmm7, xmm7
movq xmm0, [r1]
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xfffffff0
+ push 0x00010002 ;sse2_plane_dec
+ push 0x00030004
+ push 0x00050006
+ push 0x00070008
+ push 0x00080007 ;sse2_plane_inc
+ push 0x00060005
+ push 0x00040003
+ push 0x00020001
+ push 0x0000ffff ;sse2_plane_inc_minus
+ push 0xfffefffd
+ push 0xfffcfffb
+ push 0xfffafff9
+ movdqa xmm5, [esp+32]
+%else
movdqa xmm5, [sse2_plane_dec]
+%endif
punpcklbw xmm0, xmm7
pmullw xmm0, xmm5
movq xmm1, [r1 + 9]
+%ifdef X86_32_PICASM
+ movdqa xmm6, [esp+16]
+%else
movdqa xmm6, [sse2_plane_inc]
+%endif
punpcklbw xmm1, xmm7
pmullw xmm1, xmm6
psubw xmm1, xmm0
@@ -283,7 +357,13 @@
SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
xor r3, r3
+%ifdef X86_32_PICASM
+ movdqa xmm5, [esp]
+ mov esp, r5
+ pop r5
+%else
movdqa xmm5, [sse2_plane_inc_minus]
+%endif
get_i16x16_luma_pred_plane_sse2_1:
movdqa xmm2, xmm1
@@ -321,11 +401,30 @@
pxor mm7, mm7
movq mm0, [r1]
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xfffffff0
+ push 0x00010002 ;sse2_plane_dec_c
+ push 0x00030004
+ push 0x00040003 ;sse2_plane_inc_c
+ push 0x00020001
+ push 0x00040003 ;sse2_plane_mul_b_c
+ push 0x00020001
+ push 0x0000ffff
+ push 0xfffefffd
+ movq mm5, [esp+24]
+%else
movq mm5, [sse2_plane_dec_c]
+%endif
punpcklbw mm0, mm7
pmullw mm0, mm5
movq mm1, [r1 + 5]
+%ifdef X86_32_PICASM
+ movq mm6, [esp+16]
+%else
movq mm6, [sse2_plane_inc_c]
+%endif
punpcklbw mm1, mm7
pmullw mm1, mm6
psubw mm1, mm0
@@ -375,7 +474,13 @@
SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
xor r3, r3
+%ifdef X86_32_PICASM
+ movdqa xmm5, [esp]
+ mov esp, r5
+ pop r5
+%else
movdqa xmm5, [sse2_plane_mul_b_c]
+%endif
get_i_chroma_pred_plane_sse2_1:
movdqa xmm2, xmm1
@@ -434,7 +539,18 @@
movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
pxor mm1,mm4 ;find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ pand mm1,[esp] ;set the odd bit
+ mov esp, r0
+ pop r0
+%else
pand mm1,[mmx_01bytes] ;set the odd bit
+%endif
psubusb mm3,mm1 ;decrease 1 from odd bytes
pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
@@ -503,7 +619,20 @@
psrlq %1, 38h
;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pmullw %1, [esp]
+ mov esp, r0
+ pop r0
+%else
pmullw %1, [mmx_01bytes]
+%endif
pshufw %1, %1, 0
movq [%4], %1
%endmacro
@@ -513,7 +642,20 @@
psrlq %1, 38h
;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pmullw %1, [esp]
+ mov esp, r0
+ pop r0
+%else
pmullw %1, [mmx_01bytes]
+%endif
pshufw %1, %1, 0
movq [%4], %1
%endmacro
@@ -526,7 +668,20 @@
psrlq mm0, 38h
;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pmullw mm0, [esp]
+ mov esp, r0
+ pop r0
+%else
pmullw mm0, [mmx_01bytes]
+%endif
pshufw mm0, mm0, 0
movq [r0], mm0
@@ -636,7 +791,18 @@
pavgb mm1, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ pand mm4, [esp] ; set the odd bit
+ mov esp, r0
+ pop r0
+%else
pand mm4, [mmx_01bytes] ; set the odd bit
+%endif
psubusb mm1, mm4 ; decrease 1 from odd bytes
pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
@@ -715,7 +881,18 @@
pavgb mm2, mm0
pxor mm5, mm0 ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ pand mm5, [esp] ; set the odd bit
+ mov esp, r0
+ pop r0
+%else
pand mm5, [mmx_01bytes] ; set the odd bit
+%endif
psubusb mm2, mm5 ; decrease 1 from odd bytes
pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
@@ -794,7 +971,18 @@
pavgb mm2, mm0
pxor mm3, mm0 ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ pand mm3, [esp] ; set the odd bit
+ mov esp, r0
+ pop r0
+%else
pand mm3, [mmx_01bytes] ; set the odd bit
+%endif
psubusb mm2, mm3 ; decrease 1 from odd bytes
movq mm3, mm0
@@ -872,7 +1060,18 @@
movq mm3, mm1
pavgb mm1, mm2
pxor mm3, mm2 ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ pand mm3, [esp] ; set the odd bit
+ mov esp, r0
+ pop r0
+%else
pand mm3, [mmx_01bytes] ; set the odd bit
+%endif
psubusb mm1, mm3 ; decrease 1 from odd bytes
pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
@@ -936,7 +1135,18 @@
movq mm4, mm2
pavgb mm2, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ pand mm4, [esp] ; set the odd bit
+ mov esp, r0
+ pop r0
+%else
pand mm4, [mmx_01bytes] ; set the odd bit
+%endif
psubusb mm2, mm4 ; decrease 1 from odd bytes
pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
@@ -998,7 +1208,18 @@
movq mm1, mm2
paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x00000000
+ push 0x00000002
+ movq mm4, [esp]
+ mov esp, r0
+ pop r0
+%else
movq mm4, [mmx_0x02]
+%endif
paddq mm0, mm4
psrlq mm0, 0x02
@@ -1014,13 +1235,32 @@
paddq mm1, mm4
psrlq mm1, 0x03
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pmuludq mm0, [esp]
+ pmuludq mm3, [esp]
+%else
pmuludq mm0, [mmx_01bytes]
pmuludq mm3, [mmx_01bytes]
+%endif
psllq mm0, 0x20
pxor mm0, mm3 ; mm0 = m_up
+%ifdef X86_32_PICASM
+ pmuludq mm2, [esp]
+ pmuludq mm1, [esp]
+ mov esp, r0
+ pop r0
+%else
pmuludq mm2, [mmx_01bytes]
pmuludq mm1, [mmx_01bytes]
+%endif
psllq mm1, 0x20
pxor mm1, mm2 ; mm2 = m_down
@@ -1076,7 +1316,20 @@
movd xmm1, r3d
paddw xmm0, xmm1
psrld xmm0, 0x05
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ push 0x01010101
+ pmuludq xmm0, [esp]
+ mov esp, r0
+ pop r0
+%else
pmuludq xmm0, [mmx_01bytes]
+%endif
pshufd xmm0, xmm0, 0
movdqa [r0], xmm0
@@ -1098,4 +1351,4 @@
pop r4
pop r3
- ret
\ No newline at end of file
+ ret
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -696,9 +696,26 @@
mov ebx, [height]
mov [i_height], ebx
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x00100010 ;mv_x_inc_x4
+ push 0x00100010
+ push 0x00040004 ;mv_y_inc_x4
+ push 0x00040004
+ push 0x000c0008 ;mx_x_offset_x4
+ push 0x00040000
+ movq xmm7, [esp+16] ; x_qpel inc
+ movq xmm6, [esp+8] ; y_qpel inc
+ movq xmm5, [esp] ; x_qpel vector
+ mov esp, r0
+ pop r0
+%else
movq xmm7, [mv_x_inc_x4] ; x_qpel inc
movq xmm6, [mv_y_inc_x4] ; y_qpel inc
movq xmm5, [mx_x_offset_x4] ; x_qpel vector
+%endif
pxor xmm4, xmm4
pxor xmm3, xmm3 ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
@@ -1398,9 +1415,24 @@
push r13
mov r12, r2
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x00100010 ;mv_x_inc_x4
+ push 0x00100010
+ push 0x00040004 ;mv_y_inc_x4
+ push 0x00040004
+ push 0x000c0008 ;mx_x_offset_x4
+ push 0x00040000
+ movq xmm7, [esp+16] ; x_qpel inc
+ movq xmm6, [esp+8] ; y_qpel inc
+ movq xmm5, [esp] ; x_qpel vector
+%else
movq xmm7, [mv_x_inc_x4] ; x_qpel inc
movq xmm6, [mv_y_inc_x4] ; y_qpel inc
movq xmm5, [mx_x_offset_x4] ; x_qpel vector
+%endif
pxor xmm4, xmm4
pxor xmm3, xmm3 ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -207,8 +207,26 @@
pextrw r1d, xmm1, 0 ; eax = [8]
pinsrw xmm0, r1d, 7 ; xmm0[7] = [8]
pinsrw xmm1, r2d, 0 ; xmm1[0] = [7]
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x0d0c0706 ;pb_scanacdc_maska
+ push 0x05040b0a
+ push 0x0f0e0908
+ push 0x03020100
+ push 0x0f0e0d0c ;pb_scanacdc_maskb
+ push 0x07060100
+ push 0x05040b0a
+ push 0x09080302
+ pshufb xmm1, [esp]
+ pshufb xmm0, [esp+16]
+ mov esp, r0
+ pop r0
+%else
pshufb xmm1, [pb_scanacdc_maskb]
pshufb xmm0, [pb_scanacdc_maska]
+%endif
movdqa [r0],xmm0
movdqa [r0+16], xmm1
@@ -250,6 +268,7 @@
ret
+%ifndef X86_32_PICASM
;***********************************************************************
;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;***********************************************************************
@@ -306,8 +325,10 @@
mov retrd, r0d
%endif
ret
+%endif ;ifndef X86_32_PICASM
+%ifndef X86_32_PICASM
;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;***********************************************************************
@@ -336,6 +357,7 @@
add retrq, r1
;add al, [nozero_count_table+r1]
ret
+%endif ;%ifndef X86_32_PICASM
;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse42(int16_t* level);
--- a/codec/processing/src/x86/denoisefilter.asm
+++ b/codec/processing/src/x86/denoisefilter.asm
@@ -147,7 +147,20 @@
movdqa %2, %1
psrldq %2, 2
punpcklbw %2, %4
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x00140014 ;sse2_20
+ push 0x00140014
+ push 0x00140014
+ push 0x00140014
+ pmullw %2, [esp]
+ mov esp, r0
+ pop r0
+%else
pmullw %2, [sse2_20]
+%endif
paddw %3, %2
movdqa %2, %1
@@ -184,7 +197,13 @@
movq xmm6, [r0]
punpcklbw xmm6, xmm7
+%ifdef X86_32_PICASM
+ pcmpeqw xmm3, xmm3
+ psrlw xmm3, 15
+ psllw xmm3, 5
+%else
movdqa xmm3, [sse2_32]
+%endif
pxor xmm4, xmm4 ; nTotWeight
pxor xmm5, xmm5 ; nSum
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -1253,7 +1253,20 @@
pmaddwd xmm2, xmm1
pshufd xmm1, xmm2, 00000001b
paddd xmm2, xmm1
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xffffffe0
+ push 0x00000000
+ push 0x00000000
+ push 0x00000000
+ push 0x00004000
+ movdqa xmm1, [esp]
+ mov esp, r0
+ pop r0
+%else
movdqa xmm1, [add_extra_half]
+%endif
paddd xmm2, xmm1
psrld xmm2, 15
@@ -1554,7 +1567,20 @@
pmaddwd xmm2, xmm1
pshufd xmm1, xmm2, 00000001b
paddd xmm2, xmm1
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xffffffe0
+ push 0x00000000
+ push 0x00000000
+ push 0x00000000
+ push 0x00004000
+ movdqa xmm1, [esp]
+ mov esp, r0
+ pop r0
+%else
movdqa xmm1, [add_extra_half]
+%endif
paddd xmm2, xmm1
psrld xmm2, 15
@@ -1671,15 +1697,52 @@
;1st line
movdqa xmm0, [r2] ;F * e E * d D * c C * b B * a A
movdqa xmm1, xmm0
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x80808080 ;shufb_mask_onethird_low_1
+ push 0x80808080
+ push 0x80800f0c
+ push 0x09060300
+ push 0x80808080 ;shufb_mask_onethird_high_1
+ push 0x80808080
+ push 0x8080800d
+ push 0x0a070401
+ push 0x80808080 ;shufb_mask_onethird_low_2
+ push 0x800e0b08
+ push 0x05028080
+ push 0x80808080
+ push 0x80808080 ;shufb_mask_onethird_high_2
+ push 0x800f0c09
+ push 0x06030080
+ push 0x80808080
+ push 0x0d0a0704 ;shufb_mask_onethird_low_3
+ push 0x01808080
+ push 0x80808080
+ push 0x80808080
+ push 0x0e0b0805 ;shufb_mask_onethird_high_3
+ push 0x02808080
+ push 0x80808080
+ push 0x80808080
+ movdqa xmm5, [esp+80]
+ movdqa xmm6, [esp+64]
+%else
movdqa xmm5, [shufb_mask_onethird_low_1]
movdqa xmm6, [shufb_mask_onethird_high_1]
+%endif
pshufb xmm0, xmm5 ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
pshufb xmm1, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
movdqa xmm2, [r2+16] ;k K * j J * i I * h H * g G * f
movdqa xmm3, xmm2
+%ifdef X86_32_PICASM
+ movdqa xmm5, [esp+48]
+ movdqa xmm6, [esp+32]
+%else
movdqa xmm5, [shufb_mask_onethird_low_2]
movdqa xmm6, [shufb_mask_onethird_high_2]
+%endif
pshufb xmm2, xmm5 ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
@@ -1688,8 +1751,13 @@
movdqa xmm2, [r2+32] ;* p P * o O * n N * m M * l L *
movdqa xmm3, xmm2
+%ifdef X86_32_PICASM
+ movdqa xmm5, [esp+16]
+ movdqa xmm6, [esp]
+%else
movdqa xmm5, [shufb_mask_onethird_low_3]
movdqa xmm6, [shufb_mask_onethird_high_3]
+%endif
pshufb xmm2, xmm5 ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
@@ -1700,15 +1768,25 @@
;2nd line
movdqa xmm2, [r2+r3] ;F' * e' E' * d' D' * c' C' * b' B' * a' A'
movdqa xmm3, xmm2
+%ifdef X86_32_PICASM
+ movdqa xmm5, [esp+80]
+ movdqa xmm6, [esp+64]
+%else
movdqa xmm5, [shufb_mask_onethird_low_1]
movdqa xmm6, [shufb_mask_onethird_high_1]
+%endif
pshufb xmm2, xmm5 ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3
movdqa xmm1, [r2+r3+16] ;k' K' * j' J' * i' I' * h' H' * g' G' * f'
movdqa xmm4, xmm1
+%ifdef X86_32_PICASM
+ movdqa xmm5, [esp+48]
+ movdqa xmm6, [esp+32]
+%else
movdqa xmm5, [shufb_mask_onethird_low_2]
movdqa xmm6, [shufb_mask_onethird_high_2]
+%endif
pshufb xmm1, xmm5 ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
@@ -1717,8 +1795,15 @@
movdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' *
movdqa xmm4, xmm1
+%ifdef X86_32_PICASM
+ movdqa xmm5, [esp+16]
+ movdqa xmm6, [esp]
+ mov esp, r0
+ pop r0
+%else
movdqa xmm5, [shufb_mask_onethird_low_3]
movdqa xmm6, [shufb_mask_onethird_high_3]
+%endif
pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
@@ -1821,15 +1906,52 @@
;1st line
movntdqa xmm0, [r2] ;F * e E * d D * c C * b B * a A
movdqa xmm1, xmm0
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x80808080 ;shufb_mask_onethird_low_1
+ push 0x80808080
+ push 0x80800f0c
+ push 0x09060300
+ push 0x80808080 ;shufb_mask_onethird_high_1
+ push 0x80808080
+ push 0x8080800d
+ push 0x0a070401
+ push 0x80808080 ;shufb_mask_onethird_low_2
+ push 0x800e0b08
+ push 0x05028080
+ push 0x80808080
+ push 0x80808080 ;shufb_mask_onethird_high_2
+ push 0x800f0c09
+ push 0x06030080
+ push 0x80808080
+ push 0x0d0a0704 ;shufb_mask_onethird_low_3
+ push 0x01808080
+ push 0x80808080
+ push 0x80808080
+ push 0x0e0b0805 ;shufb_mask_onethird_high_3
+ push 0x02808080
+ push 0x80808080
+ push 0x80808080
+ movdqa xmm5, [esp+80]
+ movdqa xmm6, [esp+64]
+%else
movdqa xmm5, [shufb_mask_onethird_low_1]
movdqa xmm6, [shufb_mask_onethird_high_1]
+%endif
pshufb xmm0, xmm5 ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
pshufb xmm1, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
movntdqa xmm2, [r2+16] ;k K * j J * i I * h H * g G * f
movdqa xmm3, xmm2
+%ifdef X86_32_PICASM
+ movdqa xmm5, [esp+48]
+ movdqa xmm6, [esp+32]
+%else
movdqa xmm5, [shufb_mask_onethird_low_2]
movdqa xmm6, [shufb_mask_onethird_high_2]
+%endif
pshufb xmm2, xmm5 ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
@@ -1838,8 +1960,13 @@
movntdqa xmm2, [r2+32] ;* p P * o O * n N * m M * l L *
movdqa xmm3, xmm2
+%ifdef X86_32_PICASM
+ movdqa xmm5, [esp+16]
+ movdqa xmm6, [esp]
+%else
movdqa xmm5, [shufb_mask_onethird_low_3]
movdqa xmm6, [shufb_mask_onethird_high_3]
+%endif
pshufb xmm2, xmm5 ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
pshufb xmm3, xmm6 ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
@@ -1850,15 +1977,25 @@
;2nd line
movntdqa xmm2, [r2+r3] ;F' * e' E' * d' D' * c' C' * b' B' * a' A'
movdqa xmm3, xmm2
+%ifdef X86_32_PICASM
+ movdqa xmm5, [esp+80]
+ movdqa xmm6, [esp+64]
+%else
movdqa xmm5, [shufb_mask_onethird_low_1]
movdqa xmm6, [shufb_mask_onethird_high_1]
+%endif
pshufb xmm2, xmm5 ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
pshufb xmm3, xmm6 ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3
movntdqa xmm1, [r2+r3+16] ;k' K' * j' J' * i' I' * h' H' * g' G' * f'
movdqa xmm4, xmm1
+%ifdef X86_32_PICASM
+ movdqa xmm5, [esp+48]
+ movdqa xmm6, [esp+32]
+%else
movdqa xmm5, [shufb_mask_onethird_low_2]
movdqa xmm6, [shufb_mask_onethird_high_2]
+%endif
pshufb xmm1, xmm5 ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
@@ -1867,8 +2004,15 @@
movntdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' *
movdqa xmm4, xmm1
+%ifdef X86_32_PICASM
+ movdqa xmm5, [esp+16]
+ movdqa xmm6, [esp]
+ mov esp, r0
+ pop r0
+%else
movdqa xmm5, [shufb_mask_onethird_low_3]
movdqa xmm6, [shufb_mask_onethird_high_3]
+%endif
pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
@@ -2112,7 +2256,20 @@
add r6, r0
movq xmm7, [r6]
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x80808080
+ push 0x0d090501
+ push 0x80808080
+ push 0x0c080400
+ movdqa xmm6, [esp]
+ mov esp, r0
+ pop r0
+%else
movdqa xmm6, [shufb_mask_quarter]
+%endif
.yloops_quarter_sse3:
;mov eax, [esp+40] ; iSrcWidth
;sar eax, $02 ; iSrcWidth >> 2
@@ -2221,7 +2378,20 @@
add r6, r0
movq xmm7, [r6]
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x80808080
+ push 0x0d090501
+ push 0x80808080
+ push 0x0c080400
+ movdqa xmm6, [esp]
+ mov esp, r0
+ pop r0
+%else
movdqa xmm6, [shufb_mask_quarter] ;mask
+%endif
.yloops_quarter_sse4:
%ifdef X86_32
@@ -2364,7 +2534,20 @@
%macro SSSE3_BilinearFastDownsample4xOrLess_8px 0
movdqa xmm_tmp0, xmm_xpos_int
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x08080808
+ push 0x08080808
+ push 0x00000000
+ push 0x00000000
+ pshufb xmm_tmp0, [esp]
+ mov esp, r0
+ pop r0
+%else
pshufb xmm_tmp0, [shufb_0000000088888888]
+%endif
psubb xmm_xpos_int, xmm_tmp0
SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac
mov r_tmp0, i_xpos
@@ -2372,7 +2555,24 @@
lddqu xmm_tmp3, [p_src_row0 + r_tmp0]
lddqu xmm_tmp4, [p_src_row1 + r_tmp0]
movdqa xmm_tmp2, xmm_xpos_int
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x80808080 ;db80h_256
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ punpcklbw xmm_tmp2, [esp]
+ mov esp, r5
+ pop r5
+%else
punpcklbw xmm_tmp2, [db80h_256]
+%endif
pshufb xmm_tmp3, xmm_tmp2
pshufb xmm_tmp4, xmm_tmp2
SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
@@ -2385,7 +2585,24 @@
lddqu xmm_tmp3, [p_src_row0 + r_tmp0]
lddqu xmm_tmp4, [p_src_row1 + r_tmp0]
movdqa xmm_tmp2, xmm_xpos_int
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x80808080 ;db80h_256
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ punpckhbw xmm_tmp2, [esp]
+ mov esp, r5
+ pop r5
+%else
punpckhbw xmm_tmp2, [db80h_256]
+%endif
pshufb xmm_tmp3, xmm_tmp2
pshufb xmm_tmp4, xmm_tmp2
SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
@@ -2524,13 +2741,43 @@
%macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0
movdqa xmm_tmp0, xmm_xpos_int
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x08080808
+ push 0x08080808
+ push 0x00000000
+ push 0x00000000
+ pshufb xmm_tmp0, [esp]
+ mov esp, r0
+ pop r0
+%else
pshufb xmm_tmp0, [shufb_0000000088888888]
+%endif
psubb xmm_xpos_int, xmm_tmp0
SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
mov r_tmp0, i_xpos
shr r_tmp0, 16
movdqa xmm_tmp3, xmm_xpos_int
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x80808080 ;db80h_256
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ punpcklbw xmm_tmp3, [esp]
+ mov esp, r5
+ pop r5
+%else
punpcklbw xmm_tmp3, [db80h_256]
+%endif
lddqu xmm_tmp4, [p_src_row0 + r_tmp0]
lddqu xmm_tmp2, [p_src_row1 + r_tmp0]
lea r_tmp0, [i_xpos + 4 * i_scalex]
@@ -2542,7 +2789,24 @@
pmaddwd xmm_tmp2, xmm_tmp0
SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
movdqa xmm_tmp2, xmm_xpos_int
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x80808080 ;db80h_256
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ punpckhbw xmm_tmp2, [esp]
+ mov esp, r5
+ pop r5
+%else
punpckhbw xmm_tmp2, [db80h_256]
+%endif
lddqu xmm_tmp4, [p_src_row0 + r_tmp0]
lddqu xmm_tmp3, [p_src_row1 + r_tmp0]
pshufb xmm_tmp4, xmm_tmp2
@@ -3321,7 +3585,20 @@
%endmacro
%macro AVX2_BilinearFastDownsample4xOrLess_16px 0
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xfffffff0
+ push 0x08080808 ;shufb_0000000088888888
+ push 0x08080808
+ push 0x00000000
+ push 0x00000000
+ vbroadcasti128 ymm_tmp0, [esp]
+ mov esp, r0
+ pop r0
+%else
vbroadcasti128 ymm_tmp0, [shufb_0000000088888888]
+%endif
vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
@@ -3365,7 +3642,20 @@
%endmacro
%macro AVX2_BilinearFastDownsample8xOrLess_16px 0
+%ifdef X86_32_PICASM
+ push r0
+ mov r0, esp
+ and esp, 0xffffffe0
+ push 0x0c0c0c0c ;shufb_000044448888CCCC
+ push 0x08080808
+ push 0x04040404
+ push 0x00000000
+ vbroadcasti128 ymm_tmp0, [esp]
+ mov esp, r0
+ pop r0
+%else
vbroadcasti128 ymm_tmp0, [shufb_000044448888CCCC]
+%endif
vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
mov r_tmp0, i_xpos
@@ -3604,7 +3894,20 @@
%endmacro
%macro AVX2_BilinearAccurateDownsample4xOrLess_16px 0
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x08080808 ;shufb_0000000088888888
+ push 0x08080808
+ push 0x00000000
+ push 0x00000000
+ vbroadcasti128 ymm_tmp0, [esp]
+ mov esp, r5
+ pop r5
+%else
vbroadcasti128 ymm_tmp0, [shufb_0000000088888888]
+%endif
vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
@@ -3619,7 +3922,24 @@
lea r_tmp0, [i_xpos + 2 * i_scalex2]
lea i_xpos, [r_tmp0 + 4 * i_scalex2]
shr r_tmp0, 16
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x80808080 ;db80h_256
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ vpunpcklbw ymm_tmp3, ymm_xpos_int, [esp]
+ mov esp, r5
+ pop r5
+%else
vpunpcklbw ymm_tmp3, ymm_xpos_int, [db80h_256]
+%endif
vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
vpshufb ymm_tmp2, ymm_tmp2, ymm_tmp3
vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp0
@@ -3632,7 +3952,24 @@
shr r_tmp0, 16
vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x80808080 ;db80h_256
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ vpunpckhbw ymm_tmp3, ymm_xpos_int, [esp]
+ mov esp, r5
+ pop r5
+%else
vpunpckhbw ymm_tmp3, ymm_xpos_int, [db80h_256]
+%endif
vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
vpshufb ymm_tmp2, ymm_tmp2, ymm_tmp3
vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp1
@@ -3648,7 +3985,20 @@
%endmacro
%macro AVX2_BilinearAccurateDownsample8xOrLess_16px 0
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x0c0c0c0c ;shufb_000044448888CCCC
+ push 0x08080808
+ push 0x04040404
+ push 0x00000000
+ vbroadcasti128 ymm_tmp0, [esp]
+ mov esp, r5
+ pop r5
+%else
vbroadcasti128 ymm_tmp0, [shufb_000044448888CCCC]
+%endif
vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
mov r_tmp0, i_xpos
@@ -3669,7 +4019,24 @@
shr r_tmp0, 16
vinserti128 ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
vinserti128 ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
+%ifdef X86_32_PICASM
+ push r5
+ mov r5, esp
+ and esp, 0xffffffe0
+ push 0x80808080 ;db80h_256
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ push 0x80808080
+ vpunpcklbw ymm_tmp3, ymm_xpos_int, [esp]
+ mov esp, r5
+ pop r5
+%else
vpunpcklbw ymm_tmp3, ymm_xpos_int, [db80h_256]
+%endif
vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
vpshufb ymm_tmp5, ymm_tmp5, ymm_tmp3
vpshufb ymm_tmp0, ymm_tmp0, ymm_tmp3
--- a/test/encoder/EncUT_Cavlc.cpp
+++ b/test/encoder/EncUT_Cavlc.cpp
@@ -77,14 +77,18 @@
}
#ifdef X86_32_ASM
+#ifndef X86_32_PICASM
TEST (CavlcTest, CavlcParamCal_sse2) {
TestCavlcParamCal (CavlcParamCal_sse2);
}
#endif
+#endif
#ifdef X86_ASM
+#ifndef X86_32_PICASM
TEST (CavlcTest, CavlcParamCal_sse42) {
if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
TestCavlcParamCal (CavlcParamCal_sse42);
}
+#endif
#endif
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -222,6 +222,7 @@
}
#endif //HAVE_AVX2
+#ifndef X86_32_PICASM
TEST (EncodeMbAuxTest, WelsCalculateSingleCtr4x4_sse2) {
CMemoryAlign cMemoryAlign (0);
ALLOC_MEMORY (int16_t, iDctC, 16);
@@ -235,6 +236,7 @@
FREE_MEMORY (iDctC);
FREE_MEMORY (iDctS);
}
+#endif //#ifndef X86_32_PICASM
#endif
void copy (uint8_t* pDst, int32_t iDStride, uint8_t* pSrc, int32_t iSStride, int32_t iWidth, int32_t iHeight) {
@@ -302,9 +304,11 @@
TestGetNoneZeroCount (WelsGetNoneZeroCount_c);
}
#ifdef X86_ASM
+#ifndef X86_32_PICASM
TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) {
TestGetNoneZeroCount (WelsGetNoneZeroCount_sse2);
}
+#endif
TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse42) {
if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);