ref: dc2ae517648accc0fe4ac0737f9ee850accda278
parent: e29cb9af406a70876ce287d8f5e050f8b8b07862
author: Kyle Siefring <kylesiefring@gmail.com>
date: Tue Mar 5 10:42:54 EST 2019
Utilize a better CDEF constant for avx2 Before: ``` cdef_filter_8x8_8bpc_avx2: 275.5 cdef_filter_4x8_8bpc_avx2: 193.3 cdef_filter_4x4_8bpc_avx2: 113.5 ``` After: ``` cdef_filter_8x8_8bpc_avx2: 252.3 cdef_filter_4x8_8bpc_avx2: 182.1 cdef_filter_4x4_8bpc_avx2: 105.7 ```
--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -79,14 +79,13 @@
movu xm6, [stkq+offq*2+%6*0] ; p1
vinserti128 m6, [stkq+offq*2+%6*1], 1
%endif
- pcmpeqw m9, m14, m5
- pcmpeqw m10, m14, m6
- pandn m9, m5
- pandn m10, m6
- pmaxsw m7, m9 ; max after p0
- pminsw m8, m5 ; min after p0
- pmaxsw m7, m10 ; max after p1
- pminsw m8, m6 ; min after p1
+ ; out of bounds values are set to a value that is both a large unsigned
+ ; value and a negative signed value.
+ ; use signed max and unsigned min to remove them
+ pmaxsw m7, m5 ; max after p0
+ pminuw m8, m5 ; min after p0
+ pmaxsw m7, m6 ; max after p1
+ pminuw m8, m6 ; min after p1
; accumulate sum[m15] over p0/p1
psubw m5, m4 ; diff_p0(p0 - px)
@@ -99,8 +98,10 @@
psrlw m6, m10, %2
psubusw m5, %3, m5
psubusw m6, %3, m6
- pminsw m5, m9 ; constrain(diff_p0)
- pminsw m6, m10 ; constrain(diff_p1)
+
+ ; use unsigned min since abs diff can equal 0x8000
+ pminuw m5, m9 ; constrain(diff_p0)
+ pminuw m6, m10 ; constrain(diff_p1)
pmullw m5, m11 ; constrain(diff_p0) * taps
pmullw m6, m12 ; constrain(diff_p1) * taps
paddw m15, m5
@@ -118,7 +119,7 @@
%endif
%define px rsp+2*16+2*%3
pcmpeqw m14, m14
- psrlw m14, 1 ; 0x7fff
+ psllw m14, 15 ; 0x8000
mov edged, r8m
; prepare pixel buffers - body/right