ref: 4645bd26aa506fe5dd54dc230f3d36e446261360
parent: d906dda2240b2c4b39687f7474a4d1607319681a
author: Sindre Aamås <saamas@cisco.com>
date: Tue Apr 19 15:42:17 EDT 2016
[Encoder] Add an SSE4.2 implementation of WelsGetNoneZeroCount. Avoid touching some cache lines by using popcnt instead of table lookups. Also gives a speedup of ~1.4x on Haswell as compared with SSE2.
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -76,6 +76,7 @@
#ifdef X86_ASM
int32_t WelsGetNoneZeroCount_sse2 (int16_t* pLevel);
+int32_t WelsGetNoneZeroCount_sse42 (int16_t* pLevel); // SSE4.2 variant; implemented with popcnt in x86/score.asm
/****************************************************************************
* Scan and Score functions
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -523,6 +523,9 @@
if (uiCpuFlag & WELS_CPU_SSSE3) {
pFuncList->pfScan4x4 = WelsScan4x4DcAc_ssse3;
}
+ if (uiCpuFlag & WELS_CPU_SSE42) {
+ pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_sse42;
+ }
if (uiCpuFlag & WELS_CPU_AVX2) {
pFuncList->pfDctT4 = WelsDctT4_avx2;
pFuncList->pfDctFourT4 = WelsDctFourT4_avx2;
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -337,3 +337,17 @@
;add al, [nozero_count_table+r1]
ret
+;***********************************************************************
+; int32_t WelsGetNoneZeroCount_sse42(int16_t* level); -- returns how many of the 16 int16 values at level are non-zero
+;***********************************************************************
+WELS_EXTERN WelsGetNoneZeroCount_sse42
+ %assign push_num 0 ; nothing is pushed in this function (presumably consumed by LOAD_1_PARA -- confirm in the macro file)
+ LOAD_1_PARA ; project macro; presumably makes the single argument (level) available in r0
+ movdqa xmm0, [r0] ; aligned load of the first 8 words; movdqa faults unless level is 16-byte aligned
+ packsswb xmm0, [r0 + 16] ; pack all 16 words to 16 bytes; signed saturation keeps every non-zero word non-zero
+ pxor xmm1, xmm1 ; xmm1 = all zeros, used as the comparison operand
+ pcmpeqb xmm0, xmm1 ; each byte becomes 0FFh where the value was zero, 00h otherwise
+ pmovmskb retrd, xmm0 ; gather the 16 byte sign bits: bit i is set when value i is zero
+ xor retrd, 0FFFFh ; invert the low 16 bits so that bit i is set when value i is non-zero
+ popcnt retrd, retrd ; number of set bits = number of non-zero values (no lookup table needed)
+ ret
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -301,6 +301,10 @@
TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) {
TestGetNoneZeroCount (WelsGetNoneZeroCount_sse2);
}
+TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse42) { // runs the shared non-zero-count check against the SSE4.2 routine
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42) // guard: call the routine only when the detected CPU flags include SSE4.2
+ TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42); // NOTE(review): when the flag is absent nothing is executed and the test still passes
+}
#endif
#define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign)
#define NEW_QUANT(pDct, ff, mf) (((ff)+ WELS_ABS_LC(pDct))*(mf)) >>16