ref: 3f31aff4dc5037cec1440f39aea8be54432a6aaa
parent: 502b16925ec7c706db3f3aeb0b8a1e7351539ab3
author: Sindre Aamås <saamas@cisco.com>
date: Tue Apr 19 12:41:10 EDT 2016
[Encoder] Add an SSE4.2 implementation of CavlcParamCal. Use a combination of table lookups and pshufb to convert coefficients to zero run/level format. Two 16-entry lookup tables are used for a total of 192 bytes worth of tables. (The existing SSE2 version uses a table of size 2048 bytes.) Speedup is ~1.5x-3x as compared with the SSE2 version on Haswell (the speedup is greater for input with many trailing zeros). The use of popcnt makes it require SSE4.2. The popcnt could be replaced with a small LUT and accumulation, which would reduce the requirement to SSSE3.
--- a/codec/encoder/core/inc/set_mb_syn_cavlc.h
+++ b/codec/encoder/core/inc/set_mb_syn_cavlc.h
@@ -80,6 +80,8 @@
#ifdef X86_ASM
int32_t CavlcParamCal_sse2 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
int32_t iEndIdx);
+int32_t CavlcParamCal_sse42 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
+ int32_t iEndIdx); // SSE4.2 variant; takes the same parameter list as CavlcParamCal_sse2 declared just above
#endif
#if defined(__cplusplus)
--- a/codec/encoder/core/src/set_mb_syn_cavlc.cpp
+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
@@ -279,6 +279,11 @@
pFuncList->pfCavlcParamCal = CavlcParamCal_sse2;
}
#endif
+#ifdef X86_ASM
+ if (uiCpuFlag & WELS_CPU_SSE42) { // placed after the CavlcParamCal_sse2 assignment above, so this assignment is the one that sticks when the SSE4.2 flag is set
+ pFuncList->pfCavlcParamCal = CavlcParamCal_sse42;
+ }
+#endif
if (iEntropyCodingModeFlag) {
pFuncList->pfStashMBStatus = StashMBStatusCabac;
pFuncList->pfStashPopMBStatus = StashPopMBStatusCabac;
--- a/codec/encoder/core/x86/coeff.asm
+++ b/codec/encoder/core/x86/coeff.asm
@@ -42,10 +42,57 @@
%include "asm_inc.asm"
+SECTION .rodata align=16
+align 16
+wels_shufb_rev: ; pshufb control vector: destination byte i takes source byte 15-i, i.e. reverses the 16 bytes
+ db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+; Zero-run table indexed by a 4-bit mask (4 bytes per entry, 16 entries). Bits are scanned from bit 0 upwards.
+; Byte k holds the number of zero bits preceding the k-th set bit; the byte following the last set bit holds the
+; number of zero bits above it, i.e. those preceding the eventual next set bit. For the case where all 4 bits are set,
+; this requires 5 bytes. The 5th (zero) byte can either be read from beyond the final table entry or implied via zero-initializing the location being read into.
+wels_cavlc_param_cal_run_lut:
+ db 4, 0, 0, 0 ; mask 0000b
+ db 0, 3, 0, 0 ; mask 0001b
+ db 1, 2, 0, 0 ; mask 0010b
+ db 0, 0, 2, 0 ; mask 0011b
+ db 2, 1, 0, 0 ; mask 0100b
+ db 0, 1, 1, 0 ; mask 0101b
+ db 1, 0, 1, 0 ; mask 0110b
+ db 0, 0, 0, 1 ; mask 0111b
+ db 3, 0, 0, 0 ; mask 1000b
+ db 0, 2, 0, 0 ; mask 1001b
+ db 1, 1, 0, 0 ; mask 1010b
+ db 0, 0, 1, 0 ; mask 1011b
+ db 2, 0, 0, 0 ; mask 1100b
+ db 0, 1, 0, 0 ; mask 1101b
+ db 1, 0, 0, 0 ; mask 1110b
+ db 0, 0, 0, 0 ; mask 1111b (the 5th byte is the implied zero described above)
+; db 0 ; optional explicit 5th zero for the final entry; left disabled
+; pshufb control table indexed by a 4-bit mask (8 bytes per entry, 16 entries) for compacting 4-word vectors by removing
+; the words that match zero bits and concatenating in reverse order: mask bit 0 selects word 3 (bytes 6,7), bit 3 selects word 0 (bytes 0,1). Unused slots are 0.
+wels_cavlc_param_cal_shufb_lut:
+ db 0, 0, 0, 0, 0, 0, 0, 0 ; mask 0000b
+ db 6, 7, 0, 0, 0, 0, 0, 0 ; mask 0001b
+ db 4, 5, 0, 0, 0, 0, 0, 0 ; mask 0010b
+ db 6, 7, 4, 5, 0, 0, 0, 0 ; mask 0011b
+ db 2, 3, 0, 0, 0, 0, 0, 0 ; mask 0100b
+ db 6, 7, 2, 3, 0, 0, 0, 0 ; mask 0101b
+ db 4, 5, 2, 3, 0, 0, 0, 0 ; mask 0110b
+ db 6, 7, 4, 5, 2, 3, 0, 0 ; mask 0111b
+ db 0, 1, 0, 0, 0, 0, 0, 0 ; mask 1000b
+ db 6, 7, 0, 1, 0, 0, 0, 0 ; mask 1001b
+ db 4, 5, 0, 1, 0, 0, 0, 0 ; mask 1010b
+ db 6, 7, 4, 5, 0, 1, 0, 0 ; mask 1011b
+ db 2, 3, 0, 1, 0, 0, 0, 0 ; mask 1100b
+ db 6, 7, 2, 3, 0, 1, 0, 0 ; mask 1101b
+ db 4, 5, 2, 3, 0, 1, 0, 0 ; mask 1110b
+ db 6, 7, 4, 5, 2, 3, 0, 1 ; mask 1111b
+
+
%ifdef X86_32
-SECTION .rodata align=16
align 16
sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8
@@ -312,6 +359,8 @@
db 7,6,5,4,3,2,1,7, ;254
db 7,6,5,4,3,2,1,8, ;255
+%endif ; X86_32
+
;***********************************************************************
; Code
;***********************************************************************
@@ -318,6 +367,7 @@
SECTION .text
+%ifdef X86_32
;***********************************************************************
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
@@ -457,3 +507,162 @@
pop ebx
ret
%endif
+
+;***********************************************************************
+;int32_t CavlcParamCal_sse42(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
+;
+; Converts up to 16 coefficients into run/level form, scanning from the
+; highest index downwards, four coefficients at a time.
+;
+; In:  coffLevel    coefficients; 16-byte aligned and 32 bytes readable when
+;                   endIdx > 3, otherwise 8 bytes readable. Input beyond
+;                   endIdx is assumed to be zero.
+;      run          output zero runs; the first 16 bytes are cleared with an
+;                   aligned 16-byte store, so the buffer must be 16-byte
+;                   aligned and at least 16 bytes long.
+;      Level        output levels, written 8 bytes per store in reverse
+;                   coefficient order; slots past the valid levels of a
+;                   store hold filler values.
+;      total_coeffs receives the number of non-zero coefficients.
+;      endIdx       only used to choose between the 8-byte and 32-byte load.
+; Out: 0 when no coefficient is non-zero; otherwise
+;      16 - (non-zero count) - (number of zero coefficients that follow the
+;      last non-zero coefficient).
+;***********************************************************************
+
+WELS_EXTERN CavlcParamCal_sse42
+%define p_coeff_level r0
+%define p_run r1
+%define p_level r2
+%define p_total_coeffs r3
+%define i_endidxd r4d
+
+ ; Pick a register for the non-zero bitmask and a way to address the
+ ; lookup tables for each target.
+%ifdef X86_32
+ ; r5 holds the mask and r6 takes over the pointer displaced from the
+ ; cl-backed register below; preserve the caller's values.
+ push r5
+ push r6
+ %assign push_num 2
+ %define r_mask r5
+ %define r_maskd r5d
+ %define p_shufb_lut wels_cavlc_param_cal_shufb_lut
+ %define p_run_lut wels_cavlc_param_cal_run_lut
+%elifdef WIN64
+ push rbx ; rbx is callee-saved in the Microsoft x64 ABI.
+ %assign push_num 1
+ %define r_mask rbx
+ %define r_maskd ebx
+ ; A single base register serves both tables; the run table is reached
+ ; through its assemble-time offset from the shuffle table.
+ %define p_shufb_lut r5
+ %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
+ lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
+%else
+ %assign push_num 0
+ %define r_mask rax
+ %define r_maskd eax
+ ; A single base register serves both tables; the run table is reached
+ ; through its assemble-time offset from the shuffle table.
+ %define p_shufb_lut r5
+ %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
+ lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
+%endif
+
+ LOAD_5_PARA
+ PUSH_XMM 2
+
+ ; Free up rcx/ecx because only cl is accepted as shift amount operand.
+ ; Whichever argument register aliases cl is copied to r6 and becomes the
+ ; scratch register r_tmp.
+%ifidni r0b, cl
+ mov r6, r0
+ %undef p_coeff_level
+ %define p_coeff_level r6
+ %define r_tmp r0
+ %define r_tmpd r0d
+ %define r_tmpb r0b
+%elifidni r1b, cl
+ mov r6, r1
+ %undef p_run
+ %define p_run r6
+ %define r_tmp r1
+ %define r_tmpd r1d
+ %define r_tmpb r1b
+%elifidni r3b, cl
+ mov r6, r3
+ %undef p_total_coeffs
+ %define p_total_coeffs r6
+ %define r_tmp r3
+ %define r_tmpd r3d
+ %define r_tmpb r3b
+%else
+ %error "Unknown cl register."
+%endif
+
+ ; Acquire a bitmask indicating which words are non-zero.
+ ; Assume p_coeff_level is 16-byte-aligned and at least 32 bytes if endIdx > 3.
+ ; Otherwise, assume 8 bytes available. Assume that input beyond endIdx is zero.
+ ; Assumptions are taken from previous implementations.
+ pxor xmm1, xmm1
+ cmp i_endidxd, 3
+ jg .load16
+ movq xmm0, [p_coeff_level]
+ packsswb xmm0, xmm1
+ jmp .load_done
+.load16:
+ movdqa xmm0, [p_coeff_level]
+ packsswb xmm0, [p_coeff_level + 16]
+.load_done:
+ movdqa [p_run], xmm1 ; Zero-initialize because we may read back implied zeros.
+ pcmpeqb xmm0, xmm1 ; 0FFh in every byte whose coefficient is zero.
+ pshufb xmm0, [wels_shufb_rev] ; Reverse so that bit 0 maps to coefficient 15.
+ pmovmskb r_maskd, xmm0
+ xor r_maskd, 0FFFFh ; Invert: set bits now mark non-zero coefficients.
+ ; endIdx is not needed past this point; its register becomes r_tmp2.
+%undef i_endidxd
+%define r_tmp2 r4
+%define r_tmp2d r4d
+ popcnt r_tmp2d, r_maskd ; Number of non-zero coefficients; ZF set if none.
+ mov [p_total_coeffs], r_tmp2d
+ ; Recycle the total_coeffs pointer register as the return-value accumulator.
+%xdefine i_total_zeros p_total_coeffs
+%undef p_total_coeffs
+ mov i_total_zeros, r_tmp2 ; Result is 0 on the early-out path.
+ jz .done ; ZF still comes from popcnt; mov leaves flags alone.
+ mov i_total_zeros, 16
+ sub i_total_zeros, r_tmp2
+ bsf r_tmpd, r_maskd ; Find first set bit.
+ sub i_total_zeros, r_tmp
+ ; Skip trailing zeros.
+ ; Restrict to multiples of 4 to retain alignment and avoid out-of-bound stores.
+ and r_tmpd, -4
+ shr r_maskd, r_tmpb
+ add r_tmpd, r_tmpd ; Skipped coefficients -> bytes (2 bytes each).
+ sub p_coeff_level, r_tmp
+ ; Handle first quadruple containing a non-zero value.
+ mov r_tmp, r_mask
+ and r_tmpd, 0Fh ; 4-bit mask of the current quadruple = table index.
+ movq xmm0, [p_coeff_level + 24]
+ movq xmm1, [p_shufb_lut + 8 * r_tmp]
+ pshufb xmm0, xmm1 ; Compact the non-zero words, highest index first.
+ mov r_tmp2d, [p_run_lut + 4 * r_tmp]
+ shr r_tmp2d, 8 ; Skip initial zero run.
+ movlps [p_level], xmm0 ; Store levels for the first quadruple.
+ mov [p_run], r_tmp2d ; Store accompanying zero runs thus far.
+ shr r_maskd, 4
+ jz .done
+.loop:
+ ; Increment pointers.
+ popcnt r_tmpd, r_tmpd ; Number of non-zero values handled.
+ lea p_level, [p_level + 2 * r_tmp]
+ add p_run, r_tmp
+ ; Handle next quadruple.
+ mov r_tmp, r_mask
+ and r_tmpd, 0Fh
+ movq xmm0, [p_coeff_level + 16]
+ sub p_coeff_level, 8
+ movq xmm1, [p_shufb_lut + 8 * r_tmp]
+ pshufb xmm0, xmm1
+ ; [p_run - 1] is the still-open run of the most recently stored level.
+ movzx r_tmp2d, byte [p_run - 1]
+ add r_tmp2d, [p_run_lut + 4 * r_tmp] ; Add to previous run and get eventual new runs.
+ movlps [p_level], xmm0 ; Store levels (potentially none).
+ mov [p_run - 1], r_tmp2d ; Update previous run and store eventual new runs.
+ shr r_maskd, 4
+ jnz .loop
+.done:
+ mov retrq, i_total_zeros
+ POP_XMM
+ LOAD_5_PARA_POP
+%ifdef X86_32
+ pop r6
+ pop r5
+%elifdef WIN64
+ pop rbx
+%endif
+ ret
+%undef p_coeff_level
+%undef p_run
+%undef p_level
+%undef i_total_zeros
+%undef r_mask
+%undef r_maskd
+%undef r_tmp
+%undef r_tmpd
+%undef r_tmpb
+%undef r_tmp2
+%undef r_tmp2d
+%undef p_shufb_lut
+%undef p_run_lut
--- a/test/encoder/EncUT_Cavlc.cpp
+++ b/test/encoder/EncUT_Cavlc.cpp
@@ -81,3 +81,10 @@
TestCavlcParamCal (CavlcParamCal_sse2);
}
#endif
+
+#ifdef X86_ASM
+TEST (CavlcTest, CavlcParamCal_sse42) {
+ // Run the SSE4.2 routine only when the CPU reports SSE4.2 support.
+ const bool bSse42Available = (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42) != 0;
+ if (!bSse42Available) {
+   return;
+ }
+ TestCavlcParamCal (CavlcParamCal_sse42);
+}
+#endif