ref: e5fb4b61b66d188b3afed56f1e2548dd6e1a2074
parent: 5b872402307b6a62ee4dbe93021c003ccf99a547
author: Ronald S. Bultje <rbultje@google.com>
date: Mon Jul 1 08:03:20 EDT 2013
Use pmovmskb to skip quantize loops over empty coefficients. If none of the 16 coefficients that we quantize per loop iteration is larger than the zbin, skip directly to the next round of coefficients instead of running a full quantize iteration that would only produce 16 zeroes. This incurs a jump cost, but saves a lot of other work. 32x32 quant goes from 1349 -> 1184 cycles. The same approach yielded no significant improvement for smaller transforms, so it is not used there (8x8: 103 -> 101 cycles; 16x16: 302 -> 306 cycles). Change-Id: I8fca17dc2543fc8eed1dbcd5100145e3c3a9b647
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -15,10 +15,10 @@
SECTION .text
-%macro QUANTIZE_FN 1
-cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
- shift, qcoeff, dqcoeff, dequant, zbin_oq, \
- eob, scan, iscan
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+ eob, scan, iscan
cmp dword skipm, 0
jne .blank
@@ -43,9 +43,8 @@
mova m4, [r2] ; m4 = shift
mov r4, dqcoeffmp
mov r5, iscanmp
- mov r2, eobmp
pxor m5, m5 ; m5 = dedicated zero
- DEFINE_ARGS coeff, ncoeff, eob, qcoeff, dqcoeff, iscan
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
lea coeffq, [ coeffq+ncoeffq*2]
lea iscanq, [ iscanq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2]
@@ -119,6 +118,12 @@
%endif
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+ pmovmskb r6, m7
+ pmovmskb r2, m12
+ or r6, r2
+ jz .skip_iter
+%endif
paddw m6, m1 ; m6 += round
paddw m11, m1 ; m11 += round
pmulhw m14, m6, m2 ; m14 = m6*q>>16
@@ -159,9 +164,20 @@
pmaxsw m8, m13
add ncoeffq, mmsize
jl .ac_only_loop
+%ifidn %1, b_32x32
+ jmp .accumulate_eob
+.skip_iter:
+ mova [qcoeffq+ncoeffq*2+ 0], m5
+ mova [qcoeffq+ncoeffq*2+16], m5
+ mova [dqcoeffq+ncoeffq*2+ 0], m5
+ mova [dqcoeffq+ncoeffq*2+16], m5
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+%endif
.accumulate_eob:
; horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
pshufd m7, m8, 0xe
pmaxsw m8, m7
pshuflw m7, m8, 0xe
@@ -168,7 +184,7 @@
pmaxsw m8, m7
pshuflw m7, m8, 0x1
pmaxsw m8, m7
- pextrw [eobq], m8, 0
+ pextrw [r2], m8, 0
RET
; skip-block, i.e. just write all zeroes
@@ -194,5 +210,5 @@
%endmacro
INIT_XMM ssse3
-QUANTIZE_FN b
-QUANTIZE_FN b_32x32
+QUANTIZE_FN b, 6
+QUANTIZE_FN b_32x32, 7
--
⑨