shithub: libvpx

Download patch

ref: 0780f258da950e6d0a600ede35968ced93f407c5
parent: b2c8dff727e9020baed12feb40268f555cd4c8fc
parent: 2b2c0c9bda036f3f284ff476d3292af94f406b83
author: Johann <johannkoenig@google.com>
date: Thu Jan 5 05:09:39 EST 2012

Merge "Improve SSSE3 fast quantizer function"

--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@@ -80,6 +80,9 @@
     mov         rdi, [rsi + vp8_blockd_dequant]
     mov         rcx, [rsi + vp8_blockd_dqcoeff]
 
+    movdqa      xmm2, xmm1                  ;store y for getting eob
+    movdqa      xmm3, xmm5
+
     pxor        xmm1, xmm0
     pxor        xmm5, xmm4
     psubw       xmm1, xmm0
@@ -88,35 +91,30 @@
     movdqa      [rax], xmm1
     movdqa      [rax + 16], xmm5
 
-    movdqa      xmm2, [rdi]
-    movdqa      xmm3, [rdi + 16]
+    movdqa      xmm0, [rdi]
+    movdqa      xmm4, [rdi + 16]
 
-    pxor        xmm4, xmm4
-    pmullw      xmm2, xmm1
-    pmullw      xmm3, xmm5
+    pmullw      xmm0, xmm1
+    pmullw      xmm4, xmm5
+    pxor        xmm1, xmm1
 
-    pcmpeqw     xmm1, xmm4                  ;non zero mask
-    pcmpeqw     xmm5, xmm4                  ;non zero mask
-    packsswb    xmm1, xmm5
-    pshufb      xmm1, [GLOBAL(zz_shuf)]
+    pcmpgtw     xmm2, xmm1                  ;calculate eob
+    pcmpgtw     xmm3, xmm1
+    packsswb    xmm2, xmm3
+    pshufb      xmm2, [GLOBAL(zz_shuf)]
 
-    pmovmskb    edx, xmm1
+    pmovmskb    edx, xmm2
 
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax                      ;flip the bits for bsr
-    bsr         eax, edx
-
-    movdqa      [rcx], xmm2                 ;store dqcoeff
-    movdqa      [rcx + 16], xmm3            ;store dqcoeff
-
+    movdqa      [rcx], xmm0                 ;store dqcoeff
+    movdqa      [rcx + 16], xmm4            ;store dqcoeff
     mov         rcx, [rsi + vp8_blockd_eob]
 
-    sub         edi, edx                    ;check for all zeros in bit mask
-    sar         edi, 31                     ;0 or -1
+    bsr         eax, edx                    ;index of highest set bit in mask
     add         eax, 1
-    and         eax, edi                    ;if the bit mask was all zero,
-                                            ;then eob = 0
+
+    cmp         edx, 0                      ;if all 0, eob=0
+    cmove       eax, edx
+
     mov         BYTE PTR [rcx], al          ;store eob
 
     ; begin epilog