shithub: libvpx

Download patch

ref: 8eae7fe7e872874a997018acb9a2ff7c49be2632
parent: 746439ef6c1dd2fedbe0c24ddb76d40cb9d26357
author: Fritz Koenig <frkoenig@google.com>
date: Wed Sep 15 10:07:32 EDT 2010

Better choice of instruction for the filter mask comparison.

Use pmaxub to accumulate the largest absolute difference and subtract
the limit once at the end, instead of a psubusb/por pair per
difference, to determine if any comparison goes over the limit.

Change-Id: I3f0bd7d2aabe5fee9ba6620508e2b60605abcb82

--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -11,6 +11,8 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
+; Use of pmaxub instead of psubusb to compute filter mask was seen
+; in ffvp8
 
 %macro LFH_FILTER_MASK 1
 %if %1
@@ -33,8 +35,6 @@
         psubusb     xmm2,                   xmm6              ; q3-=q2
         por         xmm1,                   xmm2              ; abs(q3-q2)
 
-        psubusb     xmm1,                   xmm7
-
 %if %1
         movdqa      xmm4,                   [rsi+rax]         ; q1
 %else
@@ -49,10 +49,8 @@
         psubusb     xmm4,                   xmm6              ; q1-=q2
         psubusb     xmm6,                   xmm3              ; q2-=q1
         por         xmm4,                   xmm6              ; abs(q2-q1)
-        psubusb     xmm4,                   xmm7
+        pmaxub      xmm1,                   xmm4
 
-        por         xmm1,                   xmm4
-
 %if %1
         movdqa      xmm4,                   [rsi]             ; q0
 %else
@@ -67,10 +65,8 @@
         psubusb     xmm3,                   xmm0              ; q1-=q0
         por         xmm4,                   xmm3              ; abs(q0-q1)
         movdqa      t0,                     xmm4              ; save to t0
+        pmaxub      xmm1,                   xmm4
 
-        psubusb     xmm4,                   xmm7
-        por         xmm1,                   xmm4
-
 %if %1
         neg         rax                     ; negate pitch to deal with above border
 
@@ -95,10 +91,8 @@
         psubusb     xmm4,                   xmm2              ; p2-=p3
         psubusb     xmm2,                   xmm5              ; p3-=p2
         por         xmm4,                   xmm2              ; abs(p3 - p2)
+        pmaxub      xmm1,                   xmm4
 
-        psubusb     xmm4,                   xmm7
-        por         xmm1,                   xmm4
-
 %if %1
         movdqa      xmm4,                   [rsi+2*rax]       ; p1
 %else
@@ -113,9 +107,8 @@
         psubusb     xmm4,                   xmm5              ; p1-=p2
         psubusb     xmm5,                   xmm3              ; p2-=p1
         por         xmm4,                   xmm5              ; abs(p2 - p1)
-        psubusb     xmm4,                   xmm7
+        pmaxub      xmm1,                   xmm4
 
-        por         xmm1,                   xmm4
         movdqa      xmm2,                   xmm3              ; p1
 
 %if %1
@@ -133,8 +126,8 @@
         por         xmm4,                   xmm3              ; abs(p1 - p0)
         movdqa        t1,                   xmm4              ; save to t1
 
-        psubusb     xmm4,                   xmm7
-        por         xmm1,                   xmm4
+        pmaxub      xmm1,                   xmm4
+        psubusb     xmm1,                   xmm7
 
 %if %1
         movdqa      xmm3,                   [rdi]             ; q1
@@ -872,19 +865,18 @@
         psubusb     xmm0,               xmm7            ; q2-q3
 
         psubusb     xmm7,               xmm6            ; q3-q2
-        por         xmm7,               xmm0            ; abs (q3-q2)
-
         movdqa      xmm4,               xmm5            ; q1
-        psubusb     xmm4,               xmm6            ; q1-q2
 
-        psubusb     xmm6,               xmm5            ; q2-q1
-        por         xmm6,               xmm4            ; abs (q2-q1)
+        por         xmm7,               xmm0            ; abs (q3-q2)
+        psubusb     xmm4,               xmm6            ; q1-q2
 
         movdqa      xmm0,               xmm1
+        psubusb     xmm6,               xmm5            ; q2-q1
 
+        por         xmm6,               xmm4            ; abs (q2-q1)
         psubusb     xmm0,               xmm2            ; p2 - p3;
-        psubusb     xmm2,               xmm1            ; p3 - p2;
 
+        psubusb     xmm2,               xmm1            ; p3 - p2;
         por         xmm0,               xmm2            ; abs(p2-p3)
 %if %1
         movdqa      xmm2,               [rdx]           ; p1
@@ -892,31 +884,20 @@
         movdqa      xmm2,               [rdx+32]        ; p1
 %endif
         movdqa      xmm5,               xmm2            ; p1
+        pmaxub      xmm0,               xmm7
 
         psubusb     xmm5,               xmm1            ; p1-p2
         psubusb     xmm1,               xmm2            ; p2-p1
 
+        movdqa      xmm7,               xmm3            ; p0
+        psubusb     xmm7,               xmm2            ; p0-p1
+
         por         xmm1,               xmm5            ; abs(p2-p1)
+        pmaxub      xmm0,               xmm6
 
-        mov         rdx,                arg(3)          ; limit
-        movdqa      xmm4,               [rdx]           ; limit
-
-        psubusb     xmm7,               xmm4
-
-        psubusb     xmm0,               xmm4            ; abs(p3-p2) > limit
-        psubusb     xmm1,               xmm4            ; abs(p2-p1) > limit
-
-        psubusb     xmm6,               xmm4            ; abs(q2-q1) > limit
-        por         xmm7,               xmm6            ; or
-
-        por         xmm0,               xmm1
-        por         xmm0,               xmm7            ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
+        pmaxub      xmm0,               xmm1
         movdqa      xmm1,               xmm2            ; p1
 
-        movdqa      xmm7,               xmm3            ; p0
-        psubusb     xmm7,               xmm2            ; p0-p1
-
         psubusb     xmm2,               xmm3            ; p1-p0
         por         xmm2,               xmm7            ; abs(p1-p0)
 
@@ -923,8 +904,8 @@
         movdqa      t0,                 xmm2            ; save abs(p1-p0)
         lea         rdx,                srct
 
-        psubusb     xmm2,               xmm4            ; abs(p1-p0)>limit
-        por         xmm0,               xmm2            ; mask
+        pmaxub      xmm0,               xmm2
+
 %if %1
         movdqa      xmm5,               [rdx+32]        ; q0
         movdqa      xmm7,               [rdx+48]        ; q1
@@ -940,9 +921,12 @@
         por         xmm7,               xmm5            ; abs(q1-q0)
 
         movdqa      t1,                 xmm7            ; save abs(q1-q0)
-        psubusb     xmm7,               xmm4            ; abs(q1-q0)> limit
 
-        por         xmm0,               xmm7            ; mask
+        mov         rdx,                arg(3)          ; limit
+        movdqa      xmm4,               [rdx]           ; limit
+
+        pmaxub      xmm0,               xmm7
+        psubusb     xmm0,               xmm4
 
         movdqa      xmm5,               xmm2            ; q1
         psubusb     xmm5,               xmm1            ; q1-=p1