shithub: libvpx

Download patch

ref: 7725a7eb56e74ebbdba14d01cfe85a151f81bf1c
parent: 27dad21548253f95a880e52fd0f8fb8a398b72f0
parent: 2debd5b5f75ab11bb6835b929e468f2873a88277
author: Yunqing Wang <yunqingwang@google.com>
date: Mon Feb 14 09:09:25 EST 2011

Merge "Improve vp8_sad16x16_sse3 function"

--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -586,52 +586,45 @@
 
     STACK_FRAME_CREATE_X3
 
-        lea             end_ptr,    [src_ptr+src_stride*8]
+        mov             end_ptr,    4
+        pxor            xmm7,        xmm7
 
-        lea             end_ptr,    [end_ptr+src_stride*8]
-        pxor            mm7,        mm7
-
 .vp8_sad16x16_sse3_loop:
+        movdqa          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
+        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
+        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]
 
-        movq            ret_var,    mm7
-        cmp             ret_var,    max_err
-        jg              .vp8_sad16x16_early_exit
+        lea             src_ptr,    [src_ptr+src_stride*2]
+        lea             ref_ptr,    [ref_ptr+ref_stride*2]
 
-        movq            mm0,        QWORD PTR [src_ptr]
-        movq            mm2,        QWORD PTR [src_ptr+8]
+        movdqa          xmm4,       XMMWORD PTR [src_ptr]
+        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
+        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]
 
-        movq            mm1,        QWORD PTR [ref_ptr]
-        movq            mm3,        QWORD PTR [ref_ptr+8]
+        psadbw          xmm0,       xmm1
 
-        movq            mm4,        QWORD PTR [src_ptr+src_stride]
-        movq            mm5,        QWORD PTR [ref_ptr+ref_stride]
+        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]
 
-        psadbw          mm0,        mm1
-        psadbw          mm2,        mm3
+        psadbw          xmm2,       xmm3
+        psadbw          xmm4,       xmm5
+        psadbw          xmm6,       xmm1
 
-        movq            mm1,        QWORD PTR [src_ptr+src_stride+8]
-        movq            mm3,        QWORD PTR [ref_ptr+ref_stride+8]
-
-        psadbw          mm4,        mm5
-        psadbw          mm1,        mm3
-
         lea             src_ptr,    [src_ptr+src_stride*2]
         lea             ref_ptr,    [ref_ptr+ref_stride*2]
 
-        paddw           mm0,        mm2
-        paddw           mm4,        mm1
+        paddw           xmm7,        xmm0
+        paddw           xmm7,        xmm2
+        paddw           xmm7,        xmm4
+        paddw           xmm7,        xmm6
 
-        paddw           mm7,        mm0
-        paddw           mm7,        mm4
-
-        cmp             src_ptr,    end_ptr
+        sub             end_ptr,     1
         jne             .vp8_sad16x16_sse3_loop
 
-        movq            ret_var,    mm7
-
-.vp8_sad16x16_early_exit:
-
-        mov             rax,        ret_var
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+        paddw           xmm0,       xmm7
+        movq            rax,        xmm0
 
     STACK_FRAME_DESTROY_X3