shithub: libvpx

Download patch

ref: 7d799d2ced458e5988fe43bbb7f53a46ed8b877f
parent: f225211256d6df95db5063e98abaa97d5b355170
parent: 58083cb34db84349d8f138e1ee59f4aee77e6624
author: John Koleszar <jkoleszar@google.com>
date: Thu Nov 11 19:05:03 EST 2010

Merge remote branch 'origin/master' into experimental

--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -1439,7 +1439,7 @@
 
         // Boost for arf frame
         Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
-        Boost += (cpi->baseline_gf_interval * 50);
+        Boost += (i * 50);
         allocation_chunks = (i * 100) + Boost;
 
         // Normalize Altboost and allocations chunck down to prevent overflow
@@ -1738,16 +1738,6 @@
 
         vp8_avg_stats(&sectionstats);
 
-        if (sectionstats.pcnt_motion < .17)
-            cpi->section_is_low_motion = 1;
-        else
-            cpi->section_is_low_motion = 0;
-
-        if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
-            cpi->section_is_fast_motion = 1;
-        else
-            cpi->section_is_fast_motion = 0;
-
         cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
 
         Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
@@ -1980,7 +1970,14 @@
             cpi->ni_av_qi                     = cpi->worst_quality;
         }
     }
-    else
+    // The last few frames of a clip almost always have too few or too many
+    // bits and for the sake of over exact rate control we don't want to make
+    // radical adjustments to the allowed quantizer range just to use up a
+    // few surplus bits or get beneath the target rate.
+    else if ( (cpi->common.current_video_frame <
+                  (((unsigned int)cpi->total_stats->count * 255)>>8)) &&
+              ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
+                  (unsigned int)cpi->total_stats->count) )
     {
         if (frames_left < 1)
             frames_left = 1;
@@ -2344,17 +2341,7 @@
 
         vp8_avg_stats(&sectionstats);
 
-        if (sectionstats.pcnt_motion < .17)
-            cpi->section_is_low_motion = 1;
-        else
-            cpi->section_is_low_motion = 0;
-
-        if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
-            cpi->section_is_fast_motion = 1;
-        else
-            cpi->section_is_fast_motion = 0;
-
-        cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+         cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
 
         Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
         // if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -614,9 +614,6 @@
     unsigned int tempdata2;
 
     int base_skip_false_prob[128];
-    unsigned int section_is_low_motion;
-    unsigned int section_benefits_from_aggresive_q;
-    unsigned int section_is_fast_motion;
     unsigned int section_intra_rating;
 
     double section_max_qfactor;
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -8,171 +8,24 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
+
 %include "vpx_ports/x86_abi_support.asm"
 
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     ref_ptr       rdi
-  %define     ref_stride    rdx
-  %define     end_ptr       rcx
-  %define     ret_var       rbx
-  %define     result_ptr    arg(4)
-  %define     max_err       arg(4)
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
+%macro PROCESS_16X2X3 1
+%if %1
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm5,       XMMWORD PTR [rdi]
+        lddqu           xmm6,       XMMWORD PTR [rdi+1]
+        lddqu           xmm7,       XMMWORD PTR [rdi+2]
 
-    mov         rsi,        arg(0)              ; src_ptr
-    mov         rdi,        arg(2)              ; ref_ptr
-
-    movsxd      rax,        dword ptr arg(1)    ; src_stride
-    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     ref_ptr     r8
-    %define     ref_stride  r9
-    %define     end_ptr     r10
-    %define     ret_var     r11
-    %define     result_ptr  [rsp+8+4*8]
-    %define     max_err     [rsp+8+4*8]
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     ref_ptr     rdx
-    %define     ref_stride  rcx
-    %define     end_ptr     r9
-    %define     ret_var     r10
-    %define     result_ptr  r8
-    %define     max_err     r8
-  %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
-  %define     src_ptr
-  %define     src_stride
-  %define     ref_ptr
-  %define     ref_stride
-  %define     end_ptr
-  %define     ret_var
-  %define     result_ptr
-  %define     max_err
-
-%if ABI_IS_32BIT
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-  %endif
-%endif
-    ret
-%endmacro
-
-%macro STACK_FRAME_CREATE_X4 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     r0_ptr        rcx
-  %define     r1_ptr        rdx
-  %define     r2_ptr        rbx
-  %define     r3_ptr        rdi
-  %define     ref_stride    rbp
-  %define     result_ptr    arg(4)
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    push        rbx
-
-    push        rbp
-    mov         rdi,        arg(2)              ; ref_ptr_base
-
-    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
-
-    mov         rsi,        arg(0)              ; src_ptr
-
-    movsxd      rbx,        dword ptr arg(1)    ; src_stride
-    movsxd      rbp,        dword ptr arg(3)    ; ref_stride
-
-    xchg        rbx,        rax
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     r0_ptr      rsi
-    %define     r1_ptr      r10
-    %define     r2_ptr      r11
-    %define     r3_ptr      r8
-    %define     ref_stride  r9
-    %define     result_ptr  [rsp+16+4*8]
-    push        rsi
-
-    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     r0_ptr      r9
-    %define     r1_ptr      r10
-    %define     r2_ptr      r11
-    %define     r3_ptr      rdx
-    %define     ref_stride  rcx
-    %define     result_ptr  r8
-
-    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
-
-  %endif
-%endif
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X4 0
-  %define     src_ptr
-  %define     src_stride
-  %define     r0_ptr
-  %define     r1_ptr
-  %define     r2_ptr
-  %define     r3_ptr
-  %define     ref_stride
-  %define     result_ptr
-
-%if ABI_IS_32BIT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    pop         rsi
-  %endif
-%endif
-    ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm5,       XMMWORD PTR [%3]
-        lddqu           xmm6,       XMMWORD PTR [%3+1]
-        lddqu           xmm7,       XMMWORD PTR [%3+2]
-
         psadbw          xmm5,       xmm0
         psadbw          xmm6,       xmm0
         psadbw          xmm7,       xmm0
 %else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%3+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+2]
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm1,       XMMWORD PTR [rdi]
+        lddqu           xmm2,       XMMWORD PTR [rdi+1]
+        lddqu           xmm3,       XMMWORD PTR [rdi+2]
 
         psadbw          xmm1,       xmm0
         psadbw          xmm2,       xmm0
@@ -182,15 +35,13 @@
         paddw           xmm6,       xmm2
         paddw           xmm7,       xmm3
 %endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%4]
-        lddqu           xmm1,       XMMWORD PTR [%3+%5]
-        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
+        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
+        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
+        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
+        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
 
-%if %1==0 || %1==1
-        lea             %2,         [%2+%4*2]
-        lea             %3,         [%3+%5*2]
-%endif
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
 
         psadbw          xmm1,       xmm0
         psadbw          xmm2,       xmm0
@@ -201,21 +52,21 @@
         paddw           xmm7,       xmm3
 %endmacro
 
-%macro PROCESS_8X2X3 5
-%if %1==0
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm5,       QWORD PTR [%3]
-        movq            mm6,       QWORD PTR [%3+1]
-        movq            mm7,       QWORD PTR [%3+2]
+%macro PROCESS_8X2X3 1
+%if %1
+        movq            mm0,       QWORD PTR [rsi]
+        movq            mm5,       QWORD PTR [rdi]
+        movq            mm6,       QWORD PTR [rdi+1]
+        movq            mm7,       QWORD PTR [rdi+2]
 
         psadbw          mm5,       mm0
         psadbw          mm6,       mm0
         psadbw          mm7,       mm0
 %else
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm1,       QWORD PTR [%3]
-        movq            mm2,       QWORD PTR [%3+1]
-        movq            mm3,       QWORD PTR [%3+2]
+        movq            mm0,       QWORD PTR [rsi]
+        movq            mm1,       QWORD PTR [rdi]
+        movq            mm2,       QWORD PTR [rdi+1]
+        movq            mm3,       QWORD PTR [rdi+2]
 
         psadbw          mm1,       mm0
         psadbw          mm2,       mm0
@@ -225,15 +76,13 @@
         paddw           mm6,       mm2
         paddw           mm7,       mm3
 %endif
-        movq            mm0,       QWORD PTR [%2+%4]
-        movq            mm1,       QWORD PTR [%3+%5]
-        movq            mm2,       QWORD PTR [%3+%5+1]
-        movq            mm3,       QWORD PTR [%3+%5+2]
+        movq            mm0,       QWORD PTR [rsi+rax]
+        movq            mm1,       QWORD PTR [rdi+rdx]
+        movq            mm2,       QWORD PTR [rdi+rdx+1]
+        movq            mm3,       QWORD PTR [rdi+rdx+2]
 
-%if %1==0 || %1==1
-        lea             %2,        [%2+%4*2]
-        lea             %3,        [%3+%5*2]
-%endif
+        lea             rsi,       [rsi+rax*2]
+        lea             rdi,       [rdi+rdx*2]
 
         psadbw          mm1,       mm0
         psadbw          mm2,       mm0
@@ -252,13 +101,13 @@
         mov             %5,         [%1+REG_SZ_BYTES*3]
 %endmacro
 
-%macro PROCESS_16X2X4 8
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm4,       XMMWORD PTR [%3]
-        lddqu           xmm5,       XMMWORD PTR [%4]
-        lddqu           xmm6,       XMMWORD PTR [%5]
-        lddqu           xmm7,       XMMWORD PTR [%6]
+%macro PROCESS_16X2X4 1
+%if %1
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm4,       XMMWORD PTR [rcx]
+        lddqu           xmm5,       XMMWORD PTR [rdx]
+        lddqu           xmm6,       XMMWORD PTR [rbx]
+        lddqu           xmm7,       XMMWORD PTR [rdi]
 
         psadbw          xmm4,       xmm0
         psadbw          xmm5,       xmm0
@@ -265,10 +114,10 @@
         psadbw          xmm6,       xmm0
         psadbw          xmm7,       xmm0
 %else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%4]
-        lddqu           xmm3,       XMMWORD PTR [%5]
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm1,       XMMWORD PTR [rcx]
+        lddqu           xmm2,       XMMWORD PTR [rdx]
+        lddqu           xmm3,       XMMWORD PTR [rbx]
 
         psadbw          xmm1,       xmm0
         psadbw          xmm2,       xmm0
@@ -275,7 +124,7 @@
         psadbw          xmm3,       xmm0
 
         paddw           xmm4,       xmm1
-        lddqu           xmm1,       XMMWORD PTR [%6]
+        lddqu           xmm1,       XMMWORD PTR [rdi]
         paddw           xmm5,       xmm2
         paddw           xmm6,       xmm3
 
@@ -282,10 +131,10 @@
         psadbw          xmm1,       xmm0
         paddw           xmm7,       xmm1
 %endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%7]
-        lddqu           xmm1,       XMMWORD PTR [%3+%8]
-        lddqu           xmm2,       XMMWORD PTR [%4+%8]
-        lddqu           xmm3,       XMMWORD PTR [%5+%8]
+        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
+        lddqu           xmm1,       XMMWORD PTR [rcx+rbp]
+        lddqu           xmm2,       XMMWORD PTR [rdx+rbp]
+        lddqu           xmm3,       XMMWORD PTR [rbx+rbp]
 
         psadbw          xmm1,       xmm0
         psadbw          xmm2,       xmm0
@@ -292,31 +141,30 @@
         psadbw          xmm3,       xmm0
 
         paddw           xmm4,       xmm1
-        lddqu           xmm1,       XMMWORD PTR [%6+%8]
+        lddqu           xmm1,       XMMWORD PTR [rdi+rbp]
         paddw           xmm5,       xmm2
         paddw           xmm6,       xmm3
 
-%if %1==0 || %1==1
-        lea             %2,         [%2+%7*2]
-        lea             %3,         [%3+%8*2]
+        lea             rsi,        [rsi+rax*2]
+        lea             rcx,        [rcx+rbp*2]
 
-        lea             %4,         [%4+%8*2]
-        lea             %5,         [%5+%8*2]
+        lea             rdx,        [rdx+rbp*2]
+        lea             rbx,        [rbx+rbp*2]
 
-        lea             %6,         [%6+%8*2]
-%endif
+        lea             rdi,        [rdi+rbp*2]
+
         psadbw          xmm1,       xmm0
         paddw           xmm7,       xmm1
 
 %endmacro
 
-%macro PROCESS_8X2X4 8
-%if %1==0
-        movq            mm0,        QWORD PTR [%2]
-        movq            mm4,        QWORD PTR [%3]
-        movq            mm5,        QWORD PTR [%4]
-        movq            mm6,        QWORD PTR [%5]
-        movq            mm7,        QWORD PTR [%6]
+%macro PROCESS_8X2X4 1
+%if %1
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm4,        QWORD PTR [rcx]
+        movq            mm5,        QWORD PTR [rdx]
+        movq            mm6,        QWORD PTR [rbx]
+        movq            mm7,        QWORD PTR [rdi]
 
         psadbw          mm4,        mm0
         psadbw          mm5,        mm0
@@ -323,10 +171,10 @@
         psadbw          mm6,        mm0
         psadbw          mm7,        mm0
 %else
-        movq            mm0,        QWORD PTR [%2]
-        movq            mm1,        QWORD PTR [%3]
-        movq            mm2,        QWORD PTR [%4]
-        movq            mm3,        QWORD PTR [%5]
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rcx]
+        movq            mm2,        QWORD PTR [rdx]
+        movq            mm3,        QWORD PTR [rbx]
 
         psadbw          mm1,        mm0
         psadbw          mm2,        mm0
@@ -333,7 +181,7 @@
         psadbw          mm3,        mm0
 
         paddw           mm4,        mm1
-        movq            mm1,        QWORD PTR [%6]
+        movq            mm1,        QWORD PTR [rdi]
         paddw           mm5,        mm2
         paddw           mm6,        mm3
 
@@ -340,10 +188,10 @@
         psadbw          mm1,        mm0
         paddw           mm7,        mm1
 %endif
-        movq            mm0,        QWORD PTR [%2+%7]
-        movq            mm1,        QWORD PTR [%3+%8]
-        movq            mm2,        QWORD PTR [%4+%8]
-        movq            mm3,        QWORD PTR [%5+%8]
+        movq            mm0,        QWORD PTR [rsi+rax]
+        movq            mm1,        QWORD PTR [rcx+rbp]
+        movq            mm2,        QWORD PTR [rdx+rbp]
+        movq            mm3,        QWORD PTR [rbx+rbp]
 
         psadbw          mm1,        mm0
         psadbw          mm2,        mm0
@@ -350,19 +198,18 @@
         psadbw          mm3,        mm0
 
         paddw           mm4,        mm1
-        movq            mm1,        QWORD PTR [%6+%8]
+        movq            mm1,        QWORD PTR [rdi+rbp]
         paddw           mm5,        mm2
         paddw           mm6,        mm3
 
-%if %1==0 || %1==1
-        lea             %2,         [%2+%7*2]
-        lea             %3,         [%3+%8*2]
+        lea             rsi,        [rsi+rax*2]
+        lea             rcx,        [rcx+rbp*2]
 
-        lea             %4,         [%4+%8*2]
-        lea             %5,         [%5+%8*2]
+        lea             rdx,        [rdx+rbp*2]
+        lea             rbx,        [rbx+rbp*2]
 
-        lea             %6,         [%6+%8*2]
-%endif
+        lea             rdi,        [rdi+rbp*2]
+
         psadbw          mm1,        mm0
         paddw           mm7,        mm1
 
@@ -376,39 +223,54 @@
 ;    int  *results)
 global sym(vp8_sad16x16x3_sse3)
 sym(vp8_sad16x16x3_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
 
-    STACK_FRAME_CREATE_X3
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
 
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        mov             rcx,        result_ptr
+        PROCESS_16X2X3 1
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
 
+        mov             rdi,        arg(4) ;Results
+
         movq            xmm0,       xmm5
         psrldq          xmm5,       8
 
         paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
+        movd            [rdi],      xmm0
 ;-
         movq            xmm0,       xmm6
         psrldq          xmm6,       8
 
         paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
+        movd            [rdi+4],    xmm0
 ;-
         movq            xmm0,       xmm7
         psrldq          xmm7,       8
 
         paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
+        movd            [rdi+8],    xmm0
 
-    STACK_FRAME_DESTROY_X3
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
 
 ;void int vp8_sad16x8x3_sse3(
 ;    unsigned char *src_ptr,
@@ -418,35 +280,50 @@
 ;    int  *results)
 global sym(vp8_sad16x8x3_sse3)
 sym(vp8_sad16x8x3_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
 
-    STACK_FRAME_CREATE_X3
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
 
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        mov             rcx,        result_ptr
+        PROCESS_16X2X3 1
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
 
+        mov             rdi,        arg(4) ;Results
+
         movq            xmm0,       xmm5
         psrldq          xmm5,       8
 
         paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
+        movd            [rdi],      xmm0
 ;-
         movq            xmm0,       xmm6
         psrldq          xmm6,       8
 
         paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
+        movd            [rdi+4],    xmm0
 ;-
         movq            xmm0,       xmm7
         psrldq          xmm7,       8
 
         paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
+        movd            [rdi+8],    xmm0
 
-    STACK_FRAME_DESTROY_X3
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
 
 ;void int vp8_sad8x16x3_sse3(
 ;    unsigned char *src_ptr,
@@ -456,26 +333,40 @@
 ;    int  *results)
 global sym(vp8_sad8x16x3_sse3)
 sym(vp8_sad8x16x3_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
 
-    STACK_FRAME_CREATE_X3
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
 
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        mov             rcx,        result_ptr
+        PROCESS_8X2X3 1
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
 
-        punpckldq       mm5,        mm6
+        mov             rdi,        arg(4) ;Results
 
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
+        movd            [rdi],      mm5
+        movd            [rdi+4],    mm6
+        movd            [rdi+8],    mm7
 
-    STACK_FRAME_DESTROY_X3
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
 
 ;void int vp8_sad8x8x3_sse3(
 ;    unsigned char *src_ptr,
@@ -485,22 +376,36 @@
 ;    int  *results)
 global sym(vp8_sad8x8x3_sse3)
 sym(vp8_sad8x8x3_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
 
-    STACK_FRAME_CREATE_X3
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
 
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        mov             rcx,        result_ptr
+        PROCESS_8X2X3 1
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
 
-        punpckldq       mm5,        mm6
+        mov             rdi,        arg(4) ;Results
 
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
+        movd            [rdi],      mm5
+        movd            [rdi+4],    mm6
+        movd            [rdi+8],    mm7
 
-    STACK_FRAME_DESTROY_X3
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
 
 ;void int vp8_sad4x4x3_sse3(
 ;    unsigned char *src_ptr,
@@ -510,23 +415,33 @@
 ;    int  *results)
 global sym(vp8_sad4x4x3_sse3)
 sym(vp8_sad4x4x3_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
 
-    STACK_FRAME_CREATE_X3
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
 
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [ref_ptr]
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
+        movd            mm0,        DWORD PTR [rsi]
+        movd            mm1,        DWORD PTR [rdi]
 
+        movd            mm2,        DWORD PTR [rsi+rax]
+        movd            mm3,        DWORD PTR [rdi+rdx]
+
         punpcklbw       mm0,        mm2
         punpcklbw       mm1,        mm3
 
-        movd            mm4,        DWORD PTR [ref_ptr+1]
-        movd            mm5,        DWORD PTR [ref_ptr+2]
+        movd            mm4,        DWORD PTR [rdi+1]
+        movd            mm5,        DWORD PTR [rdi+2]
 
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
+        movd            mm2,        DWORD PTR [rdi+rdx+1]
+        movd            mm3,        DWORD PTR [rdi+rdx+2]
 
         psadbw          mm1,        mm0
 
@@ -536,27 +451,29 @@
         psadbw          mm4,        mm0
         psadbw          mm5,        mm0
 
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
 
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [ref_ptr]
 
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
 
+        movd            mm0,        DWORD PTR [rsi]
+        movd            mm2,        DWORD PTR [rdi]
+
+        movd            mm3,        DWORD PTR [rsi+rax]
+        movd            mm6,        DWORD PTR [rdi+rdx]
+
         punpcklbw       mm0,        mm3
         punpcklbw       mm2,        mm6
 
-        movd            mm3,        DWORD PTR [ref_ptr+1]
-        movd            mm7,        DWORD PTR [ref_ptr+2]
+        movd            mm3,        DWORD PTR [rdi+1]
+        movd            mm7,        DWORD PTR [rdi+2]
 
         psadbw          mm2,        mm0
 
         paddw           mm1,        mm2
 
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
+        movd            mm2,        DWORD PTR [rdi+rdx+1]
+        movd            mm6,        DWORD PTR [rdi+rdx+2]
 
         punpcklbw       mm3,        mm2
         punpcklbw       mm7,        mm6
@@ -567,14 +484,19 @@
         paddw           mm3,        mm4
         paddw           mm7,        mm5
 
-        mov             rcx,        result_ptr
+        mov             rdi,        arg(4) ;Results
+        movd            [rdi],      mm1
 
-        punpckldq       mm1,        mm3
+        movd            [rdi+4],    mm3
+        movd            [rdi+8],    mm7
 
-        movq            [rcx],      mm1
-        movd            [rcx+8],    mm7
 
-    STACK_FRAME_DESTROY_X3
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
 
 ;unsigned int vp8_sad16x16_sse3(
 ;    unsigned char *src_ptr,
@@ -585,40 +507,51 @@
 ;%define lddqu movdqu
 global sym(vp8_sad16x16_sse3)
 sym(vp8_sad16x16_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rbx
+    push        rsi
+    push        rdi
+    ; end prolog
 
-    STACK_FRAME_CREATE_X3
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
 
-        lea             end_ptr,    [src_ptr+src_stride*8]
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
 
-        lea             end_ptr,    [end_ptr+src_stride*8]
+        lea             rcx,        [rsi+rbx*8]
+
+        lea             rcx,        [rcx+rbx*8]
         pxor            mm7,        mm7
 
-.vp8_sad16x16_sse3_loop:
+vp8_sad16x16_sse3_loop:
 
-        movq            ret_var,    mm7
-        cmp             ret_var,    max_err
-        jg              .vp8_sad16x16_early_exit
+        movq            rax,        mm7
+        cmp             rax,        arg(4)
+        jg              vp8_sad16x16_early_exit
 
-        movq            mm0,        QWORD PTR [src_ptr]
-        movq            mm2,        QWORD PTR [src_ptr+8]
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm2,        QWORD PTR [rsi+8]
 
-        movq            mm1,        QWORD PTR [ref_ptr]
-        movq            mm3,        QWORD PTR [ref_ptr+8]
+        movq            mm1,        QWORD PTR [rdi]
+        movq            mm3,        QWORD PTR [rdi+8]
 
-        movq            mm4,        QWORD PTR [src_ptr+src_stride]
-        movq            mm5,        QWORD PTR [ref_ptr+ref_stride]
+        movq            mm4,        QWORD PTR [rsi+rbx]
+        movq            mm5,        QWORD PTR [rdi+rdx]
 
         psadbw          mm0,        mm1
         psadbw          mm2,        mm3
 
-        movq            mm1,        QWORD PTR [src_ptr+src_stride+8]
-        movq            mm3,        QWORD PTR [ref_ptr+ref_stride+8]
+        movq            mm1,        QWORD PTR [rsi+rbx+8]
+        movq            mm3,        QWORD PTR [rdi+rdx+8]
 
         psadbw          mm4,        mm5
         psadbw          mm1,        mm3
 
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
+        lea             rsi,        [rsi+rbx*2]
+        lea             rdi,        [rdi+rdx*2]
 
         paddw           mm0,        mm2
         paddw           mm4,        mm1
@@ -626,17 +559,21 @@
         paddw           mm7,        mm0
         paddw           mm7,        mm4
 
-        cmp             src_ptr,    end_ptr
-        jne             .vp8_sad16x16_sse3_loop
+        cmp             rsi,        rcx
+        jne             vp8_sad16x16_sse3_loop
 
-        movq            ret_var,    mm7
+        movq            rax,        mm7
 
-.vp8_sad16x16_early_exit:
+vp8_sad16x16_early_exit:
 
-        mov             rax,        ret_var
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    pop         rbx
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
 
-    STACK_FRAME_DESTROY_X3
-
 ;void vp8_sad16x16x4d_sse3(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
@@ -645,48 +582,69 @@
 ;    int  *results)
 global sym(vp8_sad16x16x4d_sse3)
 sym(vp8_sad16x16x4d_sse3):
+    push        rbp                    ; save caller's frame pointer
+    mov         rbp, rsp               ; rbp = frame base for this function
+    SHADOW_ARGS_TO_STACK 5             ; macro defined elsewhere; 5 matches the arguments arg(0)..arg(4) read below
+    push        rsi                    ; rsi, rdi and rbx are overwritten below and restored in the epilog
+    push        rdi
+    push        rbx
+    ; end prolog
 
-    STACK_FRAME_CREATE_X4
+        push            rbp            ; keep the frame pointer on the stack: rbp is reused for ref_stride below
+        mov             rdi,        arg(2) ; rdi = ref_ptr_base
 
-        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi ; macro defined elsewhere; presumably loads the four reference pointers found at [rdi] into rcx, rdx, rax, rdi -- TODO confirm against the macro
 
-%if ABI_IS_32BIT
+        mov             rsi,        arg(0) ; rsi = src_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ; rbx = src_stride (moved into rax by the xchg below)
+        movsxd          rbp,        dword ptr arg(3) ; rbp = ref_stride; NOTE(review): assumes arg() is rbp-relative, so no arg() may be used until rbp is popped
+
+        xchg            rbx,        rax ; swap: rax = src_stride, rbx = value LOAD_X4_ADDRESSES left in rax
+
+        PROCESS_16X2X4 1               ; macro defined elsewhere; NOTE(review): 1 on the first call vs 0 afterwards presumably selects initialise vs accumulate -- confirm
+        PROCESS_16X2X4 0               ; eight invocations in total; each presumably handles two rows (per the macro name)
+        PROCESS_16X2X4 0
+        PROCESS_16X2X4 0
+        PROCESS_16X2X4 0
+        PROCESS_16X2X4 0
+        PROCESS_16X2X4 0
+        PROCESS_16X2X4 0
+
         pop             rbp
-%endif
-        mov             rcx,        result_ptr
+        mov             rdi,        arg(4) ; rdi = Results; arg() is usable again because rbp was restored just above
 
         movq            xmm0,       xmm4
         psrldq          xmm4,       8
 
         paddw           xmm0,       xmm4
-        movd            [rcx],      xmm0
+        movd            [rdi],      xmm0 ; Results[0] = low dword of the folded xmm4 sum
 ;-
         movq            xmm0,       xmm5
         psrldq          xmm5,       8
 
         paddw           xmm0,       xmm5
-        movd            [rcx+4],    xmm0
+        movd            [rdi+4],    xmm0 ; Results[1] = low dword of the folded xmm5 sum
 ;-
         movq            xmm0,       xmm6
         psrldq          xmm6,       8
 
         paddw           xmm0,       xmm6
-        movd            [rcx+8],    xmm0
+        movd            [rdi+8],    xmm0 ; Results[2] = low dword of the folded xmm6 sum
 ;-
         movq            xmm0,       xmm7
         psrldq          xmm7,       8
 
         paddw           xmm0,       xmm7
-        movd            [rcx+12],   xmm0
+        movd            [rdi+12],   xmm0 ; Results[3] = low dword of the folded xmm7 sum
 
-    STACK_FRAME_DESTROY_X4
+    ; begin epilog
+    pop         rbx                    ; restore saved registers in reverse order of the prolog pushes
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS                      ; macro defined elsewhere; presumably the counterpart of SHADOW_ARGS_TO_STACK
+    pop         rbp                    ; restore caller's frame pointer
+    ret
 
 ;void vp8_sad16x8x4d_sse3(
 ;    unsigned char *src_ptr,
@@ -696,44 +654,65 @@
 ;    int  *results)
 global sym(vp8_sad16x8x4d_sse3)
 sym(vp8_sad16x8x4d_sse3):
+    push        rbp                    ; save caller's frame pointer
+    mov         rbp, rsp               ; rbp = frame base for this function
+    SHADOW_ARGS_TO_STACK 5             ; macro defined elsewhere; 5 matches the arguments arg(0)..arg(4) read below
+    push        rsi                    ; rsi, rdi and rbx are overwritten below and restored in the epilog
+    push        rdi
+    push        rbx
+    ; end prolog
 
-    STACK_FRAME_CREATE_X4
+        push            rbp            ; keep the frame pointer on the stack: rbp is reused for ref_stride below
+        mov             rdi,        arg(2) ; rdi = ref_ptr_base
 
-        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi ; macro defined elsewhere; presumably loads the four reference pointers found at [rdi] into rcx, rdx, rax, rdi -- TODO confirm against the macro
 
-%if ABI_IS_32BIT
+        mov             rsi,        arg(0) ; rsi = src_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ; rbx = src_stride (moved into rax by the xchg below)
+        movsxd          rbp,        dword ptr arg(3) ; rbp = ref_stride; NOTE(review): assumes arg() is rbp-relative, so no arg() may be used until rbp is popped
+
+        xchg            rbx,        rax ; swap: rax = src_stride, rbx = value LOAD_X4_ADDRESSES left in rax
+
+        PROCESS_16X2X4 1               ; macro defined elsewhere; NOTE(review): 1 on the first call vs 0 afterwards presumably selects initialise vs accumulate -- confirm
+        PROCESS_16X2X4 0               ; four invocations in total; each presumably handles two rows (per the macro name)
+        PROCESS_16X2X4 0
+        PROCESS_16X2X4 0
+
         pop             rbp
-%endif
-        mov             rcx,        result_ptr
+        mov             rdi,        arg(4) ; rdi = Results; arg() is usable again because rbp was restored just above
 
         movq            xmm0,       xmm4
         psrldq          xmm4,       8
 
         paddw           xmm0,       xmm4
-        movd            [rcx],      xmm0
+        movd            [rdi],      xmm0 ; Results[0] = low dword of the folded xmm4 sum
 ;-
         movq            xmm0,       xmm5
         psrldq          xmm5,       8
 
         paddw           xmm0,       xmm5
-        movd            [rcx+4],    xmm0
+        movd            [rdi+4],    xmm0 ; Results[1] = low dword of the folded xmm5 sum
 ;-
         movq            xmm0,       xmm6
         psrldq          xmm6,       8
 
         paddw           xmm0,       xmm6
-        movd            [rcx+8],    xmm0
+        movd            [rdi+8],    xmm0 ; Results[2] = low dword of the folded xmm6 sum
 ;-
         movq            xmm0,       xmm7
         psrldq          xmm7,       8
 
         paddw           xmm0,       xmm7
-        movd            [rcx+12],   xmm0
+        movd            [rdi+12],   xmm0 ; Results[3] = low dword of the folded xmm7 sum
 
-    STACK_FRAME_DESTROY_X4
+    ; begin epilog
+    pop         rbx                    ; restore saved registers in reverse order of the prolog pushes
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS                      ; macro defined elsewhere; presumably the counterpart of SHADOW_ARGS_TO_STACK
+    pop         rbp                    ; restore caller's frame pointer
+    ret
 
 ;void int vp8_sad8x16x4d_sse3(
 ;    unsigned char *src_ptr,
@@ -743,31 +722,51 @@
 ;    int  *results)
 global sym(vp8_sad8x16x4d_sse3)
 sym(vp8_sad8x16x4d_sse3):
+    push        rbp                    ; save caller's frame pointer
+    mov         rbp, rsp               ; rbp = frame base for this function
+    SHADOW_ARGS_TO_STACK 5             ; macro defined elsewhere; 5 matches the arguments arg(0)..arg(4) read below
+    push        rsi                    ; rsi, rdi and rbx are overwritten below and restored in the epilog
+    push        rdi
+    push        rbx
+    ; end prolog
 
-    STACK_FRAME_CREATE_X4
+        push            rbp            ; keep the frame pointer on the stack: rbp is reused for ref_stride below
+        mov             rdi,        arg(2) ; rdi = ref_ptr_base
 
-        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi ; macro defined elsewhere; presumably loads the four reference pointers found at [rdi] into rcx, rdx, rax, rdi -- TODO confirm against the macro
 
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
+        mov             rsi,        arg(0) ; rsi = src_ptr
 
-        punpckldq       mm4,        mm5
-        punpckldq       mm6,        mm7
+        movsxd          rbx,        dword ptr arg(1) ; rbx = src_stride (moved into rax by the xchg below)
+        movsxd          rbp,        dword ptr arg(3) ; rbp = ref_stride; NOTE(review): assumes arg() is rbp-relative, so no arg() may be used until rbp is popped
 
-        movq            [rcx],      mm4
-        movq            [rcx+8],    mm6
+        xchg            rbx,        rax ; swap: rax = src_stride, rbx = value LOAD_X4_ADDRESSES left in rax
 
-    STACK_FRAME_DESTROY_X4
+        PROCESS_8X2X4 1                ; macro defined elsewhere; NOTE(review): 1 on the first call vs 0 afterwards presumably selects initialise vs accumulate -- confirm
+        PROCESS_8X2X4 0                ; eight invocations in total; each presumably handles two rows (per the macro name)
+        PROCESS_8X2X4 0
+        PROCESS_8X2X4 0
+        PROCESS_8X2X4 0
+        PROCESS_8X2X4 0
+        PROCESS_8X2X4 0
+        PROCESS_8X2X4 0
 
+        pop             rbp            ; restore the frame pointer saved after the prolog; ref_stride is no longer needed
+        mov             rdi,        arg(4) ; rdi = Results; arg() is usable again now that rbp is restored
+
+        movd            [rdi],      mm4 ; Results[0] = low dword of mm4
+        movd            [rdi+4],    mm5 ; Results[1] = low dword of mm5
+        movd            [rdi+8],    mm6 ; Results[2] = low dword of mm6
+        movd            [rdi+12],   mm7 ; Results[3] = low dword of mm7
+
+    ; begin epilog
+    pop         rbx                    ; restore saved registers in reverse order of the prolog pushes
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS                      ; macro defined elsewhere; presumably the counterpart of SHADOW_ARGS_TO_STACK
+    pop         rbp                    ; restore caller's frame pointer
+    ret
+
 ;void int vp8_sad8x8x4d_sse3(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
@@ -776,27 +775,47 @@
 ;    int  *results)
 global sym(vp8_sad8x8x4d_sse3)
 sym(vp8_sad8x8x4d_sse3):
+    push        rbp                    ; save caller's frame pointer
+    mov         rbp, rsp               ; rbp = frame base for this function
+    SHADOW_ARGS_TO_STACK 5             ; macro defined elsewhere; 5 matches the arguments arg(0)..arg(4) read below
+    push        rsi                    ; rsi, rdi and rbx are overwritten below and restored in the epilog
+    push        rdi
+    push        rbx
+    ; end prolog
 
-    STACK_FRAME_CREATE_X4
+        push            rbp            ; keep the frame pointer on the stack: rbp is reused for ref_stride below
+        mov             rdi,        arg(2) ; rdi = ref_ptr_base
 
-        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi ; macro defined elsewhere; presumably loads the four reference pointers found at [rdi] into rcx, rdx, rax, rdi -- TODO confirm against the macro
 
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
+        mov             rsi,        arg(0) ; rsi = src_ptr
 
-        punpckldq       mm4,        mm5
-        punpckldq       mm6,        mm7
+        movsxd          rbx,        dword ptr arg(1) ; rbx = src_stride (moved into rax by the xchg below)
+        movsxd          rbp,        dword ptr arg(3) ; rbp = ref_stride; NOTE(review): assumes arg() is rbp-relative, so no arg() may be used until rbp is popped
 
-        movq            [rcx],      mm4
-        movq            [rcx+8],    mm6
+        xchg            rbx,        rax ; swap: rax = src_stride, rbx = value LOAD_X4_ADDRESSES left in rax
 
-    STACK_FRAME_DESTROY_X4
+        PROCESS_8X2X4 1                ; macro defined elsewhere; NOTE(review): 1 on the first call vs 0 afterwards presumably selects initialise vs accumulate -- confirm
+        PROCESS_8X2X4 0                ; four invocations in total; each presumably handles two rows (per the macro name)
+        PROCESS_8X2X4 0
+        PROCESS_8X2X4 0
 
+        pop             rbp            ; restore the frame pointer saved after the prolog; ref_stride is no longer needed
+        mov             rdi,        arg(4) ; rdi = Results; arg() is usable again now that rbp is restored
+
+        movd            [rdi],      mm4 ; Results[0] = low dword of mm4
+        movd            [rdi+4],    mm5 ; Results[1] = low dword of mm5
+        movd            [rdi+8],    mm6 ; Results[2] = low dword of mm6
+        movd            [rdi+12],   mm7 ; Results[3] = low dword of mm7
+
+    ; begin epilog
+    pop         rbx                    ; restore saved registers in reverse order of the prolog pushes
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS                      ; macro defined elsewhere; presumably the counterpart of SHADOW_ARGS_TO_STACK
+    pop         rbp                    ; restore caller's frame pointer
+    ret
+
 ;void int vp8_sad4x4x4d_sse3(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
@@ -805,26 +824,43 @@
 ;    int  *results)
 global sym(vp8_sad4x4x4d_sse3)
 sym(vp8_sad4x4x4d_sse3):
+    push        rbp                    ; save caller's frame pointer
+    mov         rbp, rsp               ; rbp = frame base for this function
+    SHADOW_ARGS_TO_STACK 5             ; macro defined elsewhere; 5 matches the arguments arg(0)..arg(4) read below
+    push        rsi                    ; rsi, rdi and rbx are overwritten below and restored in the epilog
+    push        rdi
+    push        rbx
+    ; end prolog
 
-    STACK_FRAME_CREATE_X4
+        push            rbp            ; keep the frame pointer on the stack: rbp is reused for ref_stride below
+        mov             rdi,        arg(2) ; rdi = ref_ptr_base
 
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [r0_ptr]
+        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi ; macro defined elsewhere; the loads below use rcx, rdx, rbx (after the xchg) and rdi where the old code used r0_ptr..r3_ptr
 
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [r0_ptr+ref_stride]
+        mov             rsi,        arg(0) ; rsi = src_ptr
 
+        movsxd          rbx,        dword ptr arg(1) ; rbx = src_stride (moved into rax by the xchg below)
+        movsxd          rbp,        dword ptr arg(3) ; rbp = ref_stride; NOTE(review): assumes arg() is rbp-relative, so no arg() may be used until rbp is popped
+
+        xchg            rbx,        rax ; swap: rax = src_stride, rbx = third reference pointer (old r2_ptr)
+
+        movd            mm0,        DWORD PTR [rsi] ; 4 source bytes at src_ptr
+        movd            mm1,        DWORD PTR [rcx] ; 4 bytes at first reference pointer (old r0_ptr)
+
+        movd            mm2,        DWORD PTR [rsi+rax] ; 4 source bytes one src_stride further
+        movd            mm3,        DWORD PTR [rcx+rbp] ; 4 reference bytes one ref_stride further
+
         punpcklbw       mm0,        mm2
         punpcklbw       mm1,        mm3
 
-        movd            mm4,        DWORD PTR [r1_ptr]
-        movd            mm5,        DWORD PTR [r2_ptr]
+        movd            mm4,        DWORD PTR [rdx] ; second reference pointer (old r1_ptr)
+        movd            mm5,        DWORD PTR [rbx] ; third reference pointer (old r2_ptr)
 
-        movd            mm6,        DWORD PTR [r3_ptr]
-        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
+        movd            mm6,        DWORD PTR [rdi] ; fourth reference pointer (old r3_ptr)
+        movd            mm2,        DWORD PTR [rdx+rbp]
 
-        movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
-        movd            mm7,        DWORD PTR [r3_ptr+ref_stride]
+        movd            mm3,        DWORD PTR [rbx+rbp]
+        movd            mm7,        DWORD PTR [rdi+rbp]
 
         psadbw          mm1,        mm0
 
@@ -839,40 +875,37 @@
 
 
 
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             r0_ptr,     [r0_ptr+ref_stride*2]
+        lea             rsi,        [rsi+rax*2] ; advance source pointer by two strides
+        lea             rcx,        [rcx+rbp*2] ; advance each reference pointer by two ref strides
 
-        lea             r1_ptr,     [r1_ptr+ref_stride*2]
-        lea             r2_ptr,     [r2_ptr+ref_stride*2]
+        lea             rdx,        [rdx+rbp*2]
+        lea             rbx,        [rbx+rbp*2]
 
-        lea             r3_ptr,     [r3_ptr+ref_stride*2]
+        lea             rdi,        [rdi+rbp*2]
 
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [r0_ptr]
+        movd            mm0,        DWORD PTR [rsi]
+        movd            mm2,        DWORD PTR [rcx]
 
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm7,        DWORD PTR [r0_ptr+ref_stride]
+        movd            mm3,        DWORD PTR [rsi+rax] ; last visible use of rax as src_stride; rax is overwritten with ref_stride below
+        movd            mm7,        DWORD PTR [rcx+rbp]
 
         punpcklbw       mm0,        mm3
         punpcklbw       mm2,        mm7
 
-        movd            mm3,        DWORD PTR [r1_ptr]
-        movd            mm7,        DWORD PTR [r2_ptr]
+        movd            mm3,        DWORD PTR [rdx]
+        movd            mm7,        DWORD PTR [rbx]
 
         psadbw          mm2,        mm0
-%if ABI_IS_32BIT
         mov             rax,        rbp
 
         pop             rbp
-%define     ref_stride    rax
-%endif
-        mov             rsi,        result_ptr
+        mov             rsi,        arg(4) ; rsi = Results; rbp was restored just above and ref_stride now lives in rax
 
         paddw           mm1,        mm2
         movd            [rsi],      mm1
 
-        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
-        movd            mm1,        DWORD PTR [r2_ptr+ref_stride]
+        movd            mm2,        DWORD PTR [rdx+rax] ; rax = ref_stride here (copied from rbp before the pop)
+        movd            mm1,        DWORD PTR [rbx+rax]
 
         punpcklbw       mm3,        mm2
         punpcklbw       mm7,        mm1
@@ -880,8 +913,8 @@
         psadbw          mm3,        mm0
         psadbw          mm7,        mm0
 
-        movd            mm2,        DWORD PTR [r3_ptr]
-        movd            mm1,        DWORD PTR [r3_ptr+ref_stride]
+        movd            mm2,        DWORD PTR [rdi]
+        movd            mm1,        DWORD PTR [rdi+rax]
 
         paddw           mm3,        mm4
         paddw           mm7,        mm5
@@ -896,4 +929,10 @@
         movd            [rsi+12],   mm2
 
 
-    STACK_FRAME_DESTROY_X4
+    ; begin epilog
+    pop         rbx                    ; restore saved registers in reverse order of the prolog pushes
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS                      ; macro defined elsewhere; presumably the counterpart of SHADOW_ARGS_TO_STACK
+    pop         rbp                    ; restore caller's frame pointer
+    ret