shithub: libvpx

Download patch

ref: b59bd22cc0f89bc13c0b53b1ef618cc405e5dd1f
parent: 65b44c2911a8625e0152f8d6c893d597780e94fd
parent: 4a2b684ef4b361b805be8e0db972cbe9b7e24752
author: John Koleszar <jkoleszar@google.com>
date: Wed Apr 20 20:05:08 EDT 2011

Merge remote branch 'origin/master' into experimental

Change-Id: I78a30fb4438ddd0730262691d7c120d67cbcaaa9

--- a/vp8/common/x86/idctllm_sse2.asm
+++ b/vp8/common/x86/idctllm_sse2.asm
@@ -102,7 +102,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -443,7 +443,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
--- a/vp8/common/x86/iwalsh_sse2.asm
+++ b/vp8/common/x86/iwalsh_sse2.asm
@@ -17,7 +17,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 2
-    SAVE_XMM
+    SAVE_XMM 6
     push        rsi
     push        rdi
     ; end prolog
@@ -41,7 +41,7 @@
     movdqa    xmm4, xmm0
     punpcklqdq  xmm0, xmm3          ;d1 a1
     punpckhqdq  xmm4, xmm3          ;c1 b1
-    movd    xmm7, eax
+    movd    xmm6, eax
 
     movdqa    xmm1, xmm4          ;c1 b1
     paddw   xmm4, xmm0          ;dl+cl a1+b1 aka op[4] op[0]
@@ -66,7 +66,7 @@
     pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12]
     movdqa    xmm3, xmm4          ;ip[4] ip[0]
 
-    pshufd    xmm7, xmm7, 0       ;03 03 03 03 03 03 03 03
+    pshufd    xmm6, xmm6, 0       ;03 03 03 03 03 03 03 03
 
     paddw   xmm4, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
     psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
@@ -90,8 +90,8 @@
     punpcklwd xmm5, xmm0          ; 31 21 11 01 30 20 10 00
     punpckhwd xmm1, xmm0          ; 33 23 13 03 32 22 12 02
 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    paddw   xmm5, xmm7
-    paddw   xmm1, xmm7
+    paddw   xmm5, xmm6
+    paddw   xmm1, xmm6
 
     psraw   xmm5, 3
     psraw   xmm1, 3
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -288,7 +288,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -338,7 +338,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -584,7 +584,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -634,7 +634,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1024,7 +1024,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1091,7 +1091,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1249,7 +1249,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1318,7 +1318,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1386,7 +1386,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1503,7 +1503,7 @@
     push        rbp         ; save old base pointer value.
     mov         rbp, rsp    ; set new base pointer value.
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx         ; save callee-saved reg
     push        rsi
     push        rdi
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -26,7 +26,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -256,7 +256,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -456,7 +456,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -67,7 +67,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM
+    SAVE_XMM 7
     push        rsi
     push        rdi
     ; end prolog
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -37,7 +37,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -157,7 +157,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -333,7 +333,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -428,7 +428,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -538,7 +538,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -651,7 +651,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -816,7 +816,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -908,7 +908,6 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
-    ;SAVE_XMM                          ;xmm6, xmm7 are not used here.
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -948,7 +947,6 @@
     pop rdi
     pop rsi
     RESTORE_GOT
-    ;RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -969,7 +967,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1238,7 +1236,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -39,7 +39,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -182,7 +182,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -289,7 +289,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -418,7 +418,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -606,7 +606,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -819,7 +819,6 @@
     pop rdi
     pop rsi
     RESTORE_GOT
-    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -886,7 +885,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1149,7 +1148,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -33,7 +33,7 @@
     %define     input       rcx
     %define     output      rdx
     %define     pitch       r8
-    SAVE_XMM
+    SAVE_XMM 7, u
   %else
     %define     input       rdi
     %define     output      rsi
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -208,7 +208,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM ; 6
+    SAVE_XMM 6
     push rsi
     push rdi
     ; end prolog
--- a/vp8/encoder/x86/fwalsh_sse2.asm
+++ b/vp8/encoder/x86/fwalsh_sse2.asm
@@ -17,7 +17,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -20,7 +20,7 @@
 sym(vp8_regular_quantize_b_sse2):
     push        rbp
     mov         rbp, rsp
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
 
 %if ABI_IS_32BIT
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -21,7 +21,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM ; 6
+    SAVE_XMM 6
     push        rsi
     push        rdi
     ; end prolog
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -33,6 +33,7 @@
     movsxd      rdx,        dword ptr arg(3)    ; ref_stride
 %else
   %ifidn __OUTPUT_FORMAT__,x64
+    SAVE_XMM 7, u
     %define     src_ptr     rcx
     %define     src_stride  rdx
     %define     ref_ptr     r8
@@ -39,9 +40,8 @@
     %define     ref_stride  r9
     %define     end_ptr     r10
     %define     ret_var     r11
-    %define     result_ptr  [rsp+40+4*8]
-    %define     max_err     [rsp+40+4*8]
-    SAVE_XMM
+    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
+    %define     max_err     [rsp+xmm_stack_space+8+4*8]
   %else
     %define     src_ptr     rdi
     %define     src_stride  rsi
@@ -108,6 +108,7 @@
     xchg        rbx,        rax
 %else
   %ifidn __OUTPUT_FORMAT__,x64
+    SAVE_XMM 7, u
     %define     src_ptr     rcx
     %define     src_stride  rdx
     %define     r0_ptr      rsi
@@ -115,8 +116,7 @@
     %define     r2_ptr      r11
     %define     r3_ptr      r8
     %define     ref_stride  r9
-    %define     result_ptr  [rsp+48+4*8]
-    SAVE_XMM
+    %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
     push        rsi
 
     LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -157,7 +157,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM
+    SAVE_XMM 7
     push        rsi
     push        rdi
     push        rcx
@@ -270,7 +270,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM
+    SAVE_XMM 7
     push        rsi
     push        rdi
     push        rcx
--- a/vp8/encoder/x86/ssim_opt.asm
+++ b/vp8/encoder/x86/ssim_opt.asm
@@ -66,7 +66,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM
+    SAVE_XMM 15
     push        rsi
     push        rdi
     ; end prolog
@@ -156,7 +156,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM
+    SAVE_XMM 15
     push        rsi
     push        rdi
     ; end prolog
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -77,7 +77,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -26,7 +26,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -85,7 +85,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     push rbx
     push rsi
     push rdi
@@ -225,7 +225,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -345,7 +345,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -534,7 +534,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -811,7 +811,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -933,7 +933,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -1049,7 +1049,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -1156,7 +1156,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -1264,7 +1264,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
@@ -1369,7 +1369,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -34,7 +34,7 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM
+    SAVE_XMM 7
     GET_GOT     rbx
     push rsi
     push rdi
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -255,21 +255,48 @@
   %define UNSHADOW_ARGS mov rsp, rbp
 %endif
 
-; must keep XMM6:XMM15 (libvpx uses XMM6 and XMM7) on Win64 ABI
-; rsp register has to be aligned
+; Win64 ABI requires that XMM6:XMM15 are callee saved
+; SAVE_XMM n, [u]
+; store registers 6-n on the stack
+; if u is specified, use unaligned movs.
+; Win64 ABI requires 16 byte stack alignment, but then pushes an 8 byte return
+; value. Typically we follow this up with 'push rbp' - re-aligning the stack -
+; but in some cases this is not done and unaligned movs must be used.
 %ifidn __OUTPUT_FORMAT__,x64
-%macro SAVE_XMM 0
-  sub rsp, 32
-  movdqu XMMWORD PTR [rsp], xmm6
-  movdqu XMMWORD PTR [rsp+16], xmm7
+%macro SAVE_XMM 1-2 a
+  %if %1 < 6
+    %error Only xmm registers 6-15 must be preserved
+  %else
+    %assign last_xmm %1
+    %define movxmm movdq %+ %2
+    %assign xmm_stack_space ((last_xmm - 5) * 16)
+    sub rsp, xmm_stack_space
+    %assign i 6
+    %rep (last_xmm - 5)
+      movxmm [rsp + ((i - 6) * 16)], xmm %+ i
+      %assign i i+1
+    %endrep
+  %endif
 %endmacro
 %macro RESTORE_XMM 0
-  movdqu xmm6, XMMWORD PTR [rsp]
-  movdqu xmm7, XMMWORD PTR [rsp+16]
-  add rsp, 32
+  %ifndef last_xmm
+    %error RESTORE_XMM must be paired with SAVE_XMM n
+  %else
+    %assign i last_xmm
+    %rep (last_xmm - 5)
+      movxmm xmm %+ i, [rsp +((i - 6) * 16)]
+      %assign i i-1
+    %endrep
+    add rsp, xmm_stack_space
+    ; there are a couple functions which return from multiple places.
+    ; otherwise, we could uncomment these:
+    ; %undef last_xmm
+    ; %undef xmm_stack_space
+    ; %undef movxmm
+  %endif
 %endmacro
 %else
-%macro SAVE_XMM 0
+%macro SAVE_XMM 1-2
 %endmacro
 %macro RESTORE_XMM 0
 %endmacro