ref: d96ba65a23f383c64ea2b244503636c96d1e8437
parent: 945dad277d3c09708956d60bf7844d47e0eeed1f
author: Yunqing Wang <yunqingwang@google.com>
date: Tue Feb 22 13:01:08 EST 2011
Add prefetch before variance calculation. This improved encoding performance by 0.5% (good, speed 1) to 1.5% (good, speed 5). Change-Id: I843d72a0d68a90b5f694adf770943e4a4618f50e
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -85,10 +85,9 @@
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
+ push rbx
push rsi
push rdi
- sub rsp, 16
; end prolog
mov rsi, arg(0) ;[src_ptr]
@@ -97,6 +96,29 @@
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ ; Prefetch data
+ lea rcx, [rax+rax*2]
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax*2]
+ prefetcht0 [rsi+rcx]
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax*2]
+ prefetcht0 [rbx+rcx]
+
+ lea rcx, [rdx+rdx*2]
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx*2]
+ prefetcht0 [rdi+rcx]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx*2]
+ prefetcht0 [rbx+rcx]
+
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
@@ -107,6 +129,9 @@
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
+ prefetcht0 [rsi+rax*8]
+ prefetcht0 [rdi+rdx*8]
+
movdqa xmm3, xmm1
movdqa xmm4, xmm2
@@ -178,10 +203,9 @@
; begin epilog
- add rsp, 16
pop rdi
pop rsi
- RESTORE_GOT
+ pop rbx
UNSHADOW_ARGS
pop rbp
ret
--
⑨