shithub: libvpx

ref: 15de7bd3344910c70a93c36bc4d3b71f65d79aa5
parent: c54d16501460dd15764a2debd19fd8002b003124
parent: afd2f68daef62d1185ba1e65971fdd3a7fc1d8eb
author: James Zern <jzern@google.com>
date: Thu Aug 6 15:28:49 EDT 2015

Merge "Revert "VP9_COPY_CONVOLVE_SSE2 optimization""
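
For orientation, not part of the patch itself: the reverted change (the "-" lines
below) had software-pipelined the SSE2 copy/avg loops, processing two rows per
iteration with prefetching and up to eight xmm registers; the revert restores the
simpler loops that use four registers and step one row (w=64), two rows (w=32) or
four rows (w<=16) at a time. The following is a minimal C sketch of what either
version computes, with illustrative helper names only -- a straight block copy,
plus an "avg" variant that rounds-and-averages with the destination as pavgb
does. The real C reference in libvpx also takes the filter arguments fx/fxs/fy/fys
visible in the cglobal line, which the copy path ignores.

#include <stddef.h>
#include <stdint.h>

/* Straight block copy: the "copy" variant of the assembly below. */
static void copy_block(const uint8_t *src, ptrdiff_t src_stride,
                       uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x)
      dst[x] = src[x];
    src += src_stride;
    dst += dst_stride;
  }
}

/* "avg" variant: rounding average with the destination, matching pavgb. */
static void avg_block(const uint8_t *src, ptrdiff_t src_stride,
                      uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x)
      dst[x] = (uint8_t)((src[x] + dst[x] + 1) >> 1);
    src += src_stride;
    dst += dst_stride;
  }
}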

--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -16,289 +16,140 @@
 
 %macro convolve_fn 1
 INIT_XMM sse2
-cglobal convolve_%1, 4, 7, 8, src, src_stride, dst, dst_stride, \
+cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                               fx, fxs, fy, fys, w, h
-  mov                          r4d, dword wm
-  cmp                          r4d, 4
+  mov r4d, dword wm
+  cmp r4d, 4
   je .w4
-  cmp                          r4d, 8
+  cmp r4d, 8
   je .w8
-  cmp                          r4d, 16
+  cmp r4d, 16
   je .w16
-  cmp                          r4d, 32
+  cmp r4d, 32
   je .w32
 
-  ; 64xh
-  mov                          r4d, dword hm
-  shr                          r4d, 1     ; ASSUMPTION: hm is at least EVEN
-  sub                          r4d, 1
-
-  movu                          m0, [srcq]
-  movu                          m4, [srcq+src_strideq]
-  movu                          m1, [srcq+16]
-  movu                          m5, [srcq+src_strideq+16]
-  movu                          m2, [srcq+32]
-  movu                          m6, [srcq+src_strideq+32]
-  movu                          m3, [srcq+48]
-  movu                          m7, [srcq+src_strideq+48]
-
+  mov                    r4d, dword hm
 .loop64:
-  prefetcht0 [srcq+64            ]
-  prefetcht0 [srcq+src_strideq+64]
-
-  lea                         srcq, [srcq+src_strideq*2]
-
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+32]
+  movu                    m3, [srcq+48]
+  add                   srcq, src_strideq
 %ifidn %1, avg
-  pavgb                         m0, [dstq]
-  pavgb                         m1, [dstq+16]
-
-  mova                   [dstq   ], m0
-  movu                          m0, [srcq]
-
-  mova                   [dstq+16], m1
-  movu                          m1, [srcq+16]
-
-  pavgb                         m2, [dstq+32]
-  mova                   [dstq+32], m2
-  movu                          m2, [srcq+32]
-  pavgb                         m3, [dstq+48]
-  mova                   [dstq+48], m3
-  movu                          m3, [srcq+48]
-  pavgb                         m4, [dstq+dst_strideq]
-
-  mova          [dstq+dst_strideq], m4
-  movu                          m4, [srcq+src_strideq]
-
-  pavgb                         m5, [dstq+dst_strideq+16]
-  mova       [dstq+dst_strideq+16], m5
-  movu                          m5, [srcq+src_strideq+16]
-  pavgb                         m6, [dstq+dst_strideq+32]
-  mova       [dstq+dst_strideq+32], m6
-  movu                          m6, [srcq+src_strideq+32]
-  pavgb                         m7, [dstq+dst_strideq+48]
-  mova       [dstq+dst_strideq+48], m7
-  movu                          m7, [srcq+src_strideq+48]
-
-  lea                         dstq, [dstq+dst_strideq*2]
-%else
-  mova                   [dstq   ], m0
-  movu                          m0, [srcq]
-
-  mova                   [dstq+16], m1
-  movu                          m1, [srcq+16]
-  mova                   [dstq+32], m2
-  movu                          m2, [srcq+32]
-  mova                   [dstq+48], m3
-  movu                          m3, [srcq+48]
-
-  mova          [dstq+dst_strideq], m4
-  movu                          m4, [srcq+src_strideq]
-
-  mova       [dstq+dst_strideq+16], m5
-  movu                          m5, [srcq+src_strideq+16]
-  mova       [dstq+dst_strideq+32], m6
-  movu                          m6, [srcq+src_strideq+32]
-  mova       [dstq+dst_strideq+48], m7
-  movu                          m7, [srcq+src_strideq+48]
-
-  lea                         dstq, [dstq+dst_strideq*2]
+  pavgb                   m0, [dstq]
+  pavgb                   m1, [dstq+16]
+  pavgb                   m2, [dstq+32]
+  pavgb                   m3, [dstq+48]
 %endif
-  dec                          r4d
+  mova             [dstq   ], m0
+  mova             [dstq+16], m1
+  mova             [dstq+32], m2
+  mova             [dstq+48], m3
+  add                   dstq, dst_strideq
+  dec                    r4d
   jnz .loop64
-
-%ifidn %1, avg
-  pavgb                         m0, [dstq]
-  pavgb                         m1, [dstq+16]
-  pavgb                         m2, [dstq+32]
-  pavgb                         m3, [dstq+48]
-  pavgb                         m4, [dstq+dst_strideq]
-  pavgb                         m5, [dstq+dst_strideq+16]
-  pavgb                         m6, [dstq+dst_strideq+32]
-  pavgb                         m7, [dstq+dst_strideq+48]
-%endif
-  mova                   [dstq   ], m0
-  mova                   [dstq+16], m1
-  mova                   [dstq+32], m2
-  mova                   [dstq+48], m3
-
-  mova       [dstq+dst_strideq   ], m4
-  mova       [dstq+dst_strideq+16], m5
-  mova       [dstq+dst_strideq+32], m6
-  mova       [dstq+dst_strideq+48], m7
-
   RET
 
 .w32:
-  mov                          r4d, dword hm
-  sub                          r4d, 2
-
-  movu                          m0, [srcq]
-  movu                          m1, [srcq+16]
-  movu                          m2, [srcq+src_strideq]
-  movu                          m3, [srcq+src_strideq+16]
-
+  mov                    r4d, dword hm
 .loop32:
-  prefetcht0 [srcq+64]
-  prefetcht0 [srcq+src_strideq+64]
-
-  lea                         srcq, [srcq+src_strideq*2]
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+src_strideq]
+  movu                    m3, [srcq+src_strideq+16]
+  lea                   srcq, [srcq+src_strideq*2]
 %ifidn %1, avg
-  pavgb                         m0, [dstq]
-  pavgb                         m1, [dstq+16]
-  pavgb                         m2, [dstq+dst_strideq]
-  pavgb                         m3, [dstq+dst_strideq+16]
+  pavgb                   m0, [dstq]
+  pavgb                   m1, [dstq            +16]
+  pavgb                   m2, [dstq+dst_strideq]
+  pavgb                   m3, [dstq+dst_strideq+16]
 %endif
-  mova                      [dstq], m0
-  movu                          m0, [srcq]
-
-  mova                   [dstq+16], m1
-  movu                          m1, [srcq+16]
-
-  mova          [dstq+dst_strideq], m2
-  movu                          m2, [srcq+src_strideq]
-
-  mova       [dstq+dst_strideq+16], m3
-  movu                          m3, [srcq+src_strideq+16]
-
-  lea                         dstq, [dstq+dst_strideq*2]
-
-  sub                          r4d, 2
+  mova [dstq               ], m0
+  mova [dstq            +16], m1
+  mova [dstq+dst_strideq   ], m2
+  mova [dstq+dst_strideq+16], m3
+  lea                   dstq, [dstq+dst_strideq*2]
+  sub                    r4d, 2
   jnz .loop32
-
-%ifidn %1, avg
-  pavgb                         m0, [dstq]
-  pavgb                         m1, [dstq+16]
-  pavgb                         m2, [dstq+dst_strideq]
-  pavgb                         m3, [dstq+dst_strideq+16]
-%endif
-  mova                   [dstq   ], m0
-  mova                   [dstq+16], m1
-
-  mova       [dstq+dst_strideq   ], m2
-  mova       [dstq+dst_strideq+16], m3
-
   RET
 
 .w16:
-  mov                          r4d, dword hm
-  sub                          r4d, 4
-
-  movu                          m0, [srcq]
-  movu                          m1, [srcq+src_strideq]
-
+  mov                    r4d, dword hm
+  lea                    r5q, [src_strideq*3]
+  lea                    r6q, [dst_strideq*3]
 .loop16:
-  lea                         srcq, [srcq+src_strideq]
-  prefetcht0  [srcq+src_strideq*4]
-  lea                         srcq, [srcq+src_strideq]
-  prefetcht0  [srcq+src_strideq*2]
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+src_strideq]
+  movu                    m2, [srcq+src_strideq*2]
+  movu                    m3, [srcq+r5q]
+  lea                   srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  pavgb                         m0, [dstq]
-  pavgb                         m1, [dstq+dst_strideq]
+  pavgb                   m0, [dstq]
+  pavgb                   m1, [dstq+dst_strideq]
+  pavgb                   m2, [dstq+dst_strideq*2]
+  pavgb                   m3, [dstq+r6q]
 %endif
-  mova          [dstq            ], m0
-  mova          [dstq+dst_strideq], m1
-
-  lea                         dstq, [dstq+dst_strideq*2]
-
-  movu                          m0, [srcq]
-  movu                          m1, [srcq+src_strideq]
-
-  sub                          r4d, 2
+  mova  [dstq              ], m0
+  mova  [dstq+dst_strideq  ], m1
+  mova  [dstq+dst_strideq*2], m2
+  mova  [dstq+r6q          ], m3
+  lea                   dstq, [dstq+dst_strideq*4]
+  sub                    r4d, 4
   jnz .loop16
-
-  lea                         srcq, [srcq+src_strideq*2]
-%ifidn %1, avg
-  pavgb                         m0, [dstq]
-  pavgb                         m1, [dstq+dst_strideq]
-%endif
-  mova          [dstq            ], m0
-  mova          [dstq+dst_strideq], m1
-
-  lea                         dstq, [dstq+dst_strideq*2]
-
-  movu                          m0, [srcq]
-  movu                          m1, [srcq+src_strideq]
-
-%ifidn %1, avg
-  pavgb                         m0, [dstq]
-  pavgb                         m1, [dstq+dst_strideq]
-%endif
-
-  mova          [dstq            ], m0
-  mova          [dstq+dst_strideq], m1
-
   RET
 
 INIT_MMX sse
 .w8:
-  mov                          r4d, dword hm
-  sub                          r4d, 2
-
-  movu                          m0, [srcq]
-  movu                          m1, [srcq+src_strideq]
-
+  mov                    r4d, dword hm
+  lea                    r5q, [src_strideq*3]
+  lea                    r6q, [dst_strideq*3]
 .loop8:
-  lea                         srcq, [srcq+src_strideq]
-  prefetcht0  [srcq+src_strideq*4]
-  lea                         srcq, [srcq+src_strideq]
-  prefetcht0  [srcq+src_strideq*2]
-
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+src_strideq]
+  movu                    m2, [srcq+src_strideq*2]
+  movu                    m3, [srcq+r5q]
+  lea                   srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  pavgb                         m0, [dstq]
-  pavgb                         m1, [dstq+dst_strideq]
+  pavgb                   m0, [dstq]
+  pavgb                   m1, [dstq+dst_strideq]
+  pavgb                   m2, [dstq+dst_strideq*2]
+  pavgb                   m3, [dstq+r6q]
 %endif
-  mova          [dstq            ], m0
-  mova          [dstq+dst_strideq], m1
-
-  movu                          m0, [srcq]
-  movu                          m1, [srcq+src_strideq]
-
-  lea                         dstq, [dstq+dst_strideq*2]
-
-  sub                          r4d, 2
+  mova  [dstq              ], m0
+  mova  [dstq+dst_strideq  ], m1
+  mova  [dstq+dst_strideq*2], m2
+  mova  [dstq+r6q          ], m3
+  lea                   dstq, [dstq+dst_strideq*4]
+  sub                    r4d, 4
   jnz .loop8
-
-%ifidn %1, avg
-  pavgb                         m0, [dstq]
-  pavgb                         m1, [dstq+dst_strideq]
-%endif
-  mova          [dstq            ], m0
-  mova          [dstq+dst_strideq], m1
-
   RET
 
 .w4:
-  mov                          r4d, dword hm
-
-  lea                          r5q, [src_strideq*3]
-  lea                          r6q, [dst_strideq*3]
-
+  mov                    r4d, dword hm
+  lea                    r5q, [src_strideq*3]
+  lea                    r6q, [dst_strideq*3]
 .loop4:
-  movh                          m0, [srcq]
-  movh                          m1, [srcq+src_strideq]
-  movh                          m2, [srcq+src_strideq*2]
-  movh                          m3, [srcq+r5q]
-
-  lea                         srcq, [srcq+src_strideq*4]
+  movh                    m0, [srcq]
+  movh                    m1, [srcq+src_strideq]
+  movh                    m2, [srcq+src_strideq*2]
+  movh                    m3, [srcq+r5q]
+  lea                   srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  movh                          m4, [dstq]
-  movh                          m5, [dstq+dst_strideq]
-  movh                          m6, [dstq+dst_strideq*2]
-  movh                          m7, [dstq+r6q]
-
-  pavgb                         m0, m4
-  pavgb                         m1, m5
-  pavgb                         m2, m6
-  pavgb                         m3, m7
+  movh                    m4, [dstq]
+  movh                    m5, [dstq+dst_strideq]
+  movh                    m6, [dstq+dst_strideq*2]
+  movh                    m7, [dstq+r6q]
+  pavgb                   m0, m4
+  pavgb                   m1, m5
+  pavgb                   m2, m6
+  pavgb                   m3, m7
 %endif
-  movh        [dstq              ], m0
-  movh        [dstq+dst_strideq  ], m1
-  movh        [dstq+dst_strideq*2], m2
-  movh        [dstq+r6q          ], m3
-
-  lea                         dstq, [dstq+dst_strideq*4]
-
-  sub                          r4d, 4
+  movh  [dstq              ], m0
+  movh  [dstq+dst_strideq  ], m1
+  movh  [dstq+dst_strideq*2], m2
+  movh  [dstq+r6q          ], m3
+  lea                   dstq, [dstq+dst_strideq*4]
+  sub                    r4d, 4
   jnz .loop4
   RET
 %endmacro