shithub: libvpx

Download patch

ref: 683b5a31617f0fc5a1f43b8f46343693774551ea
parent: a3df343cda2b6f3d554138ce5dae831e2f946d0c
author: James Zern <jzern@google.com>
date: Wed Sep 16 13:33:34 EDT 2015

vpx_subpixel_8t_ssse3: fix reg counts/access

fixes build on windows x64; previously 'heightq', i.e., the 64-bit register,
was accessed when only the 32-bit value was needed. because the value comes
from a stack variable, the upper bits were undefined.

also bump register/xmm counts: users of SETUP_LOCAL_VARS touch xmm13 in
64-bit builds, and filter_block1d16_v* uses one extra temp variable.

Change-Id: I9c768c0b2047481d1d3b11c2e16b2f8de6eb0d80

--- a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+++ b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -104,7 +104,7 @@
     %define       k0k1k4k5 m8
     %define       k2k3k6k7 m9
     %define            krd m10
-    %define    orig_height r7
+    %define    orig_height r7d
     mova               krd, [GLOBAL(pw_64)]
     pshuflw       k0k1k4k5, m4, 0b              ;k0_k1
     pshufhw       k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
@@ -131,8 +131,8 @@
     mova          k2k3k6k7, m7
     mova               krd, m1
 %endif
-    mov        orig_height, heightq
-    shr            heightq, 1
+    mov        orig_height, heightd
+    shr            heightd, 1
 .loop:
     ;Do two rows at once
     movh                m0, [srcq - 3]
@@ -200,12 +200,12 @@
     lea               dstq, [dstq + 2 * dstrideq    ]
     prefetcht0              [srcq + 2 * sstrideq - 3]
 
-    dec            heightq
+    dec            heightd
     jnz              .loop
 
     ; Do last row if output_height is odd
-    mov            heightq, orig_height
-    and            heightq, 1
+    mov            heightd, orig_height
+    and            heightd, 1
     je               .done
 
     movh                m0, [srcq - 3]    ; load src
@@ -254,17 +254,17 @@
 
 ;-------------------------------------------------------------------------------
 %macro SUBPIX_HFILTER8 1
-cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 13, LOCAL_VARS_SIZE, \
+cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 14, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
     mova                 m4, [filterq]
     SETUP_LOCAL_VARS
 %if ARCH_X86_64
-    %define     orig_height r7
+    %define     orig_height r7d
 %else
     %define     orig_height heightmp
 %endif
-    mov         orig_height, heightq
-    shr             heightq, 1
+    mov         orig_height, heightd
+    shr             heightd, 1
 
 .loop:
     movh                 m0, [srcq - 3]
@@ -336,12 +336,12 @@
     lea                srcq, [srcq + sstrideq        ]
     lea                dstq, [dstq + 2 * dstrideq    ]
     prefetcht0               [srcq + 2 * sstrideq - 3]
-    dec             heightq
+    dec             heightd
     jnz             .loop
 
     ;Do last row if output_height is odd
-    mov             heightq, orig_height
-    and             heightq, 1
+    mov             heightd, orig_height
+    and             heightd, 1
     je                .done
 
     movh                 m0, [srcq - 3]
@@ -361,7 +361,7 @@
 
 ;-------------------------------------------------------------------------------
 %macro SUBPIX_HFILTER16 1
-cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 13, LOCAL_VARS_SIZE, \
+cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
                              src, sstride, dst, dstride, height, filter
     mova          m4, [filterq]
     SETUP_LOCAL_VARS
@@ -427,7 +427,7 @@
     lea         srcq, [srcq + sstrideq]
     mova      [dstq], m0
     lea         dstq, [dstq + dstrideq]
-    dec      heightq
+    dec      heightd
     jnz        .loop
     RET
 %endm
@@ -527,11 +527,11 @@
 %endif
     movx     [dstq], m1
     add        dstq, dst_stride
-    sub     heightq, 2
-    cmp     heightq, 1
+    sub     heightd, 2
+    cmp     heightd, 1
     jg        .loop
 
-    cmp     heightq, 0
+    cmp     heightd, 0
     je        .done
 
     movx         m0, [srcq                ]     ;A
@@ -570,7 +570,7 @@
 
 ;-------------------------------------------------------------------------------
 %macro SUBPIX_VFILTER16 1
-cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*2), 13, LOCAL_VARS_SIZE, \
+cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
                              src, sstride, dst, dstride, height, filter
 
     mova          m4, [filterq]
@@ -655,7 +655,7 @@
 %endif
     movh  [dstq + 8], m3
     add         dstq, dst_stride
-    dec      heightq
+    dec      heightd
     jnz        .loop
     RET
 %endm