shithub: libvpx

ref: d2a2d5a6d5ea1886eef6078d180be364d80501bc
parent: 7cb25d9c562e094c47f37ad34a94d5cafcba2ece
parent: c5f890af2cff951048cc41630f2523b61fb74a0b
author: John Koleszar <jkoleszar@google.com>
date: Tue Aug 23 20:05:11 EDT 2011

Merge remote branch 'origin/master' into experimental

Change-Id: If53ec5c1219b31e5ef9ae552d9cc79432ebda267
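
Most of the assembly hunks below rename bare loop labels to NASM local labels
(a leading '.'). A local label is attached to the most recent non-local label,
so the same short name (.next_row, .done, .loop_col, ...) can be reused under
different routines without clashing. A minimal sketch of the convention, using
hypothetical routine names that are not taken from this patch:

copy_a:
        mov     rcx, 16
.next_row:                      ; resolves internally to copy_a.next_row
        dec     rcx
        jnz     .next_row
        ret

copy_b:
        mov     rcx, 8
.next_row:                      ; a distinct label, copy_b.next_row
        dec     rcx
        jnz     .next_row
        ret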

--- a/vp8/common/x86/loopfilter_mmx.asm
+++ b/vp8/common/x86/loopfilter_mmx.asm
@@ -40,7 +40,7 @@
         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
 
         movsxd      rcx, dword ptr arg(5) ;count
-next8_h:
+.next8_h:
         mov         rdx, arg(3) ;limit
         movq        mm7, [rdx]
         mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
@@ -211,7 +211,7 @@
         add         rsi,8
         neg         rax
         dec         rcx
-        jnz         next8_h
+        jnz         .next8_h
 
     add rsp, 32
     pop rsp
@@ -255,7 +255,7 @@
         lea         rsi,        [rsi + rax*4 - 4]
 
         movsxd      rcx,        dword ptr arg(5) ;count
-next8_v:
+.next8_v:
         mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
         add         rdi,        rax
 
@@ -581,7 +581,7 @@
 
         lea         rsi,        [rsi+rax*8]
         dec         rcx
-        jnz         next8_v
+        jnz         .next8_v
 
     add rsp, 64
     pop rsp
@@ -622,7 +622,7 @@
         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
 
         movsxd      rcx, dword ptr arg(5) ;count
-next8_mbh:
+.next8_mbh:
         mov         rdx, arg(3) ;limit
         movq        mm7, [rdx]
         mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
@@ -898,7 +898,7 @@
         neg         rax
         add         rsi,8
         dec         rcx
-        jnz         next8_mbh
+        jnz         .next8_mbh
 
     add rsp, 32
     pop rsp
@@ -942,7 +942,7 @@
         lea         rsi,        [rsi + rax*4 - 4]
 
         movsxd      rcx,        dword ptr arg(5) ;count
-next8_mbv:
+.next8_mbv:
         lea         rdi,        [rsi + rax]  ; rdi points to row +1 for indirect addressing
 
         ;transpose
@@ -1365,7 +1365,7 @@
         lea         rsi,        [rsi+rax*8]
         dec         rcx
 
-        jnz         next8_mbv
+        jnz         .next8_mbv
 
     add rsp, 96
     pop rsp
@@ -1398,7 +1398,7 @@
         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
 
         mov         rcx, 2                ; count
-nexts8_h:
+.nexts8_h:
         mov         rdx, arg(2) ;blimit           ; get blimit
         movq        mm3, [rdx]            ;
 
@@ -1483,7 +1483,7 @@
         add         rsi,8
         neg         rax
         dec         rcx
-        jnz         nexts8_h
+        jnz         .nexts8_h
 
     ; begin epilog
     pop rdi
@@ -1520,7 +1520,7 @@
 
         lea         rsi, [rsi + rax*4- 2];  ;
         mov         rcx, 2                                      ; count
-nexts8_v:
+.nexts8_v:
 
         lea         rdi,        [rsi + rax];
         movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
@@ -1695,7 +1695,7 @@
         lea         rsi,        [rsi+rax*8]                 ; next 8
 
         dec         rcx
-        jnz         nexts8_v
+        jnz         .nexts8_v
 
     add rsp, 32
     pop rsp
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -58,10 +58,10 @@
         movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
         pxor        mm0, mm0              ; mm0 = 00000000
 
-nextrow:
+.nextrow:
 
         xor         rdx,        rdx       ; clear out rdx for use as loop counter
-nextcol:
+.nextcol:
 
         pxor        mm7, mm7              ; mm7 = 00000000
         movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
@@ -146,7 +146,7 @@
         add         rdx, 4
 
         cmp         edx, dword ptr arg(5) ;cols
-        jl          nextcol
+        jl          .nextcol
         ; done with the all cols, start the across filtering in place
         sub         rsi, rdx
         sub         rdi, rdx
@@ -156,7 +156,7 @@
         xor         rdx,    rdx
         mov         rax,    [rdi-4];
 
-acrossnextcol:
+.acrossnextcol:
         pxor        mm7, mm7              ; mm7 = 00000000
         movq        mm6, [rbx + 32 ]      ;
         movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
@@ -237,7 +237,7 @@
 
         add         rdx, 4
         cmp         edx, dword ptr arg(5) ;cols
-        jl          acrossnextcol;
+        jl          .acrossnextcol;
 
         mov         DWORD PTR [rdi+rdx-4],  eax
         pop         rax
@@ -249,7 +249,7 @@
         movsxd      rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
 
         dec         rcx                   ; decrement count
-        jnz         nextrow               ; next row
+        jnz         .nextrow               ; next row
         pop         rbx
 
     ; begin epilog
@@ -293,7 +293,7 @@
     add         dword ptr arg(2), 8
 
     ;for(c=0; c<cols; c+=4)
-loop_col:
+.loop_col:
             mov         rsi,        arg(0)  ;s
             pxor        mm0,        mm0     ;
 
@@ -312,7 +312,7 @@
 
             mov         rcx,        15          ;
 
-loop_initvar:
+.loop_initvar:
             movd        mm1,        DWORD PTR [rdi];
             punpcklbw   mm1,        mm0     ;
 
@@ -329,10 +329,10 @@
             lea         rdi,        [rdi+rax]   ;
 
             dec         rcx
-            jne         loop_initvar
+            jne         .loop_initvar
             ;save the var and sum
             xor         rdx,        rdx
-loop_row:
+.loop_row:
             movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]
             movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]
 
@@ -438,13 +438,13 @@
             add         rdx,        1
 
             cmp         edx,        dword arg(2) ;rows
-            jl          loop_row
+            jl          .loop_row
 
 
         add         dword arg(0), 4 ; s += 4
         sub         dword arg(3), 4 ; cols -= 4
         cmp         dword arg(3), 0
-        jg          loop_col
+        jg          .loop_col
 
     add         rsp, 136
     pop         rsp
@@ -475,7 +475,7 @@
     push        rdi
     ; end prolog
 
-addnoise_loop:
+.addnoise_loop:
     call sym(rand) WRT_PLT
     mov     rcx, arg(1) ;noise
     and     rax, 0xff
@@ -492,7 +492,7 @@
             mov     rsi, arg(0) ;Pos
             xor         rax,rax
 
-addnoise_nextset:
+.addnoise_nextset:
             movq        mm1,[rsi+rax]         ; get the source
 
             psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
@@ -506,12 +506,12 @@
             add         rax,8                 ; move to the next line
 
             cmp         rax, rcx
-            jl          addnoise_nextset
+            jl          .addnoise_nextset
 
     movsxd  rax, dword arg(7) ; Pitch
     add     arg(0), rax ; Start += Pitch
     sub     dword arg(6), 1   ; Height -= 1
-    jg      addnoise_loop
+    jg      .addnoise_loop
 
     ; begin epilog
     pop rdi
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -57,10 +57,10 @@
         movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
         pxor        xmm0,       xmm0              ; mm0 = 00000000
 
-nextrow:
+.nextrow:
 
         xor         rdx,        rdx       ; clear out rdx for use as loop counter
-nextcol:
+.nextcol:
         movq        xmm3,       QWORD PTR [rsi]         ; mm4 = r0 p0..p7
         punpcklbw   xmm3,       xmm0                    ; mm3 = p0..p3
         movdqa      xmm1,       xmm3                    ; mm1 = p0..p3
@@ -133,7 +133,7 @@
         add         rdx,        8
         cmp         edx,        dword arg(5) ;cols
 
-        jl          nextcol
+        jl          .nextcol
 
         ; done with the all cols, start the across filtering in place
         sub         rsi,        rdx
@@ -142,7 +142,7 @@
         xor         rdx,        rdx
         movq        mm0,        QWORD PTR [rdi-8];
 
-acrossnextcol:
+.acrossnextcol:
         movq        xmm7,       QWORD PTR [rdi +rdx -2]
         movd        xmm4,       DWORD PTR [rdi +rdx +6]
 
@@ -219,7 +219,7 @@
 
         add         rdx,        8
         cmp         edx,        dword arg(5) ;cols
-        jl          acrossnextcol;
+        jl          .acrossnextcol;
 
         ; last 8 pixels
         movq        QWORD PTR [rdi+rdx-8],  mm0
@@ -231,7 +231,7 @@
         mov         eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
 
         dec         rcx                   ; decrement count
-        jnz         nextrow               ; next row
+        jnz         .nextrow              ; next row
 
 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
     add rsp,16
@@ -282,7 +282,7 @@
     add         dword arg(2), 8
 
     ;for(c=0; c<cols; c+=8)
-loop_col:
+.loop_col:
             mov         rsi,        arg(0) ; s
             pxor        xmm0,       xmm0        ;
 
@@ -301,7 +301,7 @@
 
             mov         rcx,        15          ;
 
-loop_initvar:
+.loop_initvar:
             movq        xmm1,       QWORD PTR [rdi];
             punpcklbw   xmm1,       xmm0        ;
 
@@ -318,10 +318,10 @@
             lea         rdi,        [rdi+rax]   ;
 
             dec         rcx
-            jne         loop_initvar
+            jne         .loop_initvar
             ;save the var and sum
             xor         rdx,        rdx
-loop_row:
+.loop_row:
             movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
             movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
 
@@ -428,12 +428,12 @@
             add         rdx,        1
 
             cmp         edx,        dword arg(2) ;rows
-            jl          loop_row
+            jl          .loop_row
 
         add         dword arg(0), 8 ; s += 8
         sub         dword arg(3), 8 ; cols -= 8
         cmp         dword arg(3), 0
-        jg          loop_col
+        jg          .loop_col
 
     add         rsp, 128+16
     pop         rsp
@@ -475,13 +475,13 @@
 
 
     ;for(r=0;r<rows;r++)
-ip_row_loop:
+.ip_row_loop:
 
         xor         rdx,    rdx ;sumsq=0;
         xor         rcx,    rcx ;sum=0;
         mov         rsi,    arg(0); s
         mov         rdi,    -8
-ip_var_loop:
+.ip_var_loop:
         ;for(i=-8;i<=6;i++)
         ;{
         ;    sumsq += s[i]*s[i];
@@ -493,7 +493,7 @@
         add         edx, eax
         add         rdi, 1
         cmp         rdi, 6
-        jle         ip_var_loop
+        jle         .ip_var_loop
 
 
             ;mov         rax,    sumsq
@@ -513,7 +513,7 @@
             pxor        mm1,    mm1
 
             pxor        xmm0,   xmm0
-nextcol4:
+.nextcol4:
 
             movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
             movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
@@ -600,7 +600,7 @@
             add         rcx,    4
 
             cmp         rcx,    rdx
-            jl          nextcol4
+            jl          .nextcol4
 
         ;s+=pitch;
         movsxd rax, dword arg(1)
@@ -608,7 +608,7 @@
 
         sub dword arg(2), 1 ;rows-=1
         cmp dword arg(2), 0
-        jg ip_row_loop
+        jg .ip_row_loop
 
     add         rsp, 16
     pop         rsp
@@ -640,7 +640,7 @@
     push        rdi
     ; end prolog
 
-addnoise_loop:
+.addnoise_loop:
     call sym(rand) WRT_PLT
     mov     rcx, arg(1) ;noise
     and     rax, 0xff
@@ -657,7 +657,7 @@
             mov     rsi, arg(0) ;Pos
             xor         rax,rax
 
-addnoise_nextset:
+.addnoise_nextset:
             movdqu      xmm1,[rsi+rax]         ; get the source
 
             psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
@@ -671,12 +671,12 @@
             add         rax,16                 ; move to the next line
 
             cmp         rax, rcx
-            jl          addnoise_nextset
+            jl          .addnoise_nextset
 
     movsxd  rax, dword arg(7) ; Pitch
     add     arg(0), rax ; Start += Pitch
     sub     dword arg(6), 1   ; Height -= 1
-    jg      addnoise_loop
+    jg      .addnoise_loop
 
     ; begin epilog
     pop rdi
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -503,7 +503,7 @@
     mov         rdi,        arg(0) ;dst;
     movsxd      rcx,        dword ptr arg(1) ;dst_stride
 
-vp8_intra_pred_uv_tm_%1_loop:
+.vp8_intra_pred_uv_tm_%1_loop:
     movd        xmm3,       [rsi]
     movd        xmm5,       [rsi+rax]
 %ifidn %1, sse2
@@ -525,7 +525,7 @@
     lea         rsi,        [rsi+rax*2]
     lea         rdi,        [rdi+rcx*2]
     dec         edx
-    jnz vp8_intra_pred_uv_tm_%1_loop
+    jnz .vp8_intra_pred_uv_tm_%1_loop
 
     ; begin epilog
     pop         rdi
@@ -615,7 +615,7 @@
 %endif
     dec         rsi
 %ifidn %1, mmx2
-vp8_intra_pred_uv_ho_%1_loop:
+.vp8_intra_pred_uv_ho_%1_loop:
     movd        mm0,        [rsi]
     movd        mm1,        [rsi+rax]
     punpcklbw   mm0,        mm0
@@ -627,7 +627,7 @@
     lea         rsi,        [rsi+rax*2]
     lea         rdi,        [rdi+rcx*2]
     dec         edx
-    jnz vp8_intra_pred_uv_ho_%1_loop
+    jnz .vp8_intra_pred_uv_ho_%1_loop
 %else
     movd        xmm0,       [rsi]
     movd        xmm3,       [rsi+rax]
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -50,7 +50,7 @@
         movsxd      rax,    dword ptr arg(5) ;output_width      ; destination pitch?
         pxor        mm0,    mm0              ; mm0 = 00000000
 
-nextrow:
+.nextrow:
         movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
         movq        mm4,    mm3              ; mm4 = p-2..p5
         psrlq       mm3,    8                ; mm3 = p-1..p5
@@ -102,7 +102,7 @@
 %endif
 
         dec         rcx                      ; decrement count
-        jnz         nextrow                  ; next row
+        jnz         .nextrow                 ; next row
 
     ; begin epilog
     pop rdi
@@ -152,7 +152,7 @@
         pxor        mm0, mm0              ; mm0 = 00000000
 
 
-nextrow_cv:
+.nextrow_cv:
         movq        mm3, [rsi+rdx]        ; mm3 = p0..p8  = row -1
         pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.
 
@@ -190,7 +190,7 @@
         ; avoidable!!!.
         lea         rdi,  [rdi+rax] ;
         dec         rcx                   ; decrement count
-        jnz         nextrow_cv             ; next row
+        jnz         .nextrow_cv           ; next row
 
         pop         rbx
 
@@ -282,7 +282,7 @@
         packuswb    mm7,        mm4                 ;
 
         add         rsi,        rdx                 ; next line
-next_row_8x8:
+.next_row_8x8:
         movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
         movq        mm4,        mm3                 ; make a copy of current line
 
@@ -349,7 +349,7 @@
         add         rdi,        r8                  ;dst_pitch
 %endif
         cmp         rdi,        rcx                 ;
-        jne         next_row_8x8
+        jne         .next_row_8x8
 
     ; begin epilog
     pop rdi
@@ -437,7 +437,7 @@
         packuswb    mm7,        mm4                 ;
 
         add         rsi,        rdx                 ; next line
-next_row_8x4:
+.next_row_8x4:
         movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
         movq        mm4,        mm3                 ; make a copy of current line
 
@@ -504,7 +504,7 @@
         add         rdi,        r8
 %endif
         cmp         rdi,        rcx                 ;
-        jne         next_row_8x4
+        jne         .next_row_8x4
 
     ; begin epilog
     pop rdi
@@ -579,7 +579,7 @@
         packuswb    mm7,        mm0                 ;
 
         add         rsi,        rdx                 ; next line
-next_row_4x4:
+.next_row_4x4:
         movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
         punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
 
@@ -622,7 +622,7 @@
 %endif
 
         cmp         rdi,        rcx                 ;
-        jne         next_row_4x4
+        jne         .next_row_4x4
 
     ; begin epilog
     pop rdi
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -55,7 +55,7 @@
 %endif
         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
 
-filter_block1d8_h6_rowloop:
+.filter_block1d8_h6_rowloop:
         movq        xmm3,       MMWORD PTR [rsi - 2]
         movq        xmm1,       MMWORD PTR [rsi + 6]
 
@@ -124,7 +124,7 @@
 %endif
         dec         rcx
 
-        jnz         filter_block1d8_h6_rowloop                ; next row
+        jnz         .filter_block1d8_h6_rowloop                ; next row
 
     ; begin epilog
     pop rdi
@@ -176,7 +176,7 @@
 
         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
 
-filter_block1d16_h6_sse2_rowloop:
+.filter_block1d16_h6_sse2_rowloop:
         movq        xmm3,       MMWORD PTR [rsi - 2]
         movq        xmm1,       MMWORD PTR [rsi + 6]
 
@@ -301,7 +301,7 @@
 %endif
 
         dec         rcx
-        jnz         filter_block1d16_h6_sse2_rowloop                ; next row
+        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
 
     ; begin epilog
     pop rdi
@@ -356,7 +356,7 @@
         movsxd      r8,         dword ptr arg(2) ; dst_ptich
 %endif
 
-vp8_filter_block1d8_v6_sse2_loop:
+.vp8_filter_block1d8_v6_sse2_loop:
         movdqa      xmm1,       XMMWORD PTR [rsi]
         pmullw      xmm1,       [rax]
 
@@ -396,7 +396,7 @@
         add         rdi,        r8
 %endif
         dec         rcx         ; decrement count
-        jnz         vp8_filter_block1d8_v6_sse2_loop               ; next row
+        jnz         .vp8_filter_block1d8_v6_sse2_loop               ; next row
 
     ; begin epilog
     pop rdi
@@ -448,7 +448,7 @@
         movsxd      r8,         dword ptr arg(2) ; dst_ptich
 %endif
 
-vp8_filter_block1d16_v6_sse2_loop:
+.vp8_filter_block1d16_v6_sse2_loop:
 ; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
         movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
         movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
@@ -511,7 +511,7 @@
         add         rdi,        r8
 %endif
         dec         rcx         ; decrement count
-        jnz         vp8_filter_block1d16_v6_sse2_loop               ; next row
+        jnz         .vp8_filter_block1d16_v6_sse2_loop              ; next row
 
     ; begin epilog
     pop rdi
@@ -556,7 +556,7 @@
 %endif
         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
 
-filter_block1d8_h6_only_rowloop:
+.filter_block1d8_h6_only_rowloop:
         movq        xmm3,       MMWORD PTR [rsi - 2]
         movq        xmm1,       MMWORD PTR [rsi + 6]
 
@@ -624,7 +624,7 @@
 %endif
         dec         rcx
 
-        jnz         filter_block1d8_h6_only_rowloop                ; next row
+        jnz         .filter_block1d8_h6_only_rowloop               ; next row
 
     ; begin epilog
     pop rdi
@@ -670,7 +670,7 @@
 
         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
 
-filter_block1d16_h6_only_sse2_rowloop:
+.filter_block1d16_h6_only_sse2_rowloop:
         movq        xmm3,       MMWORD PTR [rsi - 2]
         movq        xmm1,       MMWORD PTR [rsi + 6]
 
@@ -789,7 +789,7 @@
 %endif
 
         dec         rcx
-        jnz         filter_block1d16_h6_only_sse2_rowloop                ; next row
+        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
 
     ; begin epilog
     pop rdi
@@ -837,7 +837,7 @@
         movsxd      r8,         dword ptr arg(3) ; dst_ptich
 %endif
 
-vp8_filter_block1d8_v6_only_sse2_loop:
+.vp8_filter_block1d8_v6_only_sse2_loop:
         movq        xmm1,       MMWORD PTR [rsi]
         movq        xmm2,       MMWORD PTR [rsi + rdx]
         movq        xmm3,       MMWORD PTR [rsi + rdx * 2]
@@ -883,7 +883,7 @@
         add         rdi,        r8
 %endif
         dec         rcx         ; decrement count
-        jnz         vp8_filter_block1d8_v6_only_sse2_loop               ; next row
+        jnz         .vp8_filter_block1d8_v6_only_sse2_loop              ; next row
 
     ; begin epilog
     pop rdi
@@ -924,7 +924,7 @@
         movsxd      r8,         dword ptr arg(4) ;output_width            ; Pitch for Source
 %endif
 
-unpack_block1d16_h6_sse2_rowloop:
+.unpack_block1d16_h6_sse2_rowloop:
         movq        xmm1,       MMWORD PTR [rsi]            ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2
         movq        xmm3,       MMWORD PTR [rsi+8]          ; make copy of xmm1
 
@@ -941,7 +941,7 @@
         add         rdi,        r8
 %endif
         dec         rcx
-        jnz         unpack_block1d16_h6_sse2_rowloop                ; next row
+        jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row
 
     ; begin epilog
     pop rdi
@@ -980,7 +980,7 @@
         movsxd      rax,        dword ptr arg(2) ;xoffset
 
         cmp         rax,        0      ;skip first_pass filter if xoffset=0
-        je          b16x16_sp_only
+        je          .b16x16_sp_only
 
         shl         rax,        5
         add         rax,        rcx    ;HFilter
@@ -995,7 +995,7 @@
         movsxd      rax,        dword ptr arg(3) ;yoffset
 
         cmp         rax,        0      ;skip second_pass filter if yoffset=0
-        je          b16x16_fp_only
+        je          .b16x16_fp_only
 
         shl         rax,        5
         add         rax,        rcx    ;VFilter
@@ -1041,7 +1041,7 @@
         packuswb    xmm7,       xmm4
 
         add         rsi,        rdx                 ; next line
-next_row:
+.next_row:
         movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
         movdqa      xmm4,       xmm3                 ; make a copy of current line
 
@@ -1104,11 +1104,11 @@
 %endif
 
         cmp         rdi,        rcx
-        jne         next_row
+        jne         .next_row
 
-        jmp         done
+        jmp         .done
 
-b16x16_sp_only:
+.b16x16_sp_only:
         movsxd      rax,        dword ptr arg(3) ;yoffset
         shl         rax,        5
         add         rax,        rcx    ;VFilter
@@ -1130,7 +1130,7 @@
         movdqu      xmm7,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
 
         add         rsi,        rax                 ; next line
-next_row_spo:
+.next_row_spo:
         movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
 
         movdqa      xmm5,       xmm7
@@ -1164,17 +1164,17 @@
         add         rsi,        rax                 ; next line
         add         rdi,        rdx                 ;dst_pitch
         cmp         rdi,        rcx
-        jne         next_row_spo
+        jne         .next_row_spo
 
-        jmp         done
+        jmp         .done
 
-b16x16_fp_only:
+.b16x16_fp_only:
         lea         rcx,        [rdi+rdx*8]
         lea         rcx,        [rcx+rdx*8]
         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
         pxor        xmm0,       xmm0
 
-next_row_fpo:
+.next_row_fpo:
         movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
         movdqa      xmm4,       xmm3                 ; make a copy of current line
 
@@ -1208,9 +1208,9 @@
         add         rsi,        rax                 ; next line
         add         rdi,        rdx                 ; dst_pitch
         cmp         rdi,        rcx
-        jne         next_row_fpo
+        jne         .next_row_fpo
 
-done:
+.done:
     ; begin epilog
     pop rdi
     pop rsi
@@ -1318,7 +1318,7 @@
 
         movdqa      xmm7,       xmm3
         add         rsp,        16                 ; next line
-next_row8x8:
+.next_row8x8:
         movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
         movdqa      xmm4,       xmm3                 ; make a copy of current line
         psrldq      xmm4,       1
@@ -1352,7 +1352,7 @@
         add         rdi,        rdx
 
         cmp         rdi,        rcx
-        jne         next_row8x8
+        jne         .next_row8x8
 
     ;add rsp, 144
     pop rsp
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -70,7 +70,7 @@
 
     sub         rdi, rdx
 ;xmm3 free
-filter_block1d8_h6_rowloop_ssse3:
+.filter_block1d8_h6_rowloop_ssse3:
     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
 
     movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
@@ -102,7 +102,7 @@
     packuswb    xmm0,   xmm0
 
     movq        MMWORD Ptr [rdi], xmm0
-    jnz         filter_block1d8_h6_rowloop_ssse3
+    jnz         .filter_block1d8_h6_rowloop_ssse3
 
     ; begin epilog
     pop rdi
@@ -129,7 +129,7 @@
 
     sub         rdi, rdx
 
-filter_block1d8_h4_rowloop_ssse3:
+.filter_block1d8_h4_rowloop_ssse3:
     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
 
     movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
@@ -158,7 +158,7 @@
 
     movq        MMWORD Ptr [rdi], xmm0
 
-    jnz         filter_block1d8_h4_rowloop_ssse3
+    jnz         .filter_block1d8_h4_rowloop_ssse3
 
     ; begin epilog
     pop rdi
@@ -207,7 +207,7 @@
     movsxd      rcx, dword ptr arg(4)           ;output_height
     movsxd      rdx, dword ptr arg(3)           ;output_pitch
 
-filter_block1d16_h6_rowloop_ssse3:
+.filter_block1d16_h6_rowloop_ssse3:
     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
 
     movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
@@ -264,7 +264,7 @@
 
     lea         rdi,    [rdi + rdx]
     dec         rcx
-    jnz         filter_block1d16_h6_rowloop_ssse3
+    jnz         .filter_block1d16_h6_rowloop_ssse3
 
     ; begin epilog
     pop rdi
@@ -304,7 +304,7 @@
     movdqa      xmm7, [GLOBAL(rd)]
 
     cmp         esi, DWORD PTR [rax]
-    je          vp8_filter_block1d4_h4_ssse3
+    je          .vp8_filter_block1d4_h4_ssse3
 
     movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
@@ -318,7 +318,7 @@
     movsxd      rdx, dword ptr arg(3)   ;output_pitch
 
 ;xmm3 free
-filter_block1d4_h6_rowloop_ssse3:
+.filter_block1d4_h6_rowloop_ssse3:
     movdqu      xmm0,   XMMWORD PTR [rsi - 2]
 
     movdqa      xmm1, xmm0
@@ -346,7 +346,7 @@
 
     add         rdi, rdx
     dec         rcx
-    jnz         filter_block1d4_h6_rowloop_ssse3
+    jnz         .filter_block1d4_h6_rowloop_ssse3
 
     ; begin epilog
     pop rdi
@@ -356,7 +356,7 @@
     pop         rbp
     ret
 
-vp8_filter_block1d4_h4_ssse3:
+.vp8_filter_block1d4_h4_ssse3:
     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
     movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
@@ -369,7 +369,7 @@
 
     movsxd      rdx, dword ptr arg(3)   ;output_pitch
 
-filter_block1d4_h4_rowloop_ssse3:
+.filter_block1d4_h4_rowloop_ssse3:
     movdqu      xmm1,   XMMWORD PTR [rsi - 2]
 
     movdqa      xmm2, xmm1
@@ -391,7 +391,7 @@
 
     add         rdi, rdx
     dec         rcx
-    jnz         filter_block1d4_h4_rowloop_ssse3
+    jnz         .filter_block1d4_h4_rowloop_ssse3
 
     ; begin epilog
     pop rdi
@@ -432,7 +432,7 @@
     add         rax, rdx
 
     cmp         esi, DWORD PTR [rax]
-    je          vp8_filter_block1d16_v4_ssse3
+    je          .vp8_filter_block1d16_v4_ssse3
 
     movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
@@ -450,7 +450,7 @@
     add         rax, rdx
 
 
-vp8_filter_block1d16_v6_ssse3_loop:
+.vp8_filter_block1d16_v6_ssse3_loop:
     movq        xmm1, MMWORD PTR [rsi]                  ;A
     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
@@ -508,7 +508,7 @@
     add         rdi,        r8
 %endif
     dec         rcx
-    jnz         vp8_filter_block1d16_v6_ssse3_loop
+    jnz         .vp8_filter_block1d16_v6_ssse3_loop
 
     ; begin epilog
     pop rdi
@@ -519,7 +519,7 @@
     pop         rbp
     ret
 
-vp8_filter_block1d16_v4_ssse3:
+.vp8_filter_block1d16_v4_ssse3:
     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
 
@@ -534,7 +534,7 @@
     movsxd      rcx, DWORD PTR arg(4)   ;output_height
     add         rax, rdx
 
-vp8_filter_block1d16_v4_ssse3_loop:
+.vp8_filter_block1d16_v4_ssse3_loop:
     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
@@ -581,7 +581,7 @@
     add         rdi,        r8
 %endif
     dec         rcx
-    jnz         vp8_filter_block1d16_v4_ssse3_loop
+    jnz         .vp8_filter_block1d16_v4_ssse3_loop
 
     ; begin epilog
     pop rdi
@@ -627,7 +627,7 @@
     movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
 
     cmp         esi, DWORD PTR [rax]
-    je          vp8_filter_block1d8_v4_ssse3
+    je          .vp8_filter_block1d8_v4_ssse3
 
     movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
@@ -638,7 +638,7 @@
     mov         rax, rsi
     add         rax, rdx
 
-vp8_filter_block1d8_v6_ssse3_loop:
+.vp8_filter_block1d8_v6_ssse3_loop:
     movq        xmm1, MMWORD PTR [rsi]                  ;A
     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
@@ -673,7 +673,7 @@
     add         rdi,        r8
 %endif
     dec         rcx
-    jnz         vp8_filter_block1d8_v6_ssse3_loop
+    jnz         .vp8_filter_block1d8_v6_ssse3_loop
 
     ; begin epilog
     pop rdi
@@ -684,7 +684,7 @@
     pop         rbp
     ret
 
-vp8_filter_block1d8_v4_ssse3:
+.vp8_filter_block1d8_v4_ssse3:
     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
     movdqa      xmm5, [GLOBAL(rd)]
@@ -694,7 +694,7 @@
     mov         rax, rsi
     add         rax, rdx
 
-vp8_filter_block1d8_v4_ssse3_loop:
+.vp8_filter_block1d8_v4_ssse3_loop:
     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
@@ -722,7 +722,7 @@
     add         rdi,        r8
 %endif
     dec         rcx
-    jnz         vp8_filter_block1d8_v4_ssse3_loop
+    jnz         .vp8_filter_block1d8_v4_ssse3_loop
 
     ; begin epilog
     pop rdi
@@ -766,7 +766,7 @@
     movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
 
     cmp         esi, DWORD PTR [rax]
-    je          vp8_filter_block1d4_v4_ssse3
+    je          .vp8_filter_block1d4_v4_ssse3
 
     movq        mm5, MMWORD PTR [rax]         ;k0_k5
     movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
@@ -777,7 +777,7 @@
     mov         rax, rsi
     add         rax, rdx
 
-vp8_filter_block1d4_v6_ssse3_loop:
+.vp8_filter_block1d4_v6_ssse3_loop:
     movd        mm1, DWORD PTR [rsi]                  ;A
     movd        mm2, DWORD PTR [rsi + rdx]            ;B
     movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
@@ -813,7 +813,7 @@
     add         rdi,        r8
 %endif
     dec         rcx
-    jnz         vp8_filter_block1d4_v6_ssse3_loop
+    jnz         .vp8_filter_block1d4_v6_ssse3_loop
 
     ; begin epilog
     pop rdi
@@ -823,7 +823,7 @@
     pop         rbp
     ret
 
-vp8_filter_block1d4_v4_ssse3:
+.vp8_filter_block1d4_v4_ssse3:
     movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
     movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
     movq        mm5, MMWORD PTR [GLOBAL(rd)]
@@ -833,7 +833,7 @@
     mov         rax, rsi
     add         rax, rdx
 
-vp8_filter_block1d4_v4_ssse3_loop:
+.vp8_filter_block1d4_v4_ssse3_loop:
     movd        mm2, DWORD PTR [rsi + rdx]            ;B
     movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
     movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
@@ -861,7 +861,7 @@
     add         rdi,        r8
 %endif
     dec         rcx
-    jnz         vp8_filter_block1d4_v4_ssse3_loop
+    jnz         .vp8_filter_block1d4_v4_ssse3_loop
 
     ; begin epilog
     pop rdi
@@ -895,7 +895,7 @@
         movsxd      rax,        dword ptr arg(2)    ; xoffset
 
         cmp         rax,        0                   ; skip first_pass filter if xoffset=0
-        je          b16x16_sp_only
+        je          .b16x16_sp_only
 
         shl         rax,        4
         lea         rax,        [rax + rcx]         ; HFilter
@@ -909,7 +909,7 @@
         movsxd      rax,        dword ptr arg(3)    ; yoffset
 
         cmp         rax,        0                   ; skip second_pass filter if yoffset=0
-        je          b16x16_fp_only
+        je          .b16x16_fp_only
 
         shl         rax,        4
         lea         rax,        [rax + rcx]         ; VFilter
@@ -996,9 +996,9 @@
         cmp         rdi,        rcx
         jne         .next_row
 
-        jmp         done
+        jmp         .done
 
-b16x16_sp_only:
+.b16x16_sp_only:
         movsxd      rax,        dword ptr arg(3)    ; yoffset
         shl         rax,        4
         lea         rax,        [rax + rcx]         ; VFilter
@@ -1018,7 +1018,7 @@
         movq        xmm2,       [rsi + 8]           ; load row 0
 
         lea         rsi,        [rsi + rax]         ; next line
-.next_row:
+.next_row_sp:
         movq        xmm3,       [rsi]               ; load row + 1
         movq        xmm5,       [rsi + 8]           ; load row + 1
 
@@ -1062,16 +1062,16 @@
         lea         rdi,        [rdi + 2*rdx]
 
         cmp         rdi,        rcx
-        jne         .next_row
+        jne         .next_row_sp
 
-        jmp         done
+        jmp         .done
 
-b16x16_fp_only:
+.b16x16_fp_only:
         lea         rcx,        [rdi+rdx*8]
         lea         rcx,        [rcx+rdx*8]
         movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
 
-.next_row:
+.next_row_fp:
         movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
         movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
 
@@ -1122,9 +1122,9 @@
 
         cmp         rdi,        rcx
 
-        jne         .next_row
+        jne         .next_row_fp
 
-done:
+.done:
     ; begin epilog
     pop         rdi
     pop         rsi
@@ -1191,7 +1191,7 @@
 
         movsxd      rax,        dword ptr arg(2)    ; xoffset
         cmp         rax,        0                   ; skip first_pass filter if xoffset=0
-        je          b8x8_sp_only
+        je          .b8x8_sp_only
 
         shl         rax,        4
         add         rax,        rcx                 ; HFilter
@@ -1203,7 +1203,7 @@
 
         movsxd      rax,        dword ptr arg(3)    ; yoffset
         cmp         rax,        0                   ; skip second_pass filter if yoffset=0
-        je          b8x8_fp_only
+        je          .b8x8_fp_only
 
         shl         rax,        4
         lea         rax,        [rax + rcx]         ; VFilter
@@ -1260,9 +1260,9 @@
         cmp         rdi,        rcx
         jne         .next_row
 
-        jmp         done8x8
+        jmp         .done8x8
 
-b8x8_sp_only:
+.b8x8_sp_only:
         movsxd      rax,        dword ptr arg(3)    ; yoffset
         shl         rax,        4
         lea         rax,        [rax + rcx]         ; VFilter
@@ -1364,12 +1364,12 @@
         movq        [rdi+rdx],  xmm1
         lea         rsp,        [rsp + 144]
 
-        jmp         done8x8
+        jmp         .done8x8
 
-b8x8_fp_only:
+.b8x8_fp_only:
         lea         rcx,        [rdi+rdx*8]
 
-.next_row:
+.next_row_fp:
         movdqa      xmm1,       XMMWORD PTR [rsp]
         movdqa      xmm3,       XMMWORD PTR [rsp+16]
 
@@ -1430,11 +1430,11 @@
         lea         rdi,        [rdi + 2*rdx]
         cmp         rdi,        rcx
 
-        jne         .next_row
+        jne         .next_row_fp
 
         lea         rsp,        [rsp + 16]
 
-done8x8:
+.done8x8:
     ;add rsp, 144
     pop         rsp
     ; begin epilog
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -94,15 +94,14 @@
 #if !(CONFIG_REALTIME_ONLY)
     cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_c;
 #endif
+#if CONFIG_INTERNAL_STATS
+    cpi->rtcd.variance.ssimpf_8x8            = vp8_ssim_parms_8x8_c;
+    cpi->rtcd.variance.ssimpf_16x16          = vp8_ssim_parms_16x16_c;
 #endif
+#endif
 
     // Pure C:
     vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
-
-#if CONFIG_INTERNAL_STATS
-    cpi->rtcd.variance.ssimpf_8x8            = ssim_parms_8x8_c;
-    cpi->rtcd.variance.ssimpf                = ssim_parms_c;
-#endif
 
 #if ARCH_X86 || ARCH_X86_64
     vp8_arch_x86_encoder_init(cpi);
--- a/vp8/encoder/ssim.c
+++ b/vp8/encoder/ssim.c
@@ -9,18 +9,9 @@
  */
 
 
-#include "vpx_scale/yv12config.h"
-#include "math.h"
 #include "onyx_int.h"
 
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x)  (x)
-#else
-#define IF_RTCD(x)  NULL
-#endif
-
-
-void ssim_parms_c
+void vp8_ssim_parms_16x16_c
 (
     unsigned char *s,
     int sp,
@@ -46,7 +37,7 @@
          }
      }
 }
-void ssim_parms_8x8_c
+void vp8_ssim_parms_8x8_c
 (
     unsigned char *s,
     int sp,
@@ -107,7 +98,7 @@
             const vp8_variance_rtcd_vtable_t *rtcd)
 {
     unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
-    rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+    SSIMPF_INVOKE(rtcd,16x16)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
     return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
 }
 static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp,
@@ -114,7 +105,7 @@
                 const vp8_variance_rtcd_vtable_t *rtcd)
 {
     unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
-    rtcd->ssimpf_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+    SSIMPF_INVOKE(rtcd,8x8)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
     return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
 }
 
@@ -134,7 +125,7 @@
     c1 = cc1*16;
     c2 = cc2*16;
 
-    rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+    SSIMPF_INVOKE(rtcd,16x16)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
     ssim_n1 = (2*sum_s*sum_r+ c1);
 
     ssim_n2 =((int64_t) 2*256*sum_sxr-(int64_t) 2*sum_s*sum_r+c2);
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -320,16 +320,16 @@
 #endif
 extern prototype_get16x16prederror(vp8_variance_get4x4sse_cs);
 
-#ifndef vp8_ssimpf
-#define vp8_ssimpf ssim_parms_c
-#endif
-extern prototype_ssimpf(vp8_ssimpf)
-
 #ifndef vp8_ssimpf_8x8
-#define vp8_ssimpf_8x8 ssim_parms_8x8_c
+#define vp8_ssimpf_8x8 vp8_ssim_parms_8x8_c
 #endif
 extern prototype_ssimpf(vp8_ssimpf_8x8)
 
+#ifndef vp8_ssimpf_16x16
+#define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_c
+#endif
+extern prototype_ssimpf(vp8_ssimpf_16x16)
+
 typedef prototype_sad(*vp8_sad_fn_t);
 typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
 typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t);
@@ -394,7 +394,7 @@
 
 #if CONFIG_INTERNAL_STATS
     vp8_ssimpf_fn_t          ssimpf_8x8;
-    vp8_ssimpf_fn_t          ssimpf;
+    vp8_ssimpf_fn_t          ssimpf_16x16;
 #endif
 
 } vp8_variance_rtcd_vtable_t;
@@ -417,8 +417,10 @@
 
 #if CONFIG_RUNTIME_CPU_DETECT
 #define VARIANCE_INVOKE(ctx,fn) (ctx)->fn
+#define SSIMPF_INVOKE(ctx,fn) (ctx)->ssimpf_##fn
 #else
 #define VARIANCE_INVOKE(ctx,fn) vp8_variance_##fn
+#define SSIMPF_INVOKE(ctx,fn) vp8_ssimpf_##fn
 #endif
 
 #endif
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -148,7 +148,7 @@
         pcmpeqw     mm1,        mm7
         mov         rcx,        16
 
-mberror_loop_mmx:
+.mberror_loop_mmx:
         movq        mm3,       [rsi]
         movq        mm4,       [rdi]
 
@@ -186,7 +186,7 @@
         add         rdi,        32
         sub         rcx,        1
 
-        jnz         mberror_loop_mmx
+        jnz         .mberror_loop_mmx
 
         movq        mm0,        mm2
         psrlq       mm2,        32
@@ -226,7 +226,7 @@
         pcmpeqw     xmm5,       xmm6
         mov         rcx,        16
 
-mberror_loop:
+.mberror_loop:
         movdqa      xmm0,       [rsi]
         movdqa      xmm1,       [rdi]
 
@@ -249,7 +249,7 @@
         paddd       xmm4,       xmm2
 
         paddd       xmm4,       xmm0
-        jnz         mberror_loop
+        jnz         .mberror_loop
 
         movdqa      xmm0,       xmm4
         punpckldq   xmm0,       xmm6
@@ -289,7 +289,7 @@
         mov             rcx,        16
         pxor            mm7,        mm7
 
-mbuverror_loop_mmx:
+.mbuverror_loop_mmx:
 
         movq            mm1,        [rsi]
         movq            mm2,        [rdi]
@@ -313,7 +313,7 @@
         add             rdi,        16
 
         dec             rcx
-        jnz             mbuverror_loop_mmx
+        jnz             .mbuverror_loop_mmx
 
         movq            mm0,        mm7
         psrlq           mm7,        32
@@ -346,7 +346,7 @@
         mov             rcx,        16
         pxor            xmm3,       xmm3
 
-mbuverror_loop:
+.mbuverror_loop:
 
         movdqa          xmm1,       [rsi]
         movdqa          xmm2,       [rdi]
@@ -360,7 +360,7 @@
         add             rdi,        16
 
         dec             rcx
-        jnz             mbuverror_loop
+        jnz             .mbuverror_loop
 
         pxor        xmm0,           xmm0
         movdqa      xmm1,           xmm3
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -137,7 +137,7 @@
     ; if (x >= zbin)
     sub         cx, WORD PTR[rdx]           ; x - zbin
     lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          rq_zigzag_loop_%1           ; x < zbin
+    jl          .rq_zigzag_loop_%1           ; x < zbin
 
     movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
 
@@ -144,10 +144,10 @@
     ; downshift by quant_shift[rc]
     movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
     sar         edi, cl                     ; also sets Z bit
-    je          rq_zigzag_loop_%1           ; !y
+    je          .rq_zigzag_loop_%1           ; !y
     mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
     mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-rq_zigzag_loop_%1:
+.rq_zigzag_loop_%1:
 %endmacro
 ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
 ZIGZAG_LOOP  0
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ b/vp8/encoder/x86/quantize_sse4.asm
@@ -140,7 +140,7 @@
     ; if (x >= zbin)
     sub         cx, WORD PTR[rdx]           ; x - zbin
     lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          rq_zigzag_loop_%1           ; x < zbin
+    jl          .rq_zigzag_loop_%1          ; x < zbin
 
     pextrw      edi, %3, %2                 ; y
 
@@ -147,7 +147,7 @@
     ; downshift by quant_shift[rc]
     pextrb      ecx, xmm5, %1               ; quant_shift[rc]
     sar         edi, cl                     ; also sets Z bit
-    je          rq_zigzag_loop_%1           ; !y
+    je          .rq_zigzag_loop_%1          ; !y
 %if ABI_IS_32BIT
     mov         WORD PTR[rsp + qcoeff + %1 *2], di
 %else
@@ -154,7 +154,7 @@
     pinsrw      %5, edi, %2                 ; qcoeff[rc]
 %endif
     mov         rdx, rax                    ; reset to b->zrun_zbin_boost
-rq_zigzag_loop_%1:
+.rq_zigzag_loop_%1:
 %endmacro
 ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
 ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
--- a/vp8/encoder/x86/sad_mmx.asm
+++ b/vp8/encoder/x86/sad_mmx.asm
@@ -43,7 +43,7 @@
 
         pxor            mm6,        mm6
 
-x16x16sad_mmx_loop:
+.x16x16sad_mmx_loop:
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm2,        QWORD PTR [rsi+8]
@@ -83,7 +83,7 @@
         paddw           mm7,        mm1
 
         cmp             rsi,        rcx
-        jne             x16x16sad_mmx_loop
+        jne             .x16x16sad_mmx_loop
 
 
         movq            mm0,        mm7
@@ -135,7 +135,7 @@
 
         pxor            mm6,        mm6
 
-x8x16sad_mmx_loop:
+.x8x16sad_mmx_loop:
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm1,        QWORD PTR [rdi]
@@ -158,7 +158,7 @@
         paddw           mm7,        mm2
         cmp             rsi,        rcx
 
-        jne             x8x16sad_mmx_loop
+        jne             .x8x16sad_mmx_loop
 
         movq            mm0,        mm7
         punpcklwd       mm0,        mm6
@@ -205,7 +205,7 @@
 
         pxor            mm6,        mm6
 
-x8x8sad_mmx_loop:
+.x8x8sad_mmx_loop:
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm1,        QWORD PTR [rdi]
@@ -228,7 +228,7 @@
         paddw           mm7,       mm0
         cmp             rsi,        rcx
 
-        jne             x8x8sad_mmx_loop
+        jne             .x8x8sad_mmx_loop
 
         movq            mm0,        mm7
         punpcklwd       mm0,        mm6
@@ -364,7 +364,7 @@
 
         pxor            mm6,        mm6
 
-x16x8sad_mmx_loop:
+.x16x8sad_mmx_loop:
 
         movq            mm0,       [rsi]
         movq            mm1,       [rdi]
@@ -404,7 +404,7 @@
         paddw           mm7,        mm0
 
         cmp             rsi,        rcx
-        jne             x16x8sad_mmx_loop
+        jne             .x16x8sad_mmx_loop
 
         movq            mm0,        mm7
         punpcklwd       mm0,        mm6
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -37,7 +37,7 @@
         lea             rcx,        [rcx+rax*8]
         pxor            xmm6,       xmm6
 
-x16x16sad_wmt_loop:
+.x16x16sad_wmt_loop:
 
         movq            xmm0,       QWORD PTR [rsi]
         movq            xmm2,       QWORD PTR [rsi+8]
@@ -68,7 +68,7 @@
         paddw           xmm6,       xmm4
 
         cmp             rsi,        rcx
-        jne             x16x16sad_wmt_loop
+        jne             .x16x16sad_wmt_loop
 
         movq            xmm0,       xmm6
         psrldq          xmm6,       8
@@ -111,11 +111,11 @@
         lea             rcx,        [rcx+rbx*8]
         pxor            mm7,        mm7
 
-x8x16sad_wmt_loop:
+.x8x16sad_wmt_loop:
 
         movq            rax,        mm7
         cmp             eax,        arg(4)
-        jg              x8x16sad_wmt_early_exit
+        jg              .x8x16sad_wmt_early_exit
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm1,        QWORD PTR [rdi]
@@ -133,11 +133,11 @@
         paddw           mm7,        mm2
 
         cmp             rsi,        rcx
-        jne             x8x16sad_wmt_loop
+        jne             .x8x16sad_wmt_loop
 
         movq            rax,        mm7
 
-x8x16sad_wmt_early_exit:
+.x8x16sad_wmt_early_exit:
 
     ; begin epilog
     pop         rdi
@@ -172,11 +172,11 @@
         lea             rcx,        [rsi+rbx*8]
         pxor            mm7,        mm7
 
-x8x8sad_wmt_loop:
+.x8x8sad_wmt_loop:
 
         movq            rax,        mm7
         cmp             eax,        arg(4)
-        jg              x8x8sad_wmt_early_exit
+        jg              .x8x8sad_wmt_early_exit
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm1,        QWORD PTR [rdi]
@@ -188,10 +188,10 @@
         paddw           mm7,        mm0
 
         cmp             rsi,        rcx
-        jne             x8x8sad_wmt_loop
+        jne             .x8x8sad_wmt_loop
 
         movq            rax,        mm7
-x8x8sad_wmt_early_exit:
+.x8x8sad_wmt_early_exit:
 
     ; begin epilog
     pop         rdi
@@ -281,11 +281,11 @@
         lea             rcx,        [rsi+rbx*8]
         pxor            mm7,        mm7
 
-x16x8sad_wmt_loop:
+.x16x8sad_wmt_loop:
 
         movq            rax,        mm7
         cmp             eax,        arg(4)
-        jg              x16x8sad_wmt_early_exit
+        jg              .x16x8sad_wmt_early_exit
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm2,        QWORD PTR [rsi+8]
@@ -315,11 +315,11 @@
         paddw           mm7,        mm4
 
         cmp             rsi,        rcx
-        jne             x16x8sad_wmt_loop
+        jne             .x16x8sad_wmt_loop
 
         movq            rax,        mm7
 
-x16x8sad_wmt_early_exit:
+.x16x8sad_wmt_early_exit:
 
     ; begin epilog
     pop         rdi
@@ -352,7 +352,7 @@
         movsxd          rdx,        dword ptr arg(3) ;dst_stride
         movsxd          rcx,        dword ptr arg(4) ;height
 
-block_copy_sse2_loopx4:
+.block_copy_sse2_loopx4:
         movdqu          xmm0,       XMMWORD PTR [rsi]
         movdqu          xmm1,       XMMWORD PTR [rsi + 16]
         movdqu          xmm2,       XMMWORD PTR [rsi + rax]
@@ -383,12 +383,12 @@
 
         sub             rcx,     4
         cmp             rcx,     4
-        jge             block_copy_sse2_loopx4
+        jge             .block_copy_sse2_loopx4
 
         cmp             rcx, 0
-        je              copy_is_done
+        je              .copy_is_done
 
-block_copy_sse2_loop:
+.block_copy_sse2_loop:
         movdqu          xmm0,       XMMWORD PTR [rsi]
         movdqu          xmm1,       XMMWORD PTR [rsi + 16]
         lea             rsi,    [rsi+rax]
@@ -398,9 +398,9 @@
         lea             rdi,    [rdi+rdx]
 
         sub             rcx,     1
-        jne             block_copy_sse2_loop
+        jne             .block_copy_sse2_loop
 
-copy_is_done:
+.copy_is_done:
     ; begin epilog
     pop rdi
     pop rsi
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -647,7 +647,7 @@
 
     STACK_FRAME_CREATE_X3
 
-block_copy_sse3_loopx4:
+.block_copy_sse3_loopx4:
         lea             end_ptr,    [src_ptr+src_stride*2]
 
         movdqu          xmm0,       XMMWORD PTR [src_ptr]
@@ -676,13 +676,13 @@
 
         sub             height,     4
         cmp             height,     4
-        jge             block_copy_sse3_loopx4
+        jge             .block_copy_sse3_loopx4
 
         ;Check to see if there is more rows need to be copied.
         cmp             height, 0
-        je              copy_is_done
+        je              .copy_is_done
 
-block_copy_sse3_loop:
+.block_copy_sse3_loop:
         movdqu          xmm0,       XMMWORD PTR [src_ptr]
         movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
         lea             src_ptr,    [src_ptr+src_stride]
@@ -692,9 +692,9 @@
         lea             ref_ptr,    [ref_ptr+ref_stride]
 
         sub             height,     1
-        jne             block_copy_sse3_loop
+        jne             .block_copy_sse3_loop
 
-copy_is_done:
+.copy_is_done:
     STACK_FRAME_DESTROY_X3
 
 ;void vp8_sad16x16x4d_sse3(
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -169,30 +169,30 @@
         mov             rdx,        0xf
         and             rdx,        rdi
 
-        jmp vp8_sad16x16x3_ssse3_skiptable
-vp8_sad16x16x3_ssse3_jumptable:
-        dd vp8_sad16x16x3_ssse3_aligned_by_0  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_1  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_2  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_3  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_4  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_5  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_6  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_7  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_8  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_9  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
-vp8_sad16x16x3_ssse3_skiptable:
+        jmp .vp8_sad16x16x3_ssse3_skiptable
+.vp8_sad16x16x3_ssse3_jumptable:
+        dd .vp8_sad16x16x3_ssse3_aligned_by_0  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_1  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_2  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_3  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_4  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_5  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_6  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_7  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_8  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_9  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_skiptable:
 
-        call vp8_sad16x16x3_ssse3_do_jump
-vp8_sad16x16x3_ssse3_do_jump:
+        call .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_do_jump:
         pop             rcx                         ; get the address of do_jump
-        mov             rax,  vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
+        mov             rax,  .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
         add             rax,  rcx  ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
 
         movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
@@ -203,23 +203,23 @@
 
         jmp             rcx
 
-        PROCESS_16X16X3_OFFSET 0,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 1,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 2,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 3,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 4,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 5,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 6,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 7,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 8,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 9,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 0,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 1,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 2,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 3,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 4,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 5,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 6,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 7,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 8,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 9,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3
 
-vp8_sad16x16x3_ssse3_aligned_by_15:
+.vp8_sad16x16x3_ssse3_aligned_by_15:
         PROCESS_16X2X3 1
         PROCESS_16X2X3 0
         PROCESS_16X2X3 0
@@ -229,7 +229,7 @@
         PROCESS_16X2X3 0
         PROCESS_16X2X3 0
 
-vp8_sad16x16x3_ssse3_store_off:
+.vp8_sad16x16x3_ssse3_store_off:
         mov             rdi,        arg(4) ;Results
 
         movq            xmm0,       xmm5
@@ -282,30 +282,30 @@
         mov             rdx,        0xf
         and             rdx,        rdi
 
-        jmp vp8_sad16x8x3_ssse3_skiptable
-vp8_sad16x8x3_ssse3_jumptable:
-        dd vp8_sad16x8x3_ssse3_aligned_by_0  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_1  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_2  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_3  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_4  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_5  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_6  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_7  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_8  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_9  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
-vp8_sad16x8x3_ssse3_skiptable:
+        jmp .vp8_sad16x8x3_ssse3_skiptable
+.vp8_sad16x8x3_ssse3_jumptable:
+        dd .vp8_sad16x8x3_ssse3_aligned_by_0  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_1  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_2  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_3  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_4  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_5  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_6  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_7  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_8  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_9  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_skiptable:
 
-        call vp8_sad16x8x3_ssse3_do_jump
-vp8_sad16x8x3_ssse3_do_jump:
+        call .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_do_jump:
         pop             rcx                         ; get the address of do_jump
-        mov             rax,  vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
+        mov             rax,  .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
         add             rax,  rcx  ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
 
         movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
@@ -316,23 +316,23 @@
 
         jmp             rcx
 
-        PROCESS_16X8X3_OFFSET 0,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 1,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 2,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 3,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 4,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 5,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 6,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 7,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 8,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 9,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 0,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 1,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 2,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 3,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 4,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 5,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 6,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 7,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 8,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 9,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3
 
-vp8_sad16x8x3_ssse3_aligned_by_15:
+.vp8_sad16x8x3_ssse3_aligned_by_15:
 
         PROCESS_16X2X3 1
         PROCESS_16X2X3 0
@@ -339,7 +339,7 @@
         PROCESS_16X2X3 0
         PROCESS_16X2X3 0
 
-vp8_sad16x8x3_ssse3_store_off:
+.vp8_sad16x8x3_ssse3_store_off:
         mov             rdi,        arg(4) ;Results
 
         movq            xmm0,       xmm5
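
The sad16x16x3 and sad16x8x3 hunks above keep the existing position-independent jump table while converting every label to a NASM local (dot-prefixed) name. The table holds 32-bit offsets relative to the do_jump label; a call/pop pair recovers do_jump's address at run time, and adding the selected offset to it gives the absolute branch target. A stripped-down sketch of the idiom with hypothetical label names (rdx holds the 0-15 alignment index, as in the code above):

sad_jump_sketch:                            ; parent label so the locals below have a scope
        jmp     .skiptable
.jumptable:
        dd      .case0 - .do_jump           ; offsets are label differences,
        dd      .case1 - .do_jump           ; so the table needs no relocations
.skiptable:
        call    .do_jump                    ; pushes the address of .do_jump
.do_jump:
        pop     rcx                         ; rcx = runtime address of .do_jump
        mov     rax, .jumptable - .do_jump  ; assemble-time distance back to the table
        add     rax, rcx                    ; rax = runtime address of .jumptable
        movsxd  rax, dword [rax + 4*rdx]    ; fetch the offset for this alignment
        add     rcx, rax                    ; rcx = absolute address of the handler
        jmp     rcx
.case0:                                     ; handler for alignment 0
        ret
.case1:                                     ; handler for alignment 1
        ret

Switching to dot-prefixed names changes nothing in the emitted instructions; it only scopes each label to the preceding non-local label, the same scoping the rest of this patch applies to loop labels.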
--- a/vp8/encoder/x86/ssim_opt.asm
+++ b/vp8/encoder/x86/ssim_opt.asm
@@ -44,7 +44,7 @@
         paddd           %1, xmm1
         SUM_ACROSS_Q    %1
 %endmacro
-;void ssim_parms_sse3(
+;void ssim_parms_sse2(
 ;    unsigned char *s,
 ;    int sp,
 ;    unsigned char *r,
@@ -61,8 +61,8 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(vp8_ssim_parms_16x16_sse3)
-sym(vp8_ssim_parms_16x16_sse3):
+global sym(vp8_ssim_parms_16x16_sse2)
+sym(vp8_ssim_parms_16x16_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
@@ -84,7 +84,7 @@
     pxor            xmm11,xmm11  ;sum_sxr
 
     mov             rdx, 16      ;row counter
-NextRow:
+.NextRow:
 
     ;grab source and reference pixels
     movdqu          xmm5, [rsi]
@@ -107,7 +107,7 @@
     add             rdi, rax   ; next r row
 
     dec             rdx        ; counter
-    jnz NextRow
+    jnz .NextRow
 
     SUM_ACROSS_W    xmm15
     SUM_ACROSS_W    xmm14
@@ -134,7 +134,7 @@
     pop         rbp
     ret
 
-;void ssim_parms_sse3(
+;void ssim_parms_sse2(
 ;    unsigned char *s,
 ;    int sp,
 ;    unsigned char *r,
@@ -151,8 +151,8 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(vp8_ssim_parms_8x8_sse3)
-sym(vp8_ssim_parms_8x8_sse3):
+global sym(vp8_ssim_parms_8x8_sse2)
+sym(vp8_ssim_parms_8x8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
@@ -174,7 +174,7 @@
     pxor            xmm11,xmm11  ;sum_sxr
 
     mov             rdx, 8      ;row counter
-NextRow2:
+.NextRow:
 
     ;grab source and reference pixels
     movq            xmm3, [rsi]
@@ -188,7 +188,7 @@
     add             rdi, rax   ; next r row
 
     dec             rdx        ; counter
-    jnz NextRow2
+    jnz .NextRow
 
     SUM_ACROSS_W    xmm15
     SUM_ACROSS_W    xmm14
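
In ssim_opt.asm the two helpers are renamed from _sse3 to _sse2 (matching the instruction set they actually use) and their row loops move to the local label .NextRow. A dot-prefixed label is scoped to the nearest preceding non-local label, so both functions can now share the short name where the old code needed NextRow and NextRow2. A minimal sketch using the repository's sym() name-mangling macro and hypothetical function names:

global sym(func_a)
sym(func_a):
        mov     rdx, 16          ; row counter, as in the 16x16 loop above
.NextRow:                        ; assembles as func_a.NextRow
        dec     rdx
        jnz     .NextRow
        ret

global sym(func_b)
sym(func_b):
        mov     rdx, 8           ; row counter, as in the 8x8 loop above
.NextRow:                        ; a distinct symbol, func_b.NextRow, no clash
        dec     rdx
        jnz     .NextRow
        ret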
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -93,7 +93,7 @@
             mov         rcx,            16
             pxor        mm0,            mm0
 
-submby_loop:
+.submby_loop:
 
             movq        mm1,            [rsi]
             movq        mm3,            [rax]
@@ -139,7 +139,7 @@
             lea         rsi,            [rsi+rdx]
 
             sub         rcx,            1
-            jnz         submby_loop
+            jnz         .submby_loop
 
     pop rdi
     pop rsi
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -91,7 +91,7 @@
 
             mov         rcx,            8      ; do two lines at one time
 
-submby_loop:
+.submby_loop:
             movdqa      xmm0,           XMMWORD PTR [rsi]   ; src
             movdqa      xmm1,           XMMWORD PTR [rax]   ; pred
 
@@ -133,7 +133,7 @@
             lea         rsi,            [rsi+rdx*2]
 
             sub         rcx,            1
-            jnz         submby_loop
+            jnz         .submby_loop
 
     pop rdi
     pop rsi
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -71,10 +71,10 @@
 
         lea         rcx,            [rdx + 16*16*1]
         cmp         dword ptr [rsp + block_size], 8
-        jne         temporal_filter_apply_load_16
+        jne         .temporal_filter_apply_load_16
         lea         rcx,            [rdx + 8*8*1]
 
-temporal_filter_apply_load_8:
+.temporal_filter_apply_load_8:
         movq        xmm0,           [rsi]  ; first row
         lea         rsi,            [rsi + rbp] ; += stride
         punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
@@ -81,9 +81,9 @@
         movq        xmm1,           [rsi]  ; second row
         lea         rsi,            [rsi + rbp] ; += stride
         punpcklbw   xmm1,           xmm7   ; src[ 8-15]
-        jmp         temporal_filter_apply_load_finished
+        jmp         .temporal_filter_apply_load_finished
 
-temporal_filter_apply_load_16:
+.temporal_filter_apply_load_16:
         movdqa      xmm0,           [rsi]  ; src (frame1)
         lea         rsi,            [rsi + rbp] ; += stride
         movdqa      xmm1,           xmm0
@@ -90,7 +90,7 @@
         punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
         punpckhbw   xmm1,           xmm7   ; src[ 8-15]
 
-temporal_filter_apply_load_finished:
+.temporal_filter_apply_load_finished:
         movdqa      xmm2,           [rdx]  ; predictor (frame2)
         movdqa      xmm3,           xmm2
         punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
@@ -176,13 +176,13 @@
         lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
 
         cmp         rdx,            rcx
-        je          temporal_filter_apply_epilog
+        je          .temporal_filter_apply_epilog
         pxor        xmm7,           xmm7   ; zero for extraction
         cmp         dword ptr [rsp + block_size], 16
-        je          temporal_filter_apply_load_16
-        jmp         temporal_filter_apply_load_8
+        je          .temporal_filter_apply_load_16
+        jmp         .temporal_filter_apply_load_8
 
-temporal_filter_apply_epilog:
+.temporal_filter_apply_epilog:
     ; begin epilog
     mov         rbp,            [rsp + rbp_backup]
     add         rsp,            stack_size
--- a/vp8/encoder/x86/variance_impl_mmx.asm
+++ b/vp8/encoder/x86/variance_impl_mmx.asm
@@ -27,7 +27,7 @@
         mov         rcx, 16
         pxor        mm4, mm4
 
-NEXTROW:
+.NEXTROW:
         movq        mm0, [rax]
         movq        mm1, [rax+8]
         movq        mm2, [rax+16]
@@ -44,7 +44,7 @@
 
         add         rax, 32
         dec         rcx
-        ja          NEXTROW
+        ja          .NEXTROW
         movq        QWORD PTR [rsp], mm4
 
         ;return sum[0]+sum[1];
@@ -568,7 +568,7 @@
         add             rsi, r8
 %endif
 
-filter_block2d_bil4x4_var_mmx_loop:
+.filter_block2d_bil4x4_var_mmx_loop:
 
         movd            mm1,            [rsi]               ;
         movd            mm3,            [rsi+1]             ;
@@ -614,7 +614,7 @@
         add             rdi,            r9
 %endif
         sub             rcx,            1                   ;
-        jnz             filter_block2d_bil4x4_var_mmx_loop       ;
+        jnz             .filter_block2d_bil4x4_var_mmx_loop       ;
 
 
         pxor            mm3,            mm3                 ;
@@ -726,7 +726,7 @@
         add             rsi,            r8
 %endif
 
-filter_block2d_bil_var_mmx_loop:
+.filter_block2d_bil_var_mmx_loop:
 
         movq            mm1,            [rsi]               ;
         movq            mm3,            [rsi+1]             ;
@@ -807,7 +807,7 @@
         add             rdi,            r9
 %endif
         sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_var_mmx_loop       ;
+        jnz             .filter_block2d_bil_var_mmx_loop       ;
 
 
         pxor            mm3,            mm3                 ;
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -33,7 +33,7 @@
         mov         rcx, 8
         pxor        xmm4, xmm4
 
-NEXTROW:
+.NEXTROW:
         movdqa      xmm0, [rax]
         movdqa      xmm1, [rax+16]
         movdqa      xmm2, [rax+32]
@@ -50,7 +50,7 @@
 
         add         rax, 0x40
         dec         rcx
-        ja          NEXTROW
+        ja          .NEXTROW
 
         movdqa      xmm3,xmm4
         psrldq      xmm4,8
@@ -126,7 +126,7 @@
         pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
         mov         rcx,            16
 
-var16loop:
+.var16loop:
         movdqu      xmm1,           XMMWORD PTR [rsi]
         movdqu      xmm2,           XMMWORD PTR [rdi]
 
@@ -160,7 +160,7 @@
         add         rdi,            rdx
 
         sub         rcx,            1
-        jnz         var16loop
+        jnz         .var16loop
 
 
         movdqa      xmm1,           xmm6
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -47,7 +47,7 @@
         movsxd          rax,            dword ptr arg(5)     ; xoffset
 
         cmp             rax,            0                    ; skip first_pass filter if xoffset=0
-        je              filter_block2d_bil_var_ssse3_sp_only
+        je              .filter_block2d_bil_var_ssse3_sp_only
 
         shl             rax,            4                    ; point to filter coeff with xoffset
         lea             rax,            [rax + rcx]          ; HFilter
@@ -55,7 +55,7 @@
         movsxd          rdx,            dword ptr arg(6)     ; yoffset
 
         cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
-        je              filter_block2d_bil_var_ssse3_fp_only
+        je              .filter_block2d_bil_var_ssse3_fp_only
 
         shl             rdx,            4
         lea             rdx,            [rdx + rcx]          ; VFilter
@@ -88,7 +88,7 @@
         lea             rsi,            [rsi + r8]
 %endif
 
-filter_block2d_bil_var_ssse3_loop:
+.filter_block2d_bil_var_ssse3_loop:
         movdqu          xmm1,           XMMWORD PTR [rsi]
         movdqu          xmm2,           XMMWORD PTR [rsi+1]
         movdqa          xmm3,           xmm1
@@ -142,15 +142,15 @@
 %endif
 
         sub             rcx,            1
-        jnz             filter_block2d_bil_var_ssse3_loop
+        jnz             .filter_block2d_bil_var_ssse3_loop
 
-        jmp             filter_block2d_bil_variance
+        jmp             .filter_block2d_bil_variance
 
-filter_block2d_bil_var_ssse3_sp_only:
+.filter_block2d_bil_var_ssse3_sp_only:
         movsxd          rdx,            dword ptr arg(6)     ; yoffset
 
         cmp             rdx,            0                    ; Both xoffset =0 and yoffset=0
-        je              filter_block2d_bil_var_ssse3_full_pixel
+        je              .filter_block2d_bil_var_ssse3_full_pixel
 
         shl             rdx,            4
         lea             rdx,            [rdx + rcx]          ; VFilter
@@ -169,7 +169,7 @@
 
         lea             rsi,            [rsi + rax]
 
-filter_block2d_bil_sp_only_loop:
+.filter_block2d_bil_sp_only_loop:
         movdqu          xmm3,           XMMWORD PTR [rsi]
         movdqa          xmm2,           xmm1
         movdqa          xmm0,           xmm3
@@ -209,11 +209,11 @@
 %endif
 
         sub             rcx,            1
-        jnz             filter_block2d_bil_sp_only_loop
+        jnz             .filter_block2d_bil_sp_only_loop
 
-        jmp             filter_block2d_bil_variance
+        jmp             .filter_block2d_bil_variance
 
-filter_block2d_bil_var_ssse3_full_pixel:
+.filter_block2d_bil_var_ssse3_full_pixel:
         mov             rsi,            arg(0)               ;ref_ptr
         mov             rdi,            arg(2)               ;src_ptr
         movsxd          rcx,            dword ptr arg(4)     ;Height
@@ -221,7 +221,7 @@
         movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
         pxor            xmm0,           xmm0
 
-filter_block2d_bil_full_pixel_loop:
+.filter_block2d_bil_full_pixel_loop:
         movq            xmm1,           QWORD PTR [rsi]
         punpcklbw       xmm1,           xmm0
         movq            xmm2,           QWORD PTR [rsi+8]
@@ -244,11 +244,11 @@
         lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
         lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
         sub             rcx,            1
-        jnz             filter_block2d_bil_full_pixel_loop
+        jnz             .filter_block2d_bil_full_pixel_loop
 
-        jmp             filter_block2d_bil_variance
+        jmp             .filter_block2d_bil_variance
 
-filter_block2d_bil_var_ssse3_fp_only:
+.filter_block2d_bil_var_ssse3_fp_only:
         mov             rsi,            arg(0)               ;ref_ptr
         mov             rdi,            arg(2)               ;src_ptr
         movsxd          rcx,            dword ptr arg(4)     ;Height
@@ -260,7 +260,7 @@
         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
 %endif
 
-filter_block2d_bil_fp_only_loop:
+.filter_block2d_bil_fp_only_loop:
         movdqu          xmm1,           XMMWORD PTR [rsi]
         movdqu          xmm2,           XMMWORD PTR [rsi+1]
         movdqa          xmm3,           xmm1
@@ -298,11 +298,11 @@
 %endif
 
         sub             rcx,            1
-        jnz             filter_block2d_bil_fp_only_loop
+        jnz             .filter_block2d_bil_fp_only_loop
 
-        jmp             filter_block2d_bil_variance
+        jmp             .filter_block2d_bil_variance
 
-filter_block2d_bil_variance:
+.filter_block2d_bil_variance:
         pxor        xmm0,           xmm0
         pxor        xmm1,           xmm1
         pxor        xmm5,           xmm5
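
variance_impl_ssse3.asm gets the same treatment: the sub-pixel variance routine dispatches between four now-locally-labelled paths (two-pass filter, second pass only, first pass only, full pixel), all of which converge on .filter_block2d_bil_variance. A bare skeleton of that dispatch with hypothetical label names (rax = xoffset, rdx = yoffset, as above):

bilinear_dispatch_sketch:
        cmp     rax, 0
        je      .sp_only                 ; xoffset == 0: skip the horizontal pass
        cmp     rdx, 0
        je      .fp_only                 ; yoffset == 0: horizontal pass only
        ; both offsets non-zero: the two-pass filter loop runs here
        jmp     .variance
.sp_only:
        cmp     rdx, 0
        je      .full_pixel              ; both offsets zero: no filtering at all
        ; vertical-only filter loop runs here
        jmp     .variance
.fp_only:
        ; horizontal-only filter loop runs here
        jmp     .variance
.full_pixel:
        ; unfiltered loop runs here, then falls through
.variance:
        ; sum/sse accumulation shared by every path
        ret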
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -140,6 +140,8 @@
 extern prototype_variance(vp8_mse16x16_wmt);
 extern prototype_variance2(vp8_get8x8var_sse2);
 extern prototype_variance2(vp8_get16x16var_sse2);
+extern prototype_ssimpf(vp8_ssim_parms_8x8_sse2)
+extern prototype_ssimpf(vp8_ssim_parms_16x16_sse2)
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_variance_sad4x4
@@ -207,6 +209,14 @@
 
 #undef  vp8_variance_mse16x16
 #define vp8_variance_mse16x16 vp8_mse16x16_wmt
+
+#if ARCH_X86_64
+#undef  vp8_ssimpf_8x8
+#define vp8_ssimpf_8x8 vp8_ssim_parms_8x8_sse2
+
+#undef  vp8_ssimpf_16x16
+#define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_sse2
+#endif
 
 #endif
 #endif
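
variance_x86.h binds the ssim RTCD hooks to the new SSE2 helpers only under ARCH_X86_64: the assembly above keeps its accumulators in xmm11-xmm15, registers that require a REX prefix and therefore exist only in 64-bit code, so 32-bit builds fall back to the C implementations. Illustrated as a hypothetical fragment, assuming the ABI_IS_32BIT flag from the repository's x86_abi_support.asm:

%if ABI_IS_32BIT=0
        ; only 64-bit code can name the upper half of the XMM register file
        pxor    xmm15, xmm15
        pxor    xmm14, xmm14
        pxor    xmm11, xmm11     ; sum_sxr, as in the ssim loops above
%endif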
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -111,29 +111,6 @@
 
 #endif
 
-#if HAVE_SSSE3
-#if CONFIG_INTERNAL_STATS
-#if ARCH_X86_64
-typedef void ssimpf
-(
-    unsigned char *s,
-    int sp,
-    unsigned char *r,
-    int rp,
-    unsigned long *sum_s,
-    unsigned long *sum_r,
-    unsigned long *sum_sq_s,
-    unsigned long *sum_sq_r,
-    unsigned long *sum_sxr
-);
-
-extern ssimpf vp8_ssim_parms_16x16_sse3;
-extern ssimpf vp8_ssim_parms_8x8_sse3;
-#endif
-#endif
-#endif
-
-
 void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
 {
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -246,6 +223,13 @@
 #if !(CONFIG_REALTIME_ONLY)
         cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_sse2;
 #endif
+
+#if CONFIG_INTERNAL_STATS
+#if ARCH_X86_64
+        cpi->rtcd.variance.ssimpf_8x8            = vp8_ssim_parms_8x8_sse2;
+        cpi->rtcd.variance.ssimpf_16x16          = vp8_ssim_parms_16x16_sse2;
+#endif
+#endif
     }
 #endif
 
@@ -280,14 +264,6 @@
         cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_ssse3;
 
         cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_ssse3;
-
-#if CONFIG_INTERNAL_STATS
-#if ARCH_X86_64
-        cpi->rtcd.variance.ssimpf_8x8            = vp8_ssim_parms_8x8_sse3;
-        cpi->rtcd.variance.ssimpf                = vp8_ssim_parms_16x16_sse3;
-#endif
-#endif
-
     }
 #endif