shithub: libvpx

Download patch

ref: 211694f67e16c83ed61d69160ed60b7be0ab4882
parent: 8f910594bdc6e93269b93c930cd02a381d601a3f
parent: 01433c50436a669b7e10faf94382dbe03a8827bf
author: Johann <johannkoenig@google.com>
date: Wed Jul 13 00:10:03 EDT 2011

Merge "update x86 asm for loopfilter"

--- a/vp8/common/x86/loopfilter_mmx.asm
+++ b/vp8/common/x86/loopfilter_mmx.asm
@@ -16,7 +16,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int src_pixel_step,
-;    const char *flimit,
+;    const char *blimit,
 ;    const char *limit,
 ;    const char *thresh,
 ;    int  count
@@ -122,12 +122,10 @@
         paddusb     mm5, mm5              ; abs(p0-q0)*2
         paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        mov         rdx, arg(2) ;flimit           ; get flimit
-        movq        mm2, [rdx]            ; flimit mm2
-        paddb       mm2, mm2              ; flimit*2 (less than 255)
-        paddb       mm7, mm2              ; flimit * 2 + limit (less than 255)
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm7, [rdx]            ; blimit
 
-        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
         por         mm1,    mm5
         pxor        mm5,    mm5
         pcmpeqb     mm1,    mm5           ; mask mm1
@@ -230,7 +228,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
-;    const char *flimit,
+;    const char *blimit,
 ;    const char *limit,
 ;    const char *thresh,
 ;    int count
@@ -406,9 +404,9 @@
         pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
         psrlw       mm5,        1                           ; abs(p1-q1)/2
 
-        mov         rdx,        arg(2) ;flimit                      ;
+        mov         rdx,        arg(2) ;blimit                      ;
 
-        movq        mm2,        [rdx]                       ;flimit  mm2
+        movq        mm4,        [rdx]                       ;blimit
         movq        mm1,        mm3                         ; mm1=mm3=p0
 
         movq        mm7,        mm6                         ; mm7=mm6=q0
@@ -419,10 +417,7 @@
         paddusb     mm1,        mm1                         ; abs(q0-p0)*2
         paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        paddb       mm2,        mm2                         ; flimit*2 (less than 255)
-        paddb       mm4,        mm2                         ; flimit * 2 + limit (less than 255)
-
-        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
         por         mm1,        mm0;                        ; mask
 
         pxor        mm0,        mm0
@@ -603,7 +598,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
-;    const char *flimit,
+;    const char *blimit,
 ;    const char *limit,
 ;    const char *thresh,
 ;    int count
@@ -719,17 +714,15 @@
         paddusb     mm5, mm5              ; abs(p0-q0)*2
         paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        mov         rdx, arg(2) ;flimit           ; get flimit
-        movq        mm2, [rdx]            ; flimit mm2
-        paddb       mm2, mm2              ; flimit*2 (less than 255)
-        paddb       mm7, mm2              ; flimit * 2 + limit (less than 255)
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm7, [rdx]            ; blimit
 
-        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
         por         mm1,    mm5
         pxor        mm5,    mm5
         pcmpeqb     mm1,    mm5           ; mask mm1
 
-        ; mm1 = mask, mm0=q0,  mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+        ; mm1 = mask, mm0=q0,  mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
         ; mm6 = p0,
 
         ; calculate high edge variance
@@ -922,7 +915,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
-;    const char *flimit,
+;    const char *blimit,
 ;    const char *limit,
 ;    const char *thresh,
 ;    int count
@@ -1108,9 +1101,9 @@
         pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
         psrlw       mm5,        1                           ; abs(p1-q1)/2
 
-        mov         rdx,        arg(2) ;flimit                      ;
+        mov         rdx,        arg(2) ;blimit                      ;
 
-        movq        mm2,        [rdx]                       ;flimit  mm2
+        movq        mm4,        [rdx]                       ;blimit
         movq        mm1,        mm3                         ; mm1=mm3=p0
 
         movq        mm7,        mm6                         ; mm7=mm6=q0
@@ -1121,10 +1114,7 @@
         paddusb     mm1,        mm1                         ; abs(q0-p0)*2
         paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        paddb       mm2,        mm2                         ; flimit*2 (less than 255)
-        paddb       mm4,        mm2                         ; flimit * 2 + limit (less than 255)
-
-        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
         por         mm1,        mm0;                        ; mask
 
         pxor        mm0,        mm0
@@ -1392,16 +1382,13 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
-;    const char *flimit,
-;    const char *limit,
-;    const char *thresh,
-;    int count
+;    const char *blimit
 ;)
 global sym(vp8_loop_filter_simple_horizontal_edge_mmx)
 sym(vp8_loop_filter_simple_horizontal_edge_mmx):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
+    SHADOW_ARGS_TO_STACK 3
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1410,14 +1397,10 @@
         mov         rsi, arg(0) ;src_ptr
         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
 
-        movsxd      rcx, dword ptr arg(5) ;count
+        mov         rcx, 2                ; count
 nexts8_h:
-        mov         rdx, arg(3) ;limit
-        movq        mm7, [rdx]
-        mov         rdx, arg(2) ;flimit           ; get flimit
+        mov         rdx, arg(2) ;blimit           ; get blimit
         movq        mm3, [rdx]            ;
-        paddb       mm3, mm3              ; flimit*2 (less than 255)
-        paddb       mm3, mm7              ; flimit * 2 + limit (less than 255)
 
         mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
         add         rdi, rax
@@ -1445,7 +1428,7 @@
         paddusb     mm5, mm5              ; abs(p0-q0)*2
         paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        psubusb     mm5, mm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     mm5, mm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
         pxor        mm3, mm3
         pcmpeqb     mm5, mm3
 
@@ -1515,16 +1498,13 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
-;    const char *flimit,
-;    const char *limit,
-;    const char *thresh,
-;    int count
+;    const char *blimit
 ;)
 global sym(vp8_loop_filter_simple_vertical_edge_mmx)
 sym(vp8_loop_filter_simple_vertical_edge_mmx):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
+    SHADOW_ARGS_TO_STACK 3
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1539,7 +1519,7 @@
         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
 
         lea         rsi, [rsi + rax*4- 2];  ;
-        movsxd      rcx, dword ptr arg(5) ;count
+        mov         rcx, 2                                      ; count
 nexts8_v:
 
         lea         rdi,        [rsi + rax];
@@ -1602,14 +1582,10 @@
         paddusb     mm5,        mm5                             ; abs(p0-q0)*2
         paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        mov         rdx,        arg(2) ;flimit                          ; get flimit
+        mov         rdx,        arg(2) ;blimit                          ; get blimit
         movq        mm7,        [rdx]
-        mov         rdx,        arg(3)                          ; get limit
-        movq        mm6,        [rdx]
-        paddb       mm7,        mm7                             ; flimit*2 (less than 255)
-        paddb       mm7,        mm6                             ; flimit * 2 + limit (less than 255)
 
-        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
         pxor        mm7,        mm7
         pcmpeqb     mm5,        mm7                             ; mm5 = mask
 
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -110,7 +110,7 @@
         psubusb     xmm6,                   xmm5              ; p1-=p0
 
         por         xmm6,                   xmm4              ; abs(p1 - p0)
-        mov         rdx,                    arg(2)            ; get flimit
+        mov         rdx,                    arg(2)            ; get blimit
 
         movdqa        t1,                   xmm6              ; save to t1
 
@@ -123,7 +123,7 @@
         psubusb     xmm1,                   xmm7
         por         xmm2,                   xmm3              ; abs(p1-q1)
 
-        movdqa      xmm4,                   XMMWORD PTR [rdx] ; flimit
+        movdqa      xmm7,                   XMMWORD PTR [rdx] ; blimit
 
         movdqa      xmm3,                   xmm0              ; q0
         pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero
@@ -134,13 +134,11 @@
         psrlw       xmm2,                   1                 ; abs(p1-q1)/2
 
         psubusb     xmm5,                   xmm3              ; p0-=q0
-        paddb       xmm4,                   xmm4              ; flimit*2 (less than 255)
 
         psubusb     xmm3,                   xmm6              ; q0-=p0
         por         xmm5,                   xmm3              ; abs(p0 - q0)
 
         paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
-        paddb       xmm7,                   xmm4              ; flimit * 2 + limit (less than 255)
 
         movdqa      xmm4,                   t0                ; hev get abs (q1 - q0)
 
@@ -150,7 +148,7 @@
 
         movdqa      xmm2,                   XMMWORD PTR [rdx] ; hev
 
-        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
         psubusb     xmm4,                   xmm2              ; hev
 
         psubusb     xmm3,                   xmm2              ; hev
@@ -278,7 +276,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    int            count
@@ -328,7 +326,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    int            count
@@ -574,7 +572,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    int            count
@@ -624,7 +622,7 @@
 ;(
 ;    unsigned char *u,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    unsigned char *v
@@ -904,7 +902,7 @@
         movdqa      xmm4,               XMMWORD PTR [rdx]; limit
 
         pmaxub      xmm0,               xmm7
-        mov         rdx,                arg(2)          ; flimit
+        mov         rdx,                arg(2)          ; blimit
 
         psubusb     xmm0,               xmm4
         movdqa      xmm5,               xmm2            ; q1
@@ -921,12 +919,11 @@
         psrlw       xmm5,               1               ; abs(p1-q1)/2
         psubusb     xmm6,               xmm3            ; q0-p0
 
-        movdqa      xmm2,               XMMWORD PTR [rdx]; flimit
+        movdqa      xmm4,               XMMWORD PTR [rdx]; blimit
 
         mov         rdx,                arg(4)          ; get thresh
 
         por         xmm1,               xmm6            ; abs(q0-p0)
-        paddb       xmm2,               xmm2            ; flimit*2 (less than 255)
 
         movdqa      xmm6,               t0              ; get abs (q1 - q0)
 
@@ -939,10 +936,9 @@
         paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
         psubusb     xmm6,               xmm7            ; abs(q1 - q0) > thresh
 
-        paddb       xmm4,               xmm2            ; flimit * 2 + limit (less than 255)
         psubusb     xmm3,               xmm7            ; abs(p1 - p0)> thresh
 
-        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
         por         xmm6,               xmm3            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
 
         por         xmm1,               xmm0            ; mask
@@ -1014,7 +1010,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    int            count
@@ -1081,7 +1077,7 @@
 ;(
 ;    unsigned char *u,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    unsigned char *v
@@ -1239,7 +1235,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    int            count
@@ -1308,7 +1304,7 @@
 ;(
 ;    unsigned char *u,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    unsigned char *v
@@ -1376,16 +1372,13 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
-;    const char *flimit,
-;    const char *limit,
-;    const char *thresh,
-;    int count
+;    const char *blimit,
 ;)
 global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
 sym(vp8_loop_filter_simple_horizontal_edge_sse2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
+    SHADOW_ARGS_TO_STACK 3
     SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
@@ -1394,14 +1387,9 @@
 
         mov         rsi, arg(0)             ;src_ptr
         movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?
-        mov         rdx, arg(2) ;flimit     ; get flimit
+        mov         rdx, arg(2)             ;blimit
         movdqa      xmm3, XMMWORD PTR [rdx]
-        mov         rdx, arg(3) ;limit
-        movdqa      xmm7, XMMWORD PTR [rdx]
 
-        paddb       xmm3, xmm3              ; flimit*2 (less than 255)
-        paddb       xmm3, xmm7              ; flimit * 2 + limit (less than 255)
-
         mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
         add         rdi, rax
         neg         rax
@@ -1428,7 +1416,7 @@
         paddusb     xmm5, xmm5              ; abs(p0-q0)*2
         paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
         pxor        xmm3, xmm3
         pcmpeqb     xmm5, xmm3
 
@@ -1493,16 +1481,13 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
-;    const char *flimit,
-;    const char *limit,
-;    const char *thresh,
-;    int count
+;    const char *blimit,
 ;)
 global sym(vp8_loop_filter_simple_vertical_edge_sse2)
 sym(vp8_loop_filter_simple_vertical_edge_sse2):
     push        rbp         ; save old base pointer value.
     mov         rbp, rsp    ; set new base pointer value.
-    SHADOW_ARGS_TO_STACK 6
+    SHADOW_ARGS_TO_STACK 3
     SAVE_XMM 7
     GET_GOT     rbx         ; save callee-saved reg
     push        rsi
@@ -1607,14 +1592,10 @@
         paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
         paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        mov         rdx,        arg(2)                          ;flimit
+        mov         rdx,        arg(2)                          ;blimit
         movdqa      xmm7, XMMWORD PTR [rdx]
-        mov         rdx,        arg(3)                          ; get limit
-        movdqa      xmm6, XMMWORD PTR [rdx]
-        paddb       xmm7,        xmm7                           ; flimit*2 (less than 255)
-        paddb       xmm7,        xmm6                           ; flimit * 2 + limit (less than 255)
 
-        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
         pxor        xmm7,        xmm7
         pcmpeqb     xmm5,        xmm7                           ; mm5 = mask
 
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -9,30 +9,18 @@
  */
 
 
-#include "vpx_ports/config.h"
+#include "vpx_config.h"
 #include "vp8/common/loopfilter.h"
 
-prototype_loopfilter(vp8_loop_filter_horizontal_edge_c);
-prototype_loopfilter(vp8_loop_filter_vertical_edge_c);
-prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c);
-prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c);
-prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
-prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
-
 prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
 prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
 prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
 prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
-prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
-prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
 
 prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2);
 prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2);
 prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2);
 prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_sse2);
-prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2);
-prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2);
-prototype_loopfilter(vp8_fast_loop_filter_vertical_edges_sse2);
 
 extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
 extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
@@ -44,75 +32,51 @@
 void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
 
-void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
-}
-
-
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
 
-void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
-}
-
-
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
 
-void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit);
 }
 
 
@@ -120,27 +84,23 @@
 void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
 
-void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
 }
 #endif
 
@@ -150,66 +110,42 @@
 void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
+        vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
 }
 
 
-void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
-}
-
-
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
+        vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
 }
 
 
-void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
-}
-
-
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride);
+        vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride);
 }
 
 
-void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit);
 }
 
 
@@ -217,36 +153,20 @@
 void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4);
+        vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4);
 }
 
 
-void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
 }
 
-#endif
-
-#if 0
-void vp8_fast_loop_filter_vertical_edges_sse(unsigned char *y_ptr,
-        int y_stride,
-        loop_filter_info *lfi)
-{
-
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-}
 #endif
--- a/vp8/common/x86/loopfilter_x86.h
+++ b/vp8/common/x86/loopfilter_x86.h
@@ -24,10 +24,10 @@
 extern prototype_loopfilter_block(vp8_loop_filter_bv_mmx);
 extern prototype_loopfilter_block(vp8_loop_filter_mbh_mmx);
 extern prototype_loopfilter_block(vp8_loop_filter_bh_mmx);
-extern prototype_loopfilter_block(vp8_loop_filter_mbvs_mmx);
-extern prototype_loopfilter_block(vp8_loop_filter_bvs_mmx);
-extern prototype_loopfilter_block(vp8_loop_filter_mbhs_mmx);
-extern prototype_loopfilter_block(vp8_loop_filter_bhs_mmx);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
+extern prototype_simple_loopfilter(vp8_loop_filter_bvs_mmx);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
+extern prototype_simple_loopfilter(vp8_loop_filter_bhs_mmx);
 
 
 #if !CONFIG_RUNTIME_CPU_DETECT
@@ -44,13 +44,13 @@
 #define vp8_lf_normal_b_h vp8_loop_filter_bh_mmx
 
 #undef  vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_mmx
+#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_mmx
 
 #undef  vp8_lf_simple_b_v
 #define vp8_lf_simple_b_v vp8_loop_filter_bvs_mmx
 
 #undef  vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_mmx
+#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_mmx
 
 #undef  vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_mmx
@@ -63,10 +63,10 @@
 extern prototype_loopfilter_block(vp8_loop_filter_bv_sse2);
 extern prototype_loopfilter_block(vp8_loop_filter_mbh_sse2);
 extern prototype_loopfilter_block(vp8_loop_filter_bh_sse2);
-extern prototype_loopfilter_block(vp8_loop_filter_mbvs_sse2);
-extern prototype_loopfilter_block(vp8_loop_filter_bvs_sse2);
-extern prototype_loopfilter_block(vp8_loop_filter_mbhs_sse2);
-extern prototype_loopfilter_block(vp8_loop_filter_bhs_sse2);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2);
+extern prototype_simple_loopfilter(vp8_loop_filter_bvs_sse2);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2);
+extern prototype_simple_loopfilter(vp8_loop_filter_bhs_sse2);
 
 
 #if !CONFIG_RUNTIME_CPU_DETECT
@@ -83,13 +83,13 @@
 #define vp8_lf_normal_b_h vp8_loop_filter_bh_sse2
 
 #undef  vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_sse2
+#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_sse2
 
 #undef  vp8_lf_simple_b_v
 #define vp8_lf_simple_b_v vp8_loop_filter_bvs_sse2
 
 #undef  vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_sse2
+#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_sse2
 
 #undef  vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_sse2
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -9,7 +9,7 @@
  */
 
 
-#include "vpx_ports/config.h"
+#include "vpx_config.h"
 #include "vpx_ports/x86.h"
 #include "vp8/common/g_common.h"
 #include "vp8/common/subpixel.h"
@@ -63,9 +63,9 @@
         rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_mmx;
         rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_mmx;
         rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_mmx;
-        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_mmx;
+        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_mmx;
         rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_mmx;
-        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_mmx;
+        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_mmx;
         rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_mmx;
 
 #if CONFIG_POSTPROC
@@ -101,9 +101,9 @@
         rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_sse2;
         rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_sse2;
         rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_sse2;
-        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_sse2;
+        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_sse2;
         rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_sse2;
-        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_sse2;
+        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_sse2;
         rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_sse2;
 
 #if CONFIG_POSTPROC