shithub: libvpx

Download patch

ref: 4c53bacce4a97d98a4e73262bb3517d38ddd3514
parent: 9704cdec9fb777833599312fc84e3f1311d25eed
author: Yunqing Wang <yunqingwang@google.com>
date: Fri Sep 28 06:13:07 EDT 2012

post-proc: deblock filter optimization

1. Algorithm modification:
Instead of having same filter threshold for a whole frame, now we
allow the thresholds to be adjusted for each macroblock. In current
implementation, to avoid excessive blur on background as reported
in issue480(http://code.google.com/p/webm/issues/detail?id=480), we
reduce the thresholds for skipped macroblocks.

2. SSE2 optimization:
As started in issue479(http://code.google.com/p/webm/issues/detail?id=479),
the filter calculation was adjusted for better performance. The c
code was also modified accordingly. This made the deblock filter
2x faster, and the decoder was 1.2x faster overall.

Next, the demacroblock filter will be modified similarly.

Change-Id: I05e54c3f580ccd427487d085096b3174f2ab7e86

--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -19,9 +19,9 @@
                                  unsigned char *dst_ptr,
                                  int src_pixels_per_line,
                                  int dst_pixels_per_line,
-                                 int rows,
                                  int cols,
-                                 int flimit);
+                                 unsigned char *flimit,
+                                 int size);
 
 namespace {
 
@@ -29,7 +29,7 @@
     : public ::testing::TestWithParam<post_proc_func_t> {};
 
 // Test routine for the VP8 post-processing function
-// vp8_post_proc_down_and_across_c.
+// vp8_post_proc_down_and_across_mb_row_c.
 
 TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
   // Size of the underlying data block that will be filtered.
@@ -56,6 +56,8 @@
   // Pointers to top-left pixel of block in the input and output images.
   uint8_t *const src_image_ptr = src_image + (input_stride << 1);
   uint8_t *const dst_image_ptr = dst_image + 8;
+  uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
+  (void)vpx_memset(flimits, 255, block_width);
 
   // Initialize pixels in the input:
   //   block pixels to value 1,
@@ -73,14 +75,13 @@
   (void)vpx_memset(dst_image, 99, output_size);
 
   GetParam()(src_image_ptr, dst_image_ptr, input_stride,
-             output_stride, block_height, block_width,
-             255);
+             output_stride, block_width, flimits, 16);
 
   static const uint8_t expected_data[block_height] = {
-    3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3
+    4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
   };
 
-  pixel_ptr = dst_image;
+  pixel_ptr = dst_image_ptr;
   for (int i = 0; i < block_height; ++i) {
     for (int j = 0; j < block_width; ++j) {
       EXPECT_EQ(expected_data[i], pixel_ptr[j])
@@ -91,19 +92,15 @@
 
   vpx_free(src_image);
   vpx_free(dst_image);
+  vpx_free(flimits);
 };
 
 INSTANTIATE_TEST_CASE_P(C, Vp8PostProcessingFilterTest,
-                        ::testing::Values(vp8_post_proc_down_and_across_c));
+    ::testing::Values(vp8_post_proc_down_and_across_mb_row_c));
 
-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(MMX, Vp8PostProcessingFilterTest,
-                        ::testing::Values(vp8_post_proc_down_and_across_mmx));
-#endif
-
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2, Vp8PostProcessingFilterTest,
-                        ::testing::Values(vp8_post_proc_down_and_across_xmm));
+    ::testing::Values(vp8_post_proc_down_and_across_mb_row_sse2));
 #endif
 
 }  // namespace
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -127,25 +127,24 @@
 extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
 /***********************************************************************************************************
  */
-void vp8_post_proc_down_and_across_c
+void vp8_post_proc_down_and_across_mb_row_c
 (
     unsigned char *src_ptr,
     unsigned char *dst_ptr,
     int src_pixels_per_line,
     int dst_pixels_per_line,
-    int rows,
     int cols,
-    int flimit
+    unsigned char *f,
+    int size
 )
 {
     unsigned char *p_src, *p_dst;
     int row;
     int col;
-    int i;
-    int v;
-    unsigned char d[8];
+    unsigned char v;
+    unsigned char d[4];
 
-    for (row = 0; row < rows; row++)
+    for (row = 0; row < size; row++)
     {
         /* post_proc_down for one row */
         p_src = src_ptr;
@@ -153,20 +152,23 @@
 
         for (col = 0; col < cols; col++)
         {
+            unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
+            unsigned char p_above1 = p_src[col - src_pixels_per_line];
+            unsigned char p_below1 = p_src[col + src_pixels_per_line];
+            unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
 
-            int kernel = 4;
-            int v = p_src[col];
+            v = p_src[col];
 
-            for (i = -2; i <= 2; i++)
+            if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
+                && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col]))
             {
-                if (abs(v - p_src[col+i*src_pixels_per_line]) > flimit)
-                    goto down_skip_convolve;
-
-                kernel += kernel5[2+i] * p_src[col+i*src_pixels_per_line];
+                unsigned char k1, k2, k3;
+                k1 = (p_above2 + p_above1 + 1) >> 1;
+                k2 = (p_below2 + p_below1 + 1) >> 1;
+                k3 = (k1 + k2 + 1) >> 1;
+                v = (k3 + v + 1) >> 1;
             }
 
-            v = (kernel >> 3);
-        down_skip_convolve:
             p_dst[col] = v;
         }
 
@@ -174,40 +176,34 @@
         p_src = dst_ptr;
         p_dst = dst_ptr;
 
-        for (i = -8; i<0; i++)
-          p_src[i]=p_src[0];
+        p_src[-2] = p_src[-1] = p_src[0];
+        p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
 
-        for (i = cols; i<cols+8; i++)
-          p_src[i]=p_src[cols-1];
-
-        for (i = 0; i < 8; i++)
-            d[i] = p_src[i];
-
         for (col = 0; col < cols; col++)
         {
-            int kernel = 4;
             v = p_src[col];
 
-            d[col&7] = v;
-
-            for (i = -2; i <= 2; i++)
+            if ((abs(v - p_src[col - 2]) < f[col])
+                && (abs(v - p_src[col - 1]) < f[col])
+                && (abs(v - p_src[col + 1]) < f[col])
+                && (abs(v - p_src[col + 2]) < f[col]))
             {
-                if (abs(v - p_src[col+i]) > flimit)
-                    goto across_skip_convolve;
-
-                kernel += kernel5[2+i] * p_src[col+i];
+                unsigned char k1, k2, k3;
+                k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
+                k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
+                k3 = (k1 + k2 + 1) >> 1;
+                v = (k3 + v + 1) >> 1;
             }
 
-            d[col&7] = (kernel >> 3);
-        across_skip_convolve:
+            d[col & 3] = v;
 
             if (col >= 2)
-                p_dst[col-2] = d[(col-2)&7];
+                p_dst[col - 2] = d[(col - 2) & 3];
         }
 
         /* handle the last two pixels */
-        p_dst[col-2] = d[(col-2)&7];
-        p_dst[col-1] = d[(col-1)&7];
+        p_dst[col - 2] = d[(col - 2) & 3];
+        p_dst[col - 1] = d[(col - 1) & 3];
 
         /* next row */
         src_ptr += src_pixels_per_line;
@@ -318,28 +314,17 @@
     }
 }
 
-
-static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG         *source,
-        YV12_BUFFER_CONFIG         *post,
-        int                         q,
-        int                         low_var_thresh,
-        int                         flag)
+static void vp8_de_mblock(YV12_BUFFER_CONFIG         *post,
+                          int                         q)
 {
-    double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
-    int ppl = (int)(level + .5);
-    (void) low_var_thresh;
-    (void) flag;
-
-    vp8_post_proc_down_and_across(source->y_buffer, post->y_buffer, source->y_stride,  post->y_stride, source->y_height, source->y_width,  ppl);
-    vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
-    vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
-
-    vp8_post_proc_down_and_across(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
-    vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
-
+    vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+                              post->y_width, q2mbl(q));
+    vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+                         post->y_width, q2mbl(q));
 }
 
-void vp8_deblock(YV12_BUFFER_CONFIG         *source,
+void vp8_deblock(VP8_COMMON                 *cm,
+                 YV12_BUFFER_CONFIG         *source,
                  YV12_BUFFER_CONFIG         *post,
                  int                         q,
                  int                         low_var_thresh,
@@ -347,12 +332,58 @@
 {
     double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
     int ppl = (int)(level + .5);
+
+    const MODE_INFO *mode_info_context = cm->mi;
+    int mbr, mbc;
+
+    /* The pixel thresholds are adjusted according to if or not the macroblock
+     * is a skipped block.  */
+    unsigned char *ylimits = (unsigned char *)vpx_memalign(16, 16 * cm->mb_cols);
+    unsigned char *uvlimits = (unsigned char *)vpx_memalign(16, 8 * cm->mb_cols);
     (void) low_var_thresh;
     (void) flag;
 
-    vp8_post_proc_down_and_across(source->y_buffer, post->y_buffer, source->y_stride,  post->y_stride, source->y_height, source->y_width,   ppl);
-    vp8_post_proc_down_and_across(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride,  source->uv_height, source->uv_width, ppl);
-    vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
+    if (ppl > 0)
+    {
+        for (mbr = 0; mbr < cm->mb_rows; mbr++)
+        {
+            unsigned char *ylptr = ylimits;
+            unsigned char *uvlptr = uvlimits;
+            for (mbc = 0; mbc < cm->mb_cols; mbc++)
+            {
+                unsigned char mb_ppl;
+
+                if (mode_info_context->mbmi.mb_skip_coeff)
+                    mb_ppl = (unsigned char)ppl >> 1;
+                else
+                    mb_ppl = (unsigned char)ppl;
+
+                vpx_memset(ylptr, mb_ppl, 16);
+                vpx_memset(uvlptr, mb_ppl, 8);
+
+                ylptr += 16;
+                uvlptr += 8;
+                mode_info_context++;
+            }
+            mode_info_context++;
+
+            vp8_post_proc_down_and_across_mb_row(
+                source->y_buffer + 16 * mbr * source->y_stride,
+                post->y_buffer + 16 * mbr * post->y_stride, source->y_stride,
+                post->y_stride, source->y_width, ylimits, 16);
+
+            vp8_post_proc_down_and_across_mb_row(
+                source->u_buffer + 8 * mbr * source->uv_stride,
+                post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
+                post->uv_stride, source->uv_width, uvlimits, 8);
+            vp8_post_proc_down_and_across_mb_row(
+                source->v_buffer + 8 * mbr * source->uv_stride,
+                post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
+                post->uv_stride, source->uv_width, uvlimits, 8);
+        }
+    }
+    vpx_free(ylimits);
+    vpx_free(uvlimits);
 }
 
 #if !(CONFIG_TEMPORAL_DENOISING)
@@ -364,33 +395,35 @@
 {
     double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
     int ppl = (int)(level + .5);
+    int mb_rows = source->y_width >> 4;
+    int mb_cols = source->y_height >> 4;
+    unsigned char *limits = (unsigned char *)vpx_memalign(16, 16 * mb_cols);
+    int mbr, mbc;
     (void) post;
     (void) low_var_thresh;
     (void) flag;
 
-    vp8_post_proc_down_and_across(
-        source->y_buffer + 2 * source->y_stride + 2,
-        source->y_buffer + 2 * source->y_stride + 2,
-        source->y_stride,
-        source->y_stride,
-        source->y_height - 4,
-        source->y_width - 4,
-        ppl);
-    vp8_post_proc_down_and_across(
-        source->u_buffer + 2 * source->uv_stride + 2,
-        source->u_buffer + 2 * source->uv_stride + 2,
-        source->uv_stride,
-        source->uv_stride,
-        source->uv_height - 4,
-        source->uv_width - 4, ppl);
-    vp8_post_proc_down_and_across(
-        source->v_buffer + 2 * source->uv_stride + 2,
-        source->v_buffer + 2 * source->uv_stride + 2,
-        source->uv_stride,
-        source->uv_stride,
-        source->uv_height - 4,
-        source->uv_width - 4, ppl);
+    /* TODO: The original code don't filter the 2 outer rows and columns. */
+    vpx_memset(limits, (unsigned char)ppl, 16 * mb_cols);
 
+    for (mbr = 0; mbr < mb_rows; mbr++)
+    {
+        vp8_post_proc_down_and_across_mb_row(
+            source->y_buffer + 16 * mbr * source->y_stride,
+            source->y_buffer + 16 * mbr * source->y_stride,
+            source->y_stride, source->y_stride, source->y_width, limits, 16);
+
+        vp8_post_proc_down_and_across_mb_row(
+            source->u_buffer + 8 * mbr * source->uv_stride,
+            source->u_buffer + 8 * mbr * source->uv_stride,
+            source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
+        vp8_post_proc_down_and_across_mb_row(
+            source->v_buffer + 8 * mbr * source->uv_stride,
+            source->v_buffer + 8 * mbr * source->uv_stride,
+            source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
+    }
+
+    vpx_free(limits);
 }
 #endif
 
@@ -752,12 +785,14 @@
             vp8_yv12_copy_frame(&oci->post_proc_buffer, &oci->post_proc_buffer_int);
             if (flags & VP8D_DEMACROBLOCK)
             {
-                vp8_deblock_and_de_macro_block(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
+                vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
                                                q + (deblock_level - 5) * 10, 1, 0);
+                vp8_de_mblock(&oci->post_proc_buffer,
+                              q + (deblock_level - 5) * 10);
             }
             else if (flags & VP8D_DEBLOCK)
             {
-                vp8_deblock(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
+                vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
                             q, 1, 0);
             }
         }
@@ -766,13 +801,15 @@
     }
     else if (flags & VP8D_DEMACROBLOCK)
     {
-        vp8_deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
-                                       q + (deblock_level - 5) * 10, 1, 0);
+        vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
+                                     q + (deblock_level - 5) * 10, 1, 0);
+        vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10);
+
         oci->postproc_state.last_base_qindex = oci->base_qindex;
     }
     else if (flags & VP8D_DEBLOCK)
     {
-        vp8_deblock(oci->frame_to_show, &oci->post_proc_buffer,
+        vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
                     q, 1, 0);
         oci->postproc_state.last_base_qindex = oci->base_qindex;
     }
--- a/vp8/common/postproc.h
+++ b/vp8/common/postproc.h
@@ -36,7 +36,8 @@
                   int                         low_var_thresh,
                   int                         flag);
 
-void vp8_deblock(YV12_BUFFER_CONFIG         *source,
+void vp8_deblock(struct VP8Common           *oci,
+                 YV12_BUFFER_CONFIG         *source,
                  YV12_BUFFER_CONFIG         *post,
                  int                         q,
                  int                         low_var_thresh,
--- a/vp8/common/ppc/systemdependent.c
+++ b/vp8/common/ppc/systemdependent.c
@@ -19,14 +19,14 @@
 void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
 void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
 
-extern void (*vp8_post_proc_down_and_across)(
+extern void (*vp8_post_proc_down_and_across_mb_row)(
     unsigned char *src_ptr,
     unsigned char *dst_ptr,
     int src_pixels_per_line,
     int dst_pixels_per_line,
-    int rows,
     int cols,
-    int flimit
+    unsigned char *f,
+    int size
 );
 
 extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
@@ -34,15 +34,15 @@
 extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
 extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
 
-extern void vp8_post_proc_down_and_across_c
+extern void vp8_post_proc_down_and_across_mb_row_c
 (
     unsigned char *src_ptr,
     unsigned char *dst_ptr,
     int src_pixels_per_line,
     int dst_pixels_per_line,
-    int rows,
     int cols,
-    int flimit
+    unsigned char *f,
+    int size
 );
 void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
 
@@ -158,7 +158,7 @@
     vp8_lf_mbhsimple                     = loop_filter_mbhs_ppc;
     vp8_lf_bhsimple                      = loop_filter_bhs_ppc;
 
-    vp8_post_proc_down_and_across           = vp8_post_proc_down_and_across_c;
+    vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
     vp8_mbpost_proc_down                  = vp8_mbpost_proc_down_c;
     vp8_mbpost_proc_across_ip              = vp8_mbpost_proc_across_ip_c;
     vp8_plane_add_noise                   = vp8_plane_add_noise_c;
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -162,9 +162,8 @@
     specialize vp8_mbpost_proc_across_ip sse2
     vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm
 
-    prototype void vp8_post_proc_down_and_across "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int rows, int cols, int flimit"
-    specialize vp8_post_proc_down_and_across mmx sse2
-    vp8_post_proc_down_and_across_sse2=vp8_post_proc_down_and_across_xmm
+    prototype void vp8_post_proc_down_and_across_mb_row "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"
+    specialize vp8_post_proc_down_and_across_mb_row sse2
 
     prototype void vp8_plane_add_noise "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch"
     specialize vp8_plane_add_noise mmx sse2
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -14,271 +14,6 @@
 %define VP8_FILTER_WEIGHT 128
 %define VP8_FILTER_SHIFT  7
 
-;void vp8_post_proc_down_and_across_mmx
-;(
-;    unsigned char *src_ptr,
-;    unsigned char *dst_ptr,
-;    int src_pixels_per_line,
-;    int dst_pixels_per_line,
-;    int rows,
-;    int cols,
-;    int flimit
-;)
-global sym(vp8_post_proc_down_and_across_mmx) PRIVATE
-sym(vp8_post_proc_down_and_across_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-    ; move the global rd onto the stack, since we don't have enough registers
-    ; to do PIC addressing
-    movq        mm0, [GLOBAL(rd)]
-    sub         rsp, 8
-    movq        [rsp], mm0
-%define RD [rsp]
-%else
-%define RD [GLOBAL(rd)]
-%endif
-
-        push        rbx
-        lea         rbx, [GLOBAL(Blur)]
-        movd        mm2, dword ptr arg(6) ;flimit
-        punpcklwd   mm2, mm2
-        punpckldq   mm2, mm2
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(1) ;dst_ptr
-
-        movsxd      rcx, DWORD PTR arg(4) ;rows
-        movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
-        pxor        mm0, mm0              ; mm0 = 00000000
-
-.nextrow:
-
-        xor         rdx,        rdx       ; clear out rdx for use as loop counter
-.nextcol:
-
-        pxor        mm7, mm7              ; mm7 = 00000000
-        movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
-        movq        mm3, [rsi]            ; mm4 = r0 p0..p7
-        punpcklbw   mm3, mm0              ; mm3 = p0..p3
-        movq        mm1, mm3              ; mm1 = p0..p3
-        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
-
-        movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
-        movq        mm5, [rsi + rax]      ; mm4 = r1 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r1 p0..p3
-        pmullw      mm6, mm5              ; mm6 *= p0..p3 * kernel 3 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm7, mm1              ; mm7 = r0 p0..p3
-        psubusw     mm7, mm5              ; mm7 = r0 p0..p3 - r1 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r1 p0..p3 - r0 p0..p3
-        paddusw     mm7, mm5              ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
-        pcmpgtw     mm7, mm2
-
-        movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 modifiers
-        movq        mm5, [rsi + 2*rax]    ; mm4 = r2 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r2 p0..p3
-        pmullw      mm6, mm5              ; mm5 *= kernel 4 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm5              ; mm6 = r0 p0..p3 - r2 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r2 p0..p3 - r2 p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        neg         rax
-        movq        mm6, [rbx ]           ; kernel 0 taps
-        movq        mm5, [rsi+2*rax]      ; mm4 = r-2 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r-2 p0..p3
-        pmullw      mm6, mm5              ; mm5 *= kernel 0 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - r-2 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r-2 p0..p3 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        movq        mm6, [rbx + 16]       ; kernel 1 taps
-        movq        mm4, [rsi+rax]        ; mm4 = r-1 p0..p7
-        punpcklbw   mm4, mm0              ; mm4 = r-1 p0..p3
-        pmullw      mm6, mm4              ; mm4 *= kernel 1 modifiers.
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm4              ; mm6 = p0..p3 - r-2 p0..p3
-        psubusw     mm4, mm1              ; mm5 = r-1 p0..p3 - p0..p3
-        paddusw     mm6, mm4              ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        paddusw     mm3, RD               ; mm3 += round value
-        psraw       mm3, VP8_FILTER_SHIFT     ; mm3 /= 128
-
-        pand        mm1, mm7              ; mm1 select vals > thresh from source
-        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
-        paddusw     mm1, mm7              ; combination
-
-        packuswb    mm1, mm0              ; pack to bytes
-
-        movd        [rdi], mm1            ;
-        neg         rax                   ; pitch is positive
-
-
-        add         rsi, 4
-        add         rdi, 4
-        add         rdx, 4
-
-        cmp         edx, dword ptr arg(5) ;cols
-        jl          .nextcol
-        ; done with the all cols, start the across filtering in place
-        sub         rsi, rdx
-        sub         rdi, rdx
-
-        ; dup the first byte into the left border 8 times
-        movq        mm1,   [rdi]
-        punpcklbw   mm1,   mm1
-        punpcklwd   mm1,   mm1
-        punpckldq   mm1,   mm1
-
-        mov         rdx,    -8
-        movq        [rdi+rdx], mm1
-
-        ; dup the last byte into the right border
-        movsxd      rdx,    dword arg(5)
-        movq        mm1,   [rdi + rdx + -1]
-        punpcklbw   mm1,   mm1
-        punpcklwd   mm1,   mm1
-        punpckldq   mm1,   mm1
-        movq        [rdi+rdx], mm1
-
-
-        push        rax
-        xor         rdx,    rdx
-        mov         rax,    [rdi-4];
-
-.acrossnextcol:
-        pxor        mm7, mm7              ; mm7 = 00000000
-        movq        mm6, [rbx + 32 ]      ;
-        movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
-        movq        mm3, mm4              ; mm3 = p0..p7
-        punpcklbw   mm3, mm0              ; mm3 = p0..p3
-        movq        mm1, mm3              ; mm1 = p0..p3
-        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
-
-        movq        mm6, [rbx + 48]
-        psrlq       mm4, 8                ; mm4 = p1..p7
-        movq        mm5, mm4              ; mm5 = p1..p7
-        punpcklbw   mm5, mm0              ; mm5 = p1..p4
-        pmullw      mm6, mm5              ; mm6 *= p1..p4 * kernel 3 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm7, mm1              ; mm7 = p0..p3
-        psubusw     mm7, mm5              ; mm7 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     mm7, mm5              ; mm7 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm7, mm2
-
-        movq        mm6, [rbx + 64 ]
-        psrlq       mm4, 8                ; mm4 = p2..p7
-        movq        mm5, mm4              ; mm5 = p2..p7
-        punpcklbw   mm5, mm0              ; mm5 = p2..p5
-        pmullw      mm6, mm5              ; mm5 *= kernel 4 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        movq        mm6, [rbx ]
-        movq        mm4, [rdi+rdx-2]      ; mm4 = p-2..p5
-        movq        mm5, mm4              ; mm5 = p-2..p5
-        punpcklbw   mm5, mm0              ; mm5 = p-2..p1
-        pmullw      mm6, mm5              ; mm5 *= kernel 0 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        movq        mm6, [rbx + 16]
-        psrlq       mm4, 8                ; mm4 = p-1..p5
-        punpcklbw   mm4, mm0              ; mm4 = p-1..p2
-        pmullw      mm6, mm4              ; mm4 *= kernel 1 modifiers.
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm4              ; mm6 = p0..p3 - p1..p4
-        psubusw     mm4, mm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     mm6, mm4              ; mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        paddusw     mm3, RD               ; mm3 += round value
-        psraw       mm3, VP8_FILTER_SHIFT     ; mm3 /= 128
-
-        pand        mm1, mm7              ; mm1 select vals > thresh from source
-        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
-        paddusw     mm1, mm7              ; combination
-
-        packuswb    mm1, mm0              ; pack to bytes
-        mov         DWORD PTR [rdi+rdx-4],  eax   ; store previous four bytes
-        movd        eax,    mm1
-
-        add         rdx, 4
-        cmp         edx, dword ptr arg(5) ;cols
-        jl          .acrossnextcol;
-
-        mov         DWORD PTR [rdi+rdx-4],  eax
-        pop         rax
-
-        ; done with this rwo
-        add         rsi,rax               ; next line
-        movsxd      rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch?
-        add         rdi,rax               ; next destination
-        movsxd      rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
-
-        dec         rcx                   ; decrement count
-        jnz         .nextrow               ; next row
-        pop         rbx
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef RD
-
-
 ;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
 ;                             int pitch, int rows, int cols,int flimit)
 extern sym(vp8_rv)
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -11,128 +11,139 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-;void vp8_post_proc_down_and_across_xmm
+;macro in deblock functions
+%macro FIRST_2_ROWS 0
+        movdqa      xmm4,       xmm0
+        movdqa      xmm6,       xmm0
+        movdqa      xmm5,       xmm1
+        pavgb       xmm5,       xmm3
+
+        ;calculate absolute value
+        psubusb     xmm4,       xmm1
+        psubusb     xmm1,       xmm0
+        psubusb     xmm6,       xmm3
+        psubusb     xmm3,       xmm0
+        paddusb     xmm4,       xmm1
+        paddusb     xmm6,       xmm3
+
+        ;get threshold
+        movdqa      xmm2,       flimit
+        pxor        xmm1,       xmm1
+        movdqa      xmm7,       xmm2
+
+        ;get mask
+        psubusb     xmm2,       xmm4
+        psubusb     xmm7,       xmm6
+        pcmpeqb     xmm2,       xmm1
+        pcmpeqb     xmm7,       xmm1
+        por         xmm7,       xmm2
+%endmacro
+
+%macro SECOND_2_ROWS 0
+        movdqa      xmm6,       xmm0
+        movdqa      xmm4,       xmm0
+        movdqa      xmm2,       xmm1
+        pavgb       xmm1,       xmm3
+
+        ;calculate absolute value
+        psubusb     xmm6,       xmm2
+        psubusb     xmm2,       xmm0
+        psubusb     xmm4,       xmm3
+        psubusb     xmm3,       xmm0
+        paddusb     xmm6,       xmm2
+        paddusb     xmm4,       xmm3
+
+        pavgb       xmm5,       xmm1
+
+        ;get threshold
+        movdqa      xmm2,       flimit
+        pxor        xmm1,       xmm1
+        movdqa      xmm3,       xmm2
+
+        ;get mask
+        psubusb     xmm2,       xmm6
+        psubusb     xmm3,       xmm4
+        pcmpeqb     xmm2,       xmm1
+        pcmpeqb     xmm3,       xmm1
+
+        por         xmm7,       xmm2
+        por         xmm7,       xmm3
+
+        pavgb       xmm5,       xmm0
+
+        ;decide if or not to use filtered value
+        pand        xmm0,       xmm7
+        pandn       xmm7,       xmm5
+        paddusb     xmm0,       xmm7
+%endmacro
+
+%macro UPDATE_FLIMIT 0
+        movdqa      xmm2,       XMMWORD PTR [rbx]
+        movdqa      [rsp],      xmm2
+        add         rbx,        16
+%endmacro
+
+;void vp8_post_proc_down_and_across_mb_row_sse2
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned char *dst_ptr,
 ;    int src_pixels_per_line,
 ;    int dst_pixels_per_line,
-;    int rows,
 ;    int cols,
-;    int flimit
+;    int *flimits,
+;    int size
 ;)
-global sym(vp8_post_proc_down_and_across_xmm) PRIVATE
-sym(vp8_post_proc_down_and_across_xmm):
+global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
+sym(vp8_post_proc_down_and_across_mb_row_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
     SAVE_XMM 7
-    GET_GOT     rbx
+    push        rbx
     push        rsi
     push        rdi
     ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
     ALIGN_STACK 16, rax
-    ; move the global rd onto the stack, since we don't have enough registers
-    ; to do PIC addressing
-    movdqa      xmm0, [GLOBAL(rd42)]
     sub         rsp, 16
-    movdqa      [rsp], xmm0
-%define RD42 [rsp]
-%else
-%define RD42 [GLOBAL(rd42)]
-%endif
 
+        ; put flimit on stack
+        mov         rbx,        arg(5)           ;flimits ptr
+        UPDATE_FLIMIT
 
-        movd        xmm2,       dword ptr arg(6) ;flimit
-        punpcklwd   xmm2,       xmm2
-        punpckldq   xmm2,       xmm2
-        punpcklqdq  xmm2,       xmm2
+%define flimit [rsp]
 
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(1) ;dst_ptr
+        mov         rsi,        arg(0)           ;src_ptr
+        mov         rdi,        arg(1)           ;dst_ptr
 
-        movsxd      rcx,        DWORD PTR arg(4) ;rows
-        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
-        pxor        xmm0,       xmm0              ; mm0 = 00000000
-
+        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
+        movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
 .nextrow:
-
-        xor         rdx,        rdx       ; clear out rdx for use as loop counter
+        xor         rdx,        rdx              ;col
 .nextcol:
-        movq        xmm3,       QWORD PTR [rsi]         ; mm4 = r0 p0..p7
-        punpcklbw   xmm3,       xmm0                    ; mm3 = p0..p3
-        movdqa      xmm1,       xmm3                    ; mm1 = p0..p3
-        psllw       xmm3,       2                       ;
+        ;load current and next 2 rows
+        movdqu      xmm0,       XMMWORD PTR [rsi]
+        movdqu      xmm1,       XMMWORD PTR [rsi + rax]
+        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]
 
-        movq        xmm5,       QWORD PTR [rsi + rax]   ; mm4 = r1 p0..p7
-        punpcklbw   xmm5,       xmm0                    ; mm5 = r1 p0..p3
-        paddusw     xmm3,       xmm5                    ; mm3 += mm6
+        FIRST_2_ROWS
 
-        ; thresholding
-        movdqa      xmm7,       xmm1                    ; mm7 = r0 p0..p3
-        psubusw     xmm7,       xmm5                    ; mm7 = r0 p0..p3 - r1 p0..p3
-        psubusw     xmm5,       xmm1                    ; mm5 = r1 p0..p3 - r0 p0..p3
-        paddusw     xmm7,       xmm5                    ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
-        pcmpgtw     xmm7,       xmm2
-
-        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7
-        punpcklbw   xmm5,       xmm0                    ; mm5 = r2 p0..p3
-        paddusw     xmm3,       xmm5                    ; mm3 += mm5
-
-        ; thresholding
-        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
-        psubusw     xmm6,       xmm5                    ; mm6 = r0 p0..p3 - r2 p0..p3
-        psubusw     xmm5,       xmm1                    ; mm5 = r2 p0..p3 - r2 p0..p3
-        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6                    ; accumulate thresholds
-
-
+        ;load above 2 rows
         neg         rax
-        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; mm4 = r-2 p0..p7
-        punpcklbw   xmm5,       xmm0                    ; mm5 = r-2 p0..p3
-        paddusw     xmm3,       xmm5                    ; mm3 += mm5
+        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
+        movdqu      xmm3,       XMMWORD PTR [rsi + rax]
 
-        ; thresholding
-        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
-        psubusw     xmm6,       xmm5                    ; mm6 = p0..p3 - r-2 p0..p3
-        psubusw     xmm5,       xmm1                    ; mm5 = r-2 p0..p3 - p0..p3
-        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6                    ; accumulate thresholds
+        SECOND_2_ROWS
 
-        movq        xmm4,       QWORD PTR [rsi+rax]     ; mm4 = r-1 p0..p7
-        punpcklbw   xmm4,       xmm0                    ; mm4 = r-1 p0..p3
-        paddusw     xmm3,       xmm4                    ; mm3 += mm5
+        movdqu      XMMWORD PTR [rdi], xmm0
 
-        ; thresholding
-        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
-        psubusw     xmm6,       xmm4                    ; mm6 = p0..p3 - r-2 p0..p3
-        psubusw     xmm4,       xmm1                    ; mm5 = r-1 p0..p3 - p0..p3
-        paddusw     xmm6,       xmm4                    ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6                    ; accumulate thresholds
+        neg         rax                          ; positive stride
+        add         rsi,        16
+        add         rdi,        16
 
+        UPDATE_FLIMIT
 
-        paddusw     xmm3,       RD42                    ; mm3 += round value
-        psraw       xmm3,       3                       ; mm3 /= 8
-
-        pand        xmm1,       xmm7                    ; mm1 select vals > thresh from source
-        pandn       xmm7,       xmm3                    ; mm7 select vals < thresh from blurred result
-        paddusw     xmm1,       xmm7                    ; combination
-
-        packuswb    xmm1,       xmm0                    ; pack to bytes
-        movq        QWORD PTR [rdi], xmm1             ;
-
-        neg         rax                   ; pitch is positive
-        add         rsi,        8
-        add         rdi,        8
-
-        add         rdx,        8
-        cmp         edx,        dword arg(5) ;cols
-
+        add         rdx,        16
+        cmp         edx,        dword arg(4)     ;cols
         jl          .nextcol
 
         ; done with the all cols, start the across filtering in place
@@ -139,6 +150,8 @@
         sub         rsi,        rdx
         sub         rdi,        rdx
 
+        mov         rbx,        arg(5) ; flimits
+        UPDATE_FLIMIT
 
         ; dup the first byte into the left border 8 times
         movq        mm1,   [rdi]
@@ -145,12 +158,11 @@
         punpcklbw   mm1,   mm1
         punpcklwd   mm1,   mm1
         punpckldq   mm1,   mm1
-
         mov         rdx,    -8
         movq        [rdi+rdx], mm1
 
         ; dup the last byte into the right border
-        movsxd      rdx,    dword arg(5)
+        movsxd      rdx,    dword arg(4)
         movq        mm1,   [rdi + rdx + -1]
         punpcklbw   mm1,   mm1
         punpcklwd   mm1,   mm1
@@ -158,114 +170,64 @@
         movq        [rdi+rdx], mm1
 
         xor         rdx,        rdx
-        movq        mm0,        QWORD PTR [rdi-8];
+        movq        mm0,        QWORD PTR [rdi-16];
+        movq        mm1,        QWORD PTR [rdi-8];
 
 .acrossnextcol:
-        movq        xmm7,       QWORD PTR [rdi +rdx -2]
-        movd        xmm4,       DWORD PTR [rdi +rdx +6]
+        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
+        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
+        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]
 
-        pslldq      xmm4,       8
-        por         xmm4,       xmm7
+        FIRST_2_ROWS
 
-        movdqa      xmm3,       xmm4
-        psrldq      xmm3,       2
-        punpcklbw   xmm3,       xmm0              ; mm3 = p0..p3
-        movdqa      xmm1,       xmm3              ; mm1 = p0..p3
-        psllw       xmm3,       2
+        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
+        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]
 
+        SECOND_2_ROWS
 
-        movdqa      xmm5,       xmm4
-        psrldq      xmm5,       3
-        punpcklbw   xmm5,       xmm0              ; mm5 = p1..p4
-        paddusw     xmm3,       xmm5              ; mm3 += mm6
+        movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
+        movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
+        movdq2q     mm0,        xmm0
+        psrldq      xmm0,       8
+        movdq2q     mm1,        xmm0
 
-        ; thresholding
-        movdqa      xmm7,       xmm1              ; mm7 = p0..p3
-        psubusw     xmm7,       xmm5              ; mm7 = p0..p3 - p1..p4
-        psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     xmm7,       xmm5              ; mm7 = abs(p0..p3 - p1..p4)
-        pcmpgtw     xmm7,       xmm2
+        UPDATE_FLIMIT
 
-        movdqa      xmm5,       xmm4
-        psrldq      xmm5,       4
-        punpcklbw   xmm5,       xmm0              ; mm5 = p2..p5
-        paddusw     xmm3,       xmm5              ; mm3 += mm5
-
-        ; thresholding
-        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
-        psubusw     xmm6,       xmm5              ; mm6 = p0..p3 - p1..p4
-        psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     xmm6,       xmm5              ; mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6              ; accumulate thresholds
-
-
-        movdqa      xmm5,       xmm4              ; mm5 = p-2..p5
-        punpcklbw   xmm5,       xmm0              ; mm5 = p-2..p1
-        paddusw     xmm3,       xmm5              ; mm3 += mm5
-
-        ; thresholding
-        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
-        psubusw     xmm6,       xmm5              ; mm6 = p0..p3 - p1..p4
-        psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     xmm6,       xmm5              ; mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6              ; accumulate thresholds
-
-        psrldq      xmm4,       1                   ; mm4 = p-1..p5
-        punpcklbw   xmm4,       xmm0              ; mm4 = p-1..p2
-        paddusw     xmm3,       xmm4              ; mm3 += mm5
-
-        ; thresholding
-        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
-        psubusw     xmm6,       xmm4              ; mm6 = p0..p3 - p1..p4
-        psubusw     xmm4,       xmm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     xmm6,       xmm4              ; mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6              ; accumulate thresholds
-
-        paddusw     xmm3,       RD42              ; mm3 += round value
-        psraw       xmm3,       3                 ; mm3 /= 8
-
-        pand        xmm1,       xmm7              ; mm1 select vals > thresh from source
-        pandn       xmm7,       xmm3              ; mm7 select vals < thresh from blurred result
-        paddusw     xmm1,       xmm7              ; combination
-
-        packuswb    xmm1,       xmm0              ; pack to bytes
-        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store previous four bytes
-        movdq2q     mm0,        xmm1
-
-        add         rdx,        8
-        cmp         edx,        dword arg(5) ;cols
+        add         rdx,        16
+        cmp         edx,        dword arg(4)     ;cols
         jl          .acrossnextcol;
 
-        ; last 8 pixels
-        movq        QWORD PTR [rdi+rdx-8],  mm0
+        ; last 16 pixels
+        movq        QWORD PTR [rdi+rdx-16], mm0
 
+        cmp         edx,        dword arg(4)
+        jne         .throw_last_8
+        movq        QWORD PTR [rdi+rdx-8], mm1
+.throw_last_8:
         ; done with this rwo
-        add         rsi,rax               ; next line
-        mov         eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?
-        add         rdi,rax               ; next destination
-        mov         eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
+        add         rsi,rax                      ;next src line
+        mov         eax, dword arg(3)            ;dst_pixels_per_line
+        add         rdi,rax                      ;next destination
+        mov         eax, dword arg(2)            ;src_pixels_per_line
 
-        dec         rcx                   ; decrement count
-        jnz         .nextrow              ; next row
+        mov         rbx,        arg(5)           ;flimits
+        UPDATE_FLIMIT
 
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-    add rsp,16
+        dec         rcx                          ;decrement count
+        jnz         .nextrow                     ;next row
+
+    add rsp, 16
     pop rsp
-%endif
     ; begin epilog
     pop rdi
     pop rsi
-    RESTORE_GOT
+    pop rbx
     RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
-%undef RD42
+%undef flimit
 
-
 ;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
 ;                            int pitch, int rows, int cols,int flimit)
 extern sym(vp8_rv)
@@ -753,7 +715,5 @@
 
 SECTION_RODATA
 align 16
-rd42:
-    times 8 dw 0x04
 four8s:
     times 4 dd 8
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -5301,7 +5301,7 @@
                     double frame_psnr2, frame_ssim2 = 0;
                     double weight = 0;
 
-                    vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0);
+                    vp8_deblock(cm, cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0);
                     vp8_clear_system_state();
 
                     ye = calc_plane_error(orig->y_buffer, orig->y_stride,