shithub: libvpx

Download patch

ref: dc29ed27bd298eb84767941f0f13b731395720e6
parent: 016fb2b554e1cbd3eda9ad57a093ae7f8336ee97
parent: a0306ea660b2a35d09645e6d3e98d786614a874d
author: John Koleszar <jkoleszar@google.com>
date: Wed Mar 9 19:05:06 EST 2011

Merge remote branch 'origin/master' into experimental

Change-Id: Icb795cef47a205f33f180f3852d88c36113b673e

--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -153,7 +153,7 @@
 #
 obj_int_extract: build/make/obj_int_extract.c
 	$(if $(quiet),echo "    [HOSTCC] $@")
-	$(qexec)$(HOSTCC) -I. -o $@ $<
+	$(qexec)$(HOSTCC) -I. -I$(SRC_PATH_BARE) -o $@ $<
 CLEAN-OBJS += obj_int_extract
 
 #
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c
@@ -14,7 +14,7 @@
 
 #include "vpx_config.h"
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(__MINGW32__)
 #include <io.h>
 #include <share.h>
 #include "vpx/vpx_integer.h"
@@ -816,7 +816,7 @@
 #endif
 
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(__MINGW32__)
 /*  See "Microsoft Portable Executable and Common Object File Format Specification"
     for reference.
 */
@@ -830,7 +830,6 @@
     unsigned int i;
     unsigned __int8 *ptr;
     unsigned __int32 symoffset;
-    FILE *fp;
 
     char **sectionlist;  //this array holds all section names in their correct order.
     //it is used to check if the symbol is in .bss or .data section.
@@ -871,14 +870,6 @@
     //log_msg("COFF: Symbol table at offset %u\n", symtab_ptr);
     //log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr);
 
-    fp = fopen("assembly_offsets.asm", "w");
-
-    if (fp == NULL)
-    {
-        perror("open file");
-        goto bail;
-    }
-
     /*  The compiler puts the data with non-zero offset in .data section, but puts the data with
         zero offset in .bss section. So, if the data in in .bss section, set offset=0.
         Note from Wiki: In an object module compiled from C, the bss section contains
@@ -912,13 +903,13 @@
                 char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
                 strncpy(name, ptr, 8);
                 //log_msg("COFF: Parsing symbol %s\n",name);
-                fprintf(fp, "%-40s EQU ", name);
+                printf("%-40s EQU ", name + 1);
             }
             else
             {
                 //log_msg("COFF: Parsing symbol %s\n",
                 //        buf + strtab_ptr + get_le32(ptr+4));
-                fprintf(fp, "%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4));
+                printf("%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4) + 1);
             }
 
             if (!(strcmp(sectionlist[section-1], ".bss")))
@@ -935,14 +926,13 @@
             //log_msg("      Address: %u\n",get_le32(ptr+8));
             //log_msg("      Offset: %u\n", symoffset);
 
-            fprintf(fp, "%5d\n", symoffset);
+            printf("%5d\n", symoffset);
         }
 
         ptr += 18;
     }
 
-    fprintf(fp, "    END\n");
-    fclose(fp);
+    printf("    END\n");
 
     for (i = 0; i < nsections; i++)
     {
@@ -992,11 +982,7 @@
     else
         f = argv[1];
 
-    if (_sopen_s(&fd, f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE))
-    {
-        perror("Unable to open file");
-        goto bail;
-    }
+    fd = _sopen(f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE);
 
     if (_fstat(fd, &stat_buf))
     {
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1586,133 +1586,6 @@
         cpi->oxcf = *oxcf;
 
 
-    switch (cpi->oxcf.Mode)
-    {
-
-    case MODE_REALTIME:
-        cpi->pass = 0;
-        cpi->compressor_speed = 2;
-
-        if (cpi->oxcf.cpu_used < -16)
-        {
-            cpi->oxcf.cpu_used = -16;
-        }
-
-        if (cpi->oxcf.cpu_used > 16)
-            cpi->oxcf.cpu_used = 16;
-
-        break;
-
-#if !(CONFIG_REALTIME_ONLY)
-    case MODE_GOODQUALITY:
-        cpi->pass = 0;
-        cpi->compressor_speed = 1;
-
-        if (cpi->oxcf.cpu_used < -5)
-        {
-            cpi->oxcf.cpu_used = -5;
-        }
-
-        if (cpi->oxcf.cpu_used > 5)
-            cpi->oxcf.cpu_used = 5;
-
-        break;
-
-    case MODE_BESTQUALITY:
-        cpi->pass = 0;
-        cpi->compressor_speed = 0;
-        break;
-
-    case MODE_FIRSTPASS:
-        cpi->pass = 1;
-        cpi->compressor_speed = 1;
-        break;
-    case MODE_SECONDPASS:
-        cpi->pass = 2;
-        cpi->compressor_speed = 1;
-
-        if (cpi->oxcf.cpu_used < -5)
-        {
-            cpi->oxcf.cpu_used = -5;
-        }
-
-        if (cpi->oxcf.cpu_used > 5)
-            cpi->oxcf.cpu_used = 5;
-
-        break;
-    case MODE_SECONDPASS_BEST:
-        cpi->pass = 2;
-        cpi->compressor_speed = 0;
-        break;
-#endif
-    }
-
-    if (cpi->pass == 0)
-        cpi->auto_worst_q = 1;
-
-    cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
-    cpi->oxcf.best_allowed_q  = q_trans[oxcf->best_allowed_q];
-    cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
-
-    if (oxcf->fixed_q >= 0)
-    {
-        if (oxcf->worst_allowed_q < 0)
-            cpi->oxcf.fixed_q = q_trans[0];
-        else
-            cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q];
-
-        if (oxcf->alt_q < 0)
-            cpi->oxcf.alt_q = q_trans[0];
-        else
-            cpi->oxcf.alt_q = q_trans[oxcf->alt_q];
-
-        if (oxcf->key_q < 0)
-            cpi->oxcf.key_q = q_trans[0];
-        else
-            cpi->oxcf.key_q = q_trans[oxcf->key_q];
-
-        if (oxcf->gold_q < 0)
-            cpi->oxcf.gold_q = q_trans[0];
-        else
-            cpi->oxcf.gold_q = q_trans[oxcf->gold_q];
-
-    }
-
-    cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
-    cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
-
-    //cpi->use_golden_frame_only = 0;
-    //cpi->use_last_frame_only = 0;
-    cm->refresh_golden_frame = 0;
-    cm->refresh_last_frame = 1;
-    cm->refresh_entropy_probs = 1;
-
-    if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
-        cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;
-
-    setup_features(cpi);
-
-    {
-        int i;
-
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-            cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
-    }
-
-    // At the moment the first order values may not be > MAXQ
-    if (cpi->oxcf.fixed_q > MAXQ)
-        cpi->oxcf.fixed_q = MAXQ;
-
-    // local file playback mode == really big buffer
-    if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
-    {
-        cpi->oxcf.starting_buffer_level   = 60000;
-        cpi->oxcf.optimal_buffer_level    = 60000;
-        cpi->oxcf.maximum_buffer_size     = 240000;
-
-    }
-
-
     // Convert target bandwidth from Kbit/s to Bit/s
     cpi->oxcf.target_bandwidth       *= 1000;
     cpi->oxcf.starting_buffer_level =
@@ -1719,120 +1592,24 @@
         rescale(cpi->oxcf.starting_buffer_level,
                 cpi->oxcf.target_bandwidth, 1000);
 
-    if (cpi->oxcf.optimal_buffer_level == 0)
-        cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
-    else
-        cpi->oxcf.optimal_buffer_level =
-            rescale(cpi->oxcf.optimal_buffer_level,
-                    cpi->oxcf.target_bandwidth, 1000);
-
-    if (cpi->oxcf.maximum_buffer_size == 0)
-        cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
-    else
-        cpi->oxcf.maximum_buffer_size =
-            rescale(cpi->oxcf.maximum_buffer_size,
-                    cpi->oxcf.target_bandwidth, 1000);
-
-    cpi->buffer_level                = cpi->oxcf.starting_buffer_level;
+    cpi->buffer_level                 = cpi->oxcf.starting_buffer_level;
     cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
 
-    vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
-    cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
     cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
-    cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
-    cpi->best_quality                = cpi->oxcf.best_allowed_q;
     cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
-    cpi->cq_target_quality = cpi->oxcf.cq_level;
+    cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
 
-    cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
-
     cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;
     cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;
-    cpi->long_rolling_target_bits      = cpi->av_per_frame_bandwidth;
-    cpi->long_rolling_actual_bits      = cpi->av_per_frame_bandwidth;
+    cpi->long_rolling_target_bits     = cpi->av_per_frame_bandwidth;
+    cpi->long_rolling_actual_bits     = cpi->av_per_frame_bandwidth;
 
     cpi->total_actual_bits            = 0;
-    cpi->total_target_vs_actual        = 0;
+    cpi->total_target_vs_actual       = 0;
 
-    // Only allow dropped frames in buffered mode
-    cpi->drop_frames_allowed          = cpi->oxcf.allow_df && cpi->buffered_mode;
+    // change includes all joint functionality
+   vp8_change_config(ptr, oxcf);
 
-    cm->filter_type      = (LOOPFILTERTYPE) cpi->filter_type;
-
-    if (!cm->use_bilinear_mc_filter)
-        cm->mcomp_filter_type = SIXTAP;
-    else
-        cm->mcomp_filter_type = BILINEAR;
-
-    cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
-
-    cm->Width       = cpi->oxcf.Width     ;
-    cm->Height      = cpi->oxcf.Height    ;
-
-    cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8
-
-    cm->horiz_scale  = cpi->horiz_scale;
-    cm->vert_scale   = cpi->vert_scale ;
-
-    // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
-    if (cpi->oxcf.Sharpness > 7)
-        cpi->oxcf.Sharpness = 7;
-
-    cm->sharpness_level = cpi->oxcf.Sharpness;
-
-    if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL)
-    {
-        int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
-        int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
-
-        Scale2Ratio(cm->horiz_scale, &hr, &hs);
-        Scale2Ratio(cm->vert_scale, &vr, &vs);
-
-        // always go to the next whole number
-        cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
-        cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
-    }
-
-    if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width ||
-        ((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height ||
-        cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
-    {
-        alloc_raw_frame_buffers(cpi);
-        vp8_alloc_compressor_data(cpi);
-    }
-
-    // Clamp KF frame size to quarter of data rate
-    if (cpi->intra_frame_target > cpi->target_bandwidth >> 2)
-        cpi->intra_frame_target = cpi->target_bandwidth >> 2;
-
-    if (cpi->oxcf.fixed_q >= 0)
-    {
-        cpi->last_q[0] = cpi->oxcf.fixed_q;
-        cpi->last_q[1] = cpi->oxcf.fixed_q;
-    }
-
-    cpi->Speed = cpi->oxcf.cpu_used;
-
-    // force to allowlag to 0 if lag_in_frames is 0;
-    if (cpi->oxcf.lag_in_frames == 0)
-    {
-        cpi->oxcf.allow_lag = 0;
-    }
-    // Limit on lag buffers as these are not currently dynamically allocated
-    else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
-        cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
-
-    // YX Temp
-    cpi->last_alt_ref_sei    = -1;
-    cpi->is_src_frame_alt_ref = 0;
-    cpi->is_next_src_alt_ref = 0;
-
-#if 0
-    // Experimental RD Code
-    cpi->frame_distortion = 0;
-    cpi->last_frame_distortion = 0;
-#endif
-
 #if VP8_TEMPORAL_ALT_REF
 
     cpi->use_weighted_temporal_filter = 0;
@@ -1848,12 +1625,6 @@
 #endif
 }
 
-/*
- * This function needs more clean up, i.e. be more tuned torwards
- * change_config rather than init_config  !!!!!!!!!!!!!!!!
- * YX - 5/28/2009
- *
- */
 
 void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
 {
@@ -2004,10 +1775,6 @@
     // Convert target bandwidth from Kbit/s to Bit/s
     cpi->oxcf.target_bandwidth       *= 1000;
 
-    cpi->oxcf.starting_buffer_level =
-        rescale(cpi->oxcf.starting_buffer_level,
-                cpi->oxcf.target_bandwidth, 1000);
-
     if (cpi->oxcf.optimal_buffer_level == 0)
         cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
     else
@@ -2022,29 +1789,36 @@
             rescale(cpi->oxcf.maximum_buffer_size,
                     cpi->oxcf.target_bandwidth, 1000);
 
-    cpi->buffer_level                = cpi->oxcf.starting_buffer_level;
-    cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
-
     vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
     cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
-    cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
-    cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
     cpi->best_quality                = cpi->oxcf.best_allowed_q;
-    cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
+
+    // active values should only be modified if out of new range
+    if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q)
+    {
+      cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+    }
+    // less likely
+    else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q)
+    {
+      cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
+    }
+    if (cpi->active_best_quality < cpi->oxcf.best_allowed_q)
+    {
+      cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+    }
+    // less likely
+    else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q)
+    {
+      cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
+    }
+
     cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
 
     cpi->cq_target_quality = cpi->oxcf.cq_level;
 
-    cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;
-    cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;
-    cpi->long_rolling_target_bits      = cpi->av_per_frame_bandwidth;
-    cpi->long_rolling_actual_bits      = cpi->av_per_frame_bandwidth;
-
-    cpi->total_actual_bits            = 0;
-    cpi->total_target_vs_actual        = 0;
-
     // Only allow dropped frames in buffered mode
-    cpi->drop_frames_allowed          = cpi->oxcf.allow_df && cpi->buffered_mode;
+    cpi->drop_frames_allowed         = cpi->oxcf.allow_df && cpi->buffered_mode;
 
     cm->filter_type                  = (LOOPFILTERTYPE) cpi->filter_type;
 
@@ -3616,6 +3390,7 @@
     int drop_mark75 = drop_mark * 2 / 3;
     int drop_mark50 = drop_mark / 4;
     int drop_mark25 = drop_mark / 8;
+
 
     // Clear down mmx registers to allow floating point in what follows
     vp8_clear_system_state();
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -790,7 +790,7 @@
     ret
 
 
-;void vp8_half_horiz_vert_variance16x_h_sse2
+;void vp8_half_horiz_vert_variance8x_h_sse2
 ;(
 ;    unsigned char *ref_ptr,
 ;    int ref_pixels_per_line,
@@ -800,8 +800,8 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_horiz_vert_variance16x_h_sse2)
-sym(vp8_half_horiz_vert_variance16x_h_sse2):
+global sym(vp8_half_horiz_vert_variance8x_h_sse2)
+sym(vp8_half_horiz_vert_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -835,7 +835,7 @@
         add             rsi, r8
 %endif
 
-vp8_half_horiz_vert_variance16x_h_1:
+vp8_half_horiz_vert_variance8x_h_1:
 
         movq            xmm1,           QWORD PTR [rsi]     ;
         movq            xmm2,           QWORD PTR [rsi+1]   ;
@@ -863,7 +863,7 @@
 %endif
 
         sub             rcx,            1                   ;
-        jnz             vp8_half_horiz_vert_variance16x_h_1     ;
+        jnz             vp8_half_horiz_vert_variance8x_h_1     ;
 
         movdq2q         mm6,            xmm6                ;
         movdq2q         mm7,            xmm7                ;
@@ -910,8 +910,123 @@
     pop         rbp
     ret
 
+;void vp8_half_horiz_vert_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_vert_variance16x_h_sse2)
+sym(vp8_half_horiz_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
 
-;void vp8_half_vert_variance16x_h_sse2
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse eaccumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
+
+        lea             rsi,            [rsi + rax]
+
+vp8_half_horiz_vert_variance16x_h_1:
+        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
+        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm3) horizontal line i+1
+
+        pavgb           xmm5,           xmm1                ;  xmm = vertical average of the above
+
+        movdqa          xmm4,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+        punpckhbw       xmm4,           xmm0
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+
+        movq            xmm3,           QWORD PTR [rdi+8]
+        punpcklbw       xmm3,           xmm0
+        psubw           xmm4,           xmm3
+
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm4
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;
+        jnz             vp8_half_horiz_vert_variance16x_h_1     ;
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_half_vert_variance8x_h_sse2
 ;(
 ;    unsigned char *ref_ptr,
 ;    int ref_pixels_per_line,
@@ -921,8 +1036,8 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_vert_variance16x_h_sse2)
-sym(vp8_half_vert_variance16x_h_sse2):
+global sym(vp8_half_vert_variance8x_h_sse2)
+sym(vp8_half_vert_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -945,7 +1060,7 @@
         movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
 
         pxor            xmm0,           xmm0                ;
-vp8_half_vert_variance16x_h_1:
+vp8_half_vert_variance8x_h_1:
         movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
         movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = s1,s2,s3..s9
 
@@ -969,7 +1084,7 @@
 %endif
 
         sub             rcx,            1                   ;
-        jnz             vp8_half_vert_variance16x_h_1          ;
+        jnz             vp8_half_vert_variance8x_h_1          ;
 
         movdq2q         mm6,            xmm6                ;
         movdq2q         mm7,            xmm7                ;
@@ -1016,8 +1131,115 @@
     pop         rbp
     ret
 
+;void vp8_half_vert_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance16x_h_sse2)
+sym(vp8_half_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
 
-;void vp8_half_horiz_variance16x_h_sse2
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse eaccumulator
+        mov             rsi,            arg(0)              ;ref_ptr
+
+        mov             rdi,            arg(2)              ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)    ;Height
+        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]
+        lea             rsi,            [rsi + rax          ]
+        pxor            xmm0,           xmm0
+
+vp8_half_vert_variance16x_h_1:
+        movdqu          xmm3,           XMMWORD PTR [rsi]
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
+        movdqa          xmm4,           xmm5
+        punpcklbw       xmm5,           xmm0
+        punpckhbw       xmm4,           xmm0
+
+        movq            xmm2,           QWORD PTR [rdi]
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm5,           xmm2
+        movq            xmm2,           QWORD PTR [rdi+8]
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm4,           xmm2
+
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm4
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm3
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1
+        jnz             vp8_half_vert_variance16x_h_1
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_half_horiz_variance8x_h_sse2
 ;(
 ;    unsigned char *ref_ptr,
 ;    int ref_pixels_per_line,
@@ -1027,8 +1249,8 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_horiz_variance16x_h_sse2)
-sym(vp8_half_horiz_variance16x_h_sse2):
+global sym(vp8_half_horiz_variance8x_h_sse2)
+sym(vp8_half_horiz_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -1050,7 +1272,7 @@
         movsxd          rcx,            dword ptr arg(4) ;Height              ;
 
         pxor            xmm0,           xmm0                ;
-vp8_half_horiz_variance16x16_1:
+vp8_half_horiz_variance8x_h_1:
         movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
         movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s9
 
@@ -1073,7 +1295,7 @@
         add             rdi, r9
 %endif
         sub             rcx,            1                   ;
-        jnz             vp8_half_horiz_variance16x16_1        ;
+        jnz             vp8_half_horiz_variance8x_h_1        ;
 
         movdq2q         mm6,            xmm6                ;
         movdq2q         mm7,            xmm7                ;
@@ -1120,6 +1342,109 @@
     pop         rbp
     ret
 
+;void vp8_half_horiz_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance16x_h_sse2)
+sym(vp8_half_horiz_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse eaccumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+
+vp8_half_horiz_variance16x_h_1:
+        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
+        movdqa          xmm1,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+        punpckhbw       xmm1,           xmm0
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        movq            xmm2,           QWORD PTR [rdi+8]
+        punpcklbw       xmm2,           xmm0
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        psubw           xmm1,           xmm2
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm1
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm1,           xmm1
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm1
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;
+        jnz             vp8_half_horiz_variance16x_h_1        ;
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
 
 SECTION_RODATA
 ;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
--- a/vp8/encoder/x86/variance_mmx.c
+++ b/vp8/encoder/x86/variance_mmx.c
@@ -456,146 +456,6 @@
     return (xxsum - ((xsum * xsum) >> 7));
 }
 
-unsigned int vp8_i_variance16x16_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3, avg;
-
-
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
-    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    avg = sum0 + sum1 + sum2 + sum3;
-    *sse = var;
-    return (var - ((avg * avg) >> 8));
-
-}
-
-unsigned int vp8_i_variance8x16_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-
-    *sse = var;
-    return (var - ((avg * avg) >> 7));
-
-}
-
-unsigned int vp8_i_sub_pixel_variance16x16_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-    int f2soffset = (src_pixels_per_line >> 1);
-    int f2doffset = (dst_pixels_per_line >> 1);
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum0, &xxsum0
-    );
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + f2soffset, src_pixels_per_line,
-        dst_ptr + f2doffset, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + f2soffset + 8, src_pixels_per_line,
-        dst_ptr + f2doffset + 8, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-    *sse = xxsum0;
-    return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp8_i_sub_pixel_variance8x16_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-    int f2soffset = (src_pixels_per_line >> 1);
-    int f2doffset = (dst_pixels_per_line >> 1);
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum0, &xxsum0
-    );
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + f2soffset, src_pixels_per_line,
-        dst_ptr + f2doffset, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-    *sse = xxsum0;
-    return (xxsum0 - ((xsum0 * xsum0) >> 7));
-}
-
 
 unsigned int vp8_variance_halfpixvar16x16_h_mmx(
     const unsigned char *src_ptr,
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -81,6 +81,16 @@
     int *sum,
     unsigned int *sumsquared
 );
+void vp8_half_horiz_vert_variance8x_h_sse2
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
 void vp8_half_horiz_vert_variance16x_h_sse2
 (
     const unsigned char *ref_ptr,
@@ -91,6 +101,16 @@
     int *sum,
     unsigned int *sumsquared
 );
+void vp8_half_horiz_variance8x_h_sse2
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
 void vp8_half_horiz_variance16x_h_sse2
 (
     const unsigned char *ref_ptr,
@@ -101,6 +121,16 @@
     int *sum,
     unsigned int *sumsquared
 );
+void vp8_half_vert_variance8x_h_sse2
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
 void vp8_half_vert_variance16x_h_sse2
 (
     const unsigned char *ref_ptr,
@@ -262,7 +292,7 @@
 
     if (xoffset == 4 && yoffset == 0)
     {
-        vp8_half_horiz_variance16x_h_sse2(
+        vp8_half_horiz_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum, &xxsum);
@@ -269,7 +299,7 @@
     }
     else if (xoffset == 0 && yoffset == 4)
     {
-        vp8_half_vert_variance16x_h_sse2(
+        vp8_half_vert_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum, &xxsum);
@@ -276,7 +306,7 @@
     }
     else if (xoffset == 4 && yoffset == 4)
     {
-        vp8_half_horiz_vert_variance16x_h_sse2(
+        vp8_half_horiz_vert_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum, &xxsum);
@@ -317,11 +347,6 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum0, &xxsum0);
-
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 16,
-            &xsum1, &xxsum1);
     }
     else if (xoffset == 0 && yoffset == 4)
     {
@@ -329,11 +354,6 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum0, &xxsum0);
-
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 16,
-            &xsum1, &xxsum1);
     }
     else if (xoffset == 4 && yoffset == 4)
     {
@@ -341,11 +361,6 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum0, &xxsum0);
-
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 16,
-            &xsum1, &xxsum1);
     }
     else
     {
@@ -356,7 +371,6 @@
             &xsum0, &xxsum0
         );
 
-
         vp8_filter_block2d_bil_var_sse2(
             src_ptr + 8, src_pixels_per_line,
             dst_ptr + 8, dst_pixels_per_line, 16,
@@ -363,10 +377,10 @@
             xoffset, yoffset,
             &xsum1, &xxsum1
         );
+        xsum0 += xsum1;
+        xxsum0 += xxsum1;
     }
 
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 8));
 }
@@ -406,11 +420,6 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum0, &xxsum0);
-
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 8,
-            &xsum1, &xxsum1);
     }
     else if (xoffset == 0 && yoffset == 4)
     {
@@ -418,11 +427,6 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum0, &xxsum0);
-
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 8,
-            &xsum1, &xxsum1);
     }
     else if (xoffset == 4 && yoffset == 4)
     {
@@ -430,11 +434,6 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum0, &xxsum0);
-
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 8,
-            &xsum1, &xxsum1);
     }
     else
     {
@@ -449,11 +448,10 @@
             dst_ptr + 8, dst_pixels_per_line, 8,
             xoffset, yoffset,
             &xsum1, &xxsum1);
+        xsum0 += xsum1;
+        xxsum0 += xxsum1;
     }
 
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 7));
 }
@@ -474,7 +472,7 @@
 
     if (xoffset == 4 && yoffset == 0)
     {
-        vp8_half_horiz_variance16x_h_sse2(
+        vp8_half_horiz_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum, &xxsum);
@@ -481,7 +479,7 @@
     }
     else if (xoffset == 0 && yoffset == 4)
     {
-        vp8_half_vert_variance16x_h_sse2(
+        vp8_half_vert_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum, &xxsum);
@@ -488,7 +486,7 @@
     }
     else if (xoffset == 4 && yoffset == 4)
     {
-        vp8_half_horiz_vert_variance16x_h_sse2(
+        vp8_half_horiz_vert_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum, &xxsum);
@@ -506,82 +504,7 @@
     return (xxsum - ((xsum * xsum) >> 7));
 }
 
-unsigned int vp8_i_variance16x16_wmt(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3, avg;
 
-
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
-    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    avg = sum0 + sum1 + sum2 + sum3;
-
-    *sse = var;
-    return (var - ((avg * avg) >> 8));
-
-}
-
-unsigned int vp8_i_variance8x16_wmt(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-
-    *sse = var;
-    return (var - ((avg * avg) >> 7));
-
-}
-
-
-unsigned int vp8_i_sub_pixel_variance16x16_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
-}
-
-
-unsigned int vp8_i_sub_pixel_variance8x16_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-
-    return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
-}
-
-
 unsigned int vp8_variance_halfpixvar16x16_h_wmt(
     const unsigned char *src_ptr,
     int  src_pixels_per_line,
@@ -589,8 +512,8 @@
     int  dst_pixels_per_line,
     unsigned int *sse)
 {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
+    int xsum0;
+    unsigned int xxsum0;
 
     vp8_half_horiz_variance16x_h_sse2(
         src_ptr, src_pixels_per_line,
@@ -597,13 +520,6 @@
         dst_ptr, dst_pixels_per_line, 16,
         &xsum0, &xxsum0);
 
-    vp8_half_horiz_variance16x_h_sse2(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 16,
-        &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 8));
 }
@@ -616,21 +532,13 @@
     int  dst_pixels_per_line,
     unsigned int *sse)
 {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
+    int xsum0;
+    unsigned int xxsum0;
     vp8_half_vert_variance16x_h_sse2(
         src_ptr, src_pixels_per_line,
         dst_ptr, dst_pixels_per_line, 16,
         &xsum0, &xxsum0);
 
-    vp8_half_vert_variance16x_h_sse2(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 16,
-        &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 8));
 }
@@ -643,8 +551,8 @@
     int  dst_pixels_per_line,
     unsigned int *sse)
 {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
+    int xsum0;
+    unsigned int xxsum0;
 
     vp8_half_horiz_vert_variance16x_h_sse2(
         src_ptr, src_pixels_per_line,
@@ -651,13 +559,6 @@
         dst_ptr, dst_pixels_per_line, 16,
         &xsum0, &xxsum0);
 
-    vp8_half_horiz_vert_variance16x_h_sse2(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 16,
-        &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 8));
 }
--- a/vp8/encoder/x86/variance_ssse3.c
+++ b/vp8/encoder/x86/variance_ssse3.c
@@ -76,8 +76,8 @@
     unsigned int *sse
 )
 {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
+    int xsum0;
+    unsigned int xxsum0;
 
     // note we could avoid these if statements if the calling function
     // just called the appropriate functions inside.
@@ -87,14 +87,6 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum0, &xxsum0);
-
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 16,
-            &xsum1, &xxsum1);
-
-        xsum0 += xsum1;
-        xxsum0 += xxsum1;
     }
     else if (xoffset == 0 && yoffset == 4)
     {
@@ -102,14 +94,6 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum0, &xxsum0);
-
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 16,
-            &xsum1, &xxsum1);
-
-        xsum0 += xsum1;
-        xxsum0 += xxsum1;
     }
     else if (xoffset == 4 && yoffset == 4)
     {
@@ -117,24 +101,65 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum0, &xxsum0);
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_ssse3(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            xoffset, yoffset,
+            &xsum0, &xxsum0);
+    }
 
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 16,
-            &xsum1, &xxsum1);
+    *sse = xxsum0;
+    return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
 
-        xsum0 += xsum1;
-        xxsum0 += xxsum1;
+unsigned int vp8_sub_pixel_variance16x8_ssse3
+(
+    const unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+
+)
+{
+    int xsum0;
+    unsigned int xxsum0;
+
+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);
     }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);
+    }
     else
     {
-      vp8_filter_block2d_bil_var_ssse3(
-          src_ptr, src_pixels_per_line,
-          dst_ptr, dst_pixels_per_line, 16,
-          xoffset, yoffset,
-          &xsum0, &xxsum0);
+        vp8_filter_block2d_bil_var_ssse3(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            xoffset, yoffset,
+            &xsum0, &xxsum0);
     }
 
     *sse = xxsum0;
-    return (xxsum0 - ((xsum0 * xsum0) >> 8));
+    return (xxsum0 - ((xsum0 * xsum0) >> 7));
 }
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -286,6 +286,7 @@
 #if HAVE_SSSE3
 extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);
 extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_ssse3);
 extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
@@ -294,6 +295,9 @@
 
 #undef  vp8_variance_sad16x8x3
 #define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3
+
+#undef  vp8_variance_subpixvar16x8
+#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_ssse3
 
 #undef  vp8_variance_subpixvar16x16
 #define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -334,6 +334,7 @@
         cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_ssse3;
         cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_ssse3;
 
+        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_ssse3;
         cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_ssse3;
 
         cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_ssse3;
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -24,6 +24,7 @@
 VP8_COMMON_SRCS-yes += common/entropymv.c
 VP8_COMMON_SRCS-yes += common/extend.c
 VP8_COMMON_SRCS-yes += common/filter.c
+VP8_COMMON_SRCS-yes += common/filter.h
 VP8_COMMON_SRCS-yes += common/findnearmv.c
 VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
 VP8_COMMON_SRCS-yes += common/idctllm.c