shithub: libvpx

Download patch

ref: 727476adfe13c45ad8bcc9d00278c95d852c0b7a
parent: 18ef62d40c3267729adc9b8005f598e15bbf8a74
parent: d3e9409bb07e6411ff867935883bd5d56d2f9041
author: John Koleszar <jkoleszar@google.com>
date: Fri Jan 21 19:05:13 EST 2011

Merge remote branch 'internal/upstream' into HEAD

--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -1316,6 +1316,43 @@
 {
 }
 
+// This function gives and estimate of how badly we believe
+// the predicition quality is decaying from frame to frame.
+double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+{
+    double prediction_decay_rate;
+    double motion_decay;
+    double motion_pct = next_frame->pcnt_motion;
+
+
+    // Initial basis is the % mbs inter coded
+    prediction_decay_rate = next_frame->pcnt_inter;
+
+    // High % motion -> somewhat higher decay rate
+    motion_decay = (1.0 - (motion_pct / 20.0));
+    if (motion_decay < prediction_decay_rate)
+        prediction_decay_rate = motion_decay;
+
+    // Adjustment to decay rate based on speed of motion
+    {
+        double this_mv_rabs;
+        double this_mv_cabs;
+        double distance_factor;
+
+        this_mv_rabs = fabs(next_frame->mvr_abs * motion_pct);
+        this_mv_cabs = fabs(next_frame->mvc_abs * motion_pct);
+
+        distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
+                               (this_mv_cabs * this_mv_cabs)) / 250.0;
+        distance_factor = ((distance_factor > 1.0)
+                                ? 0.0 : (1.0 - distance_factor));
+        if (distance_factor < prediction_decay_rate)
+            prediction_decay_rate = distance_factor;
+    }
+
+    return prediction_decay_rate;
+}
+
 // Analyse and define a gf/arf group .
 static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 {
@@ -1468,36 +1505,11 @@
         if (r > GF_RMAX)
             r = GF_RMAX;
 
-        // Adjust loop decay rate
-        //if ( next_frame.pcnt_inter < loop_decay_rate )
-        loop_decay_rate = next_frame.pcnt_inter;
+        loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame);
 
-        // High % motion -> somewhat higher decay rate
-        motion_decay = (1.0 - (motion_pct / 20.0));
-        if (motion_decay < loop_decay_rate)
-            loop_decay_rate = motion_decay;
-
-        // Adjustment to decay rate based on speed of motion
-        {
-            double this_mv_rabs;
-            double this_mv_cabs;
-            double distance_factor;
-
-            this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct);
-            this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct);
-
-            distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
-                                   (this_mv_cabs * this_mv_cabs)) / 250.0;
-            distance_factor = ((distance_factor > 1.0)
-                                    ? 0.0 : (1.0 - distance_factor));
-            if (distance_factor < loop_decay_rate)
-                loop_decay_rate = distance_factor;
-        }
-
         // Cumulative effect of decay
         decay_accumulator = decay_accumulator * loop_decay_rate;
         decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
-        //decay_accumulator = ( loop_decay_rate < decay_accumulator ) ? loop_decay_rate : decay_accumulator;
 
         boost_score += (decay_accumulator * r);
 
@@ -1508,11 +1520,42 @@
              (loop_decay_rate >= 0.999) &&
              (decay_accumulator < 0.9) )
         {
-             // Force GF not alt ref
-             allow_alt_ref = FALSE;
+            int j;
+            FIRSTPASS_STATS * position = cpi->stats_in;
+            FIRSTPASS_STATS tmp_next_frame;
+            double decay_rate;
 
-             boost_score = old_boost_score;
-             break;
+            // Look ahead a few frames to see if static condition
+            // persists...
+            for ( j = 0; j < 4; j++ )
+            {
+                if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
+                    break;
+
+                decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame);
+                if ( decay_rate < 0.999 )
+                    break;
+            }
+            reset_fpf_position(cpi, position);            // Reset file position
+
+            // Force GF not alt ref
+            if ( j == 4 )
+            {
+                if (0)
+                {
+                    FILE *f = fopen("fadegf.stt", "a");
+                    fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n",
+                         cpi->common.current_video_frame+i, i,
+                         loop_decay_rate, decay_accumulator,
+                         boost_score );
+                    fclose(f);
+                }
+
+                allow_alt_ref = FALSE;
+
+                boost_score = old_boost_score;
+                break;
+            }
         }
 
         // Break out conditions.
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -493,8 +493,8 @@
 ;    unsigned char *src_ptr,
 ;    int src_pixels_per_line,
 ;    unsigned int Height,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
+;    int  xoffset,
+;    int  yoffset,
 ;    int *sum,
 ;    unsigned int *sumsquared;;
 ;
@@ -504,68 +504,80 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM
     GET_GOT     rbx
     push rsi
     push rdi
-    sub         rsp, 16
+    push rbx
     ; end prolog
 
         pxor            xmm6,           xmm6                 ;
         pxor            xmm7,           xmm7                 ;
-        mov             rax,            arg(5) ;HFilter             ;
 
-        mov             rdx,            arg(6) ;VFilter             ;
-        mov             rsi,            arg(0) ;ref_ptr              ;
+        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
+        movdqa          xmm4,           XMMWORD PTR [rsi]
 
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        lea             rcx,            [GLOBAL(vp8_bilinear_filters_sse2)]
+        movsxd          rax,            dword ptr arg(5)     ; xoffset
 
+        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
+        je              filter_block2d_bil_var_sse2_sp_only
+
+        shl             rax,            5                    ; point to filter coeff with xoffset
+        lea             rax,            [rax + rcx]          ; HFilter
+
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
+        je              filter_block2d_bil_var_sse2_fp_only
+
+        shl             rdx,            5
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+
         pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]               ;
+        movq            xmm1,           QWORD PTR [rsi]      ;
+        movq            xmm3,           QWORD PTR [rsi+1]    ;
 
-        movq            xmm3,           QWORD PTR [rsi+1]        ;
         punpcklbw       xmm1,           xmm0                 ;
-
-        pmullw          xmm1,           [rax]               ;
+        pmullw          xmm1,           [rax]                ;
         punpcklbw       xmm3,           xmm0
-            ;
         pmullw          xmm3,           [rax+16]             ;
-        paddw           xmm1,           xmm3                 ;
 
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift     ;
         movdqa          xmm5,           xmm1
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rsi,            r8
+
+        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
+        lea             rsi,            [rsi + rbx]
+%if ABI_IS_32BIT=0
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
 %endif
-filter_block2d_bil_var_sse2_loop:
 
+filter_block2d_bil_var_sse2_loop:
         movq            xmm1,           QWORD PTR [rsi]               ;
         movq            xmm3,           QWORD PTR [rsi+1]             ;
 
         punpcklbw       xmm1,           xmm0                 ;
         pmullw          xmm1,           [rax]               ;
-
         punpcklbw       xmm3,           xmm0                 ;
         pmullw          xmm3,           [rax+16]             ;
 
         paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
-
+        paddw           xmm1,           xmm4               ;
         psraw           xmm1,           xmm_filter_shift    ;
-        movdqa          xmm3,           xmm5                 ;
 
+        movdqa          xmm3,           xmm5                 ;
         movdqa          xmm5,           xmm1                 ;
-        pmullw          xmm3,           [rdx]               ;
 
+        pmullw          xmm3,           [rdx]               ;
         pmullw          xmm1,           [rdx+16]             ;
         paddw           xmm1,           xmm3                 ;
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
+        paddw           xmm1,           xmm4                 ;
         psraw           xmm1,           xmm_filter_shift    ;
 
         movq            xmm3,           QWORD PTR [rdi]               ;
@@ -577,20 +589,103 @@
         pmaddwd         xmm1,           xmm1                 ;
         paddd           xmm7,           xmm1                 ;
 
+        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
 %if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
 %else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
-        add             rsi,            r8
-        add             rdi,            r9
+        lea             rdi,            [rdi + r9]
 %endif
 
         sub             rcx,            1                   ;
         jnz             filter_block2d_bil_var_sse2_loop       ;
 
+        jmp             filter_block2d_bil_variance
 
+filter_block2d_bil_var_sse2_sp_only:
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+        shl             rdx,            5
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                 ;
+        movq            xmm1,           QWORD PTR [rsi]      ;
+        punpcklbw       xmm1,           xmm0                 ;
+
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        lea             rsi,            [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+        movq            xmm3,           QWORD PTR [rsi]             ;
+        punpcklbw       xmm3,           xmm0                 ;
+        movdqa          xmm5,           xmm3
+
+        pmullw          xmm1,           [rdx]               ;
+        pmullw          xmm3,           [rdx+16]             ;
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+        movdqa          xmm1,           xmm5                 ;
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_sp_only_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_fp_only:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                 ;
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+
+filter_block2d_bil_fp_only_loop:
+        movq            xmm1,           QWORD PTR [rsi]       ;
+        movq            xmm3,           QWORD PTR [rsi+1]     ;
+
+        punpcklbw       xmm1,           xmm0                 ;
+        pmullw          xmm1,           [rax]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+        pmullw          xmm3,           [rax+16]             ;
+
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4  ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]     ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+        lea             rsi,            [rsi + rdx]
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_fp_only_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_variance:
         movdq2q         mm6,            xmm6                ;
         movdq2q         mm7,            xmm7                ;
 
@@ -627,12 +722,12 @@
         movd            [rsi],          mm2    ; xsum
         movd            [rdi],          mm4    ; xxsum
 
-
     ; begin epilog
-    add rsp, 16
+    pop rbx
     pop rdi
     pop rsi
     RESTORE_GOT
+    RESTORE_XMM
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -974,3 +1069,13 @@
 align 16
 xmm_bi_rd:
     times 8 dw 64
+align 16
+vp8_bilinear_filters_sse2:
+    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
+    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
+    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
+    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
+    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
+    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
+    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -76,8 +76,8 @@
     const unsigned char *src_ptr,
     int src_pixels_per_line,
     unsigned int Height,
-    const short *HFilter,
-    const short *VFilter,
+    int  xoffset,
+    int  yoffset,
     int *sum,
     unsigned int *sumsquared
 );
@@ -222,21 +222,6 @@
 
 }
 
-///////////////////////////////////////////////////////////////////////////
-// the mmx function that does the bilinear filtering and var calculation //
-// int one pass                                                          //
-///////////////////////////////////////////////////////////////////////////
-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
-{
-    { 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0 },
-    { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
-    {  96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
-    {  80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
-    {  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-    {  48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
-    {  32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
-    {  16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
-};
 unsigned int vp8_sub_pixel_variance4x4_wmt
 (
     const unsigned char  *src_ptr,
@@ -272,16 +257,39 @@
     unsigned int *sse
 )
 {
-
     int xsum;
     unsigned int xxsum;
-    vp8_filter_block2d_bil_var_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
-        &xsum, &xxsum
-    );
 
+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum, &xxsum);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum, &xxsum);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum, &xxsum);
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            xoffset, yoffset,
+            &xsum, &xxsum);
+    }
+
     *sse = xxsum;
     return (xxsum - ((xsum * xsum) >> 6));
 }
@@ -344,7 +352,7 @@
         vp8_filter_block2d_bil_var_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
-            vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+            xoffset, yoffset,
             &xsum0, &xxsum0
         );
 
@@ -352,7 +360,7 @@
         vp8_filter_block2d_bil_var_sse2(
             src_ptr + 8, src_pixels_per_line,
             dst_ptr + 8, dst_pixels_per_line, 16,
-            vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+            xoffset, yoffset,
             &xsum1, &xxsum1
         );
     }
@@ -392,22 +400,57 @@
     int xsum0, xsum1;
     unsigned int xxsum0, xxsum1;
 
+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);
 
-    vp8_filter_block2d_bil_var_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
-        &xsum0, &xxsum0
-    );
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 8,
+            &xsum1, &xxsum1);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);
 
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 8,
+            &xsum1, &xxsum1);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);
 
-    vp8_filter_block2d_bil_var_sse2(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
-        &xsum1, &xxsum1
-    );
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 8,
+            &xsum1, &xxsum1);
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            xoffset, yoffset,
+            &xsum0, &xxsum0);
 
+        vp8_filter_block2d_bil_var_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 8,
+            xoffset, yoffset,
+            &xsum1, &xxsum1);
+    }
+
     xsum0 += xsum1;
     xxsum0 += xxsum1;
 
@@ -428,12 +471,36 @@
 {
     int xsum;
     unsigned int xxsum;
-    vp8_filter_block2d_bil_var_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 16,
-        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
-        &xsum, &xxsum
-    );
+
+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum, &xxsum);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum, &xxsum);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum, &xxsum);
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            xoffset, yoffset,
+            &xsum, &xxsum);
+    }
 
     *sse = xxsum;
     return (xxsum - ((xsum * xsum) >> 7));