shithub: libvpx

--- a/vp8/common/arm/armv6/bilinearfilter_v6.asm

+++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm

@@ -15,12 +15,12 @@

     AREA    |.text|, CODE, READONLY  ; name this block of code

 ;-------------------------------------

-; r0    unsigned char *src_ptr,

-; r1    unsigned short *output_ptr,

-; r2    unsigned int src_pixels_per_line,

-; r3    unsigned int output_height,

-; stack    unsigned int output_width,

-; stack    const short *vp8_filter

+; r0    unsigned char  *src_ptr,

+; r1    unsigned short *dst_ptr,

+; r2    unsigned int    src_pitch,

+; r3    unsigned int    height,

+; stack unsigned int    width,

+; stack const short    *vp8_filter

 ;-------------------------------------

 ; The output is transposed stroed in output array to make it easy for second pass filtering.

 |vp8_filter_block2d_bil_first_pass_armv6| PROC

@@ -27,7 +27,7 @@

     stmdb   sp!, {r4 - r11, lr}

     ldr     r11, [sp, #40]                  ; vp8_filter address

-    ldr     r4, [sp, #36]                   ; output width

+    ldr     r4, [sp, #36]                   ; width

     mov     r12, r3                         ; outer-loop counter

     sub     r2, r2, r4                      ; src increment for height loop

@@ -38,10 +38,10 @@

     ldr     r5, [r11]                       ; load up filter coefficients

-    mov     r3, r3, lsl #1                  ; output_height*2

+    mov     r3, r3, lsl #1                  ; height*2

     add     r3, r3, #2                      ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)

-    mov     r11, r1                         ; save output_ptr for each row

+    mov     r11, r1                         ; save dst_ptr for each row

     cmp     r5, #128                        ; if filter coef = 128, then skip the filter

     beq     bil_null_1st_filter

@@ -140,17 +140,17 @@

 ;---------------------------------

 ; r0    unsigned short *src_ptr,

-; r1    unsigned char *output_ptr,

-; r2    int output_pitch,

-; r3    unsigned int  output_height,

-; stack unsigned int  output_width,

-; stack const short *vp8_filter

+; r1    unsigned char  *dst_ptr,

+; r2    int             dst_pitch,

+; r3    unsigned int    height,

+; stack unsigned int    width,

+; stack const short    *vp8_filter

 ;---------------------------------

 |vp8_filter_block2d_bil_second_pass_armv6| PROC

     stmdb   sp!, {r4 - r11, lr}

     ldr     r11, [sp, #40]                  ; vp8_filter address

-    ldr     r4, [sp, #36]                   ; output width

+    ldr     r4, [sp, #36]                   ; width

     ldr     r5, [r11]                       ; load up filter coefficients

     mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix

--- a/vp8/common/arm/bilinearfilter_arm.c

+++ b/vp8/common/arm/bilinearfilter_arm.c

@@ -10,128 +10,48 @@

 #include <math.h>

+#include "filter.h"

 #include "subpixel.h"

-#define BLOCK_HEIGHT_WIDTH 4

-#define VP8_FILTER_WEIGHT 128

-#define VP8_FILTER_SHIFT  7

-static const short bilinear_filters[8][2] =

-{

-    { 128,   0 },

-    { 112,  16 },

-    {  96,  32 },

-    {  80,  48 },

-    {  64,  64 },

-    {  48,  80 },

-    {  32,  96 },

-    {  16, 112 }

-};

 extern void vp8_filter_block2d_bil_first_pass_armv6

-    unsigned char *src_ptr,

-    unsigned short *output_ptr,

-    unsigned int src_pixels_per_line,

-    unsigned int output_height,

-    unsigned int output_width,

-    const short *vp8_filter

+    unsigned char  *src_ptr,

+    unsigned short *dst_ptr,

+    unsigned int    src_pitch,

+    unsigned int    height,

+    unsigned int    width,

+    const short    *vp8_filter

);

 extern void vp8_filter_block2d_bil_second_pass_armv6

     unsigned short *src_ptr,

-    unsigned char  *output_ptr,

-    int output_pitch,

-    unsigned int  output_height,

-    unsigned int  output_width,

-    const short *vp8_filter

+    unsigned char  *dst_ptr,

+    int             dst_pitch,

+    unsigned int    height,

+    unsigned int    width,

+    const short    *vp8_filter

);

-#if 0

-void vp8_filter_block2d_bil_first_pass_6

-(

-    unsigned char *src_ptr,

-    unsigned short *output_ptr,

-    unsigned int src_pixels_per_line,

-    unsigned int output_height,

-    unsigned int output_width,

-    const short *vp8_filter

-)

-{

-    unsigned int i, j;

-    for ( i=0; i<output_height; i++ )

-    {

-        for ( j=0; j<output_width; j++ )

-        {

-            /* Apply bilinear filter */

-            output_ptr[j] = ( ( (int)src_ptr[0]          * vp8_filter[0]) +

-                               ((int)src_ptr[1] * vp8_filter[1]) +

-                                (VP8_FILTER_WEIGHT/2) ) >> VP8_FILTER_SHIFT;

-            src_ptr++;

-        }

-        /* Next row... */

-        src_ptr    += src_pixels_per_line - output_width;

-        output_ptr += output_width;

-    }

-}

-void vp8_filter_block2d_bil_second_pass_6

-(

-    unsigned short *src_ptr,

-    unsigned char  *output_ptr,

-    int output_pitch,

-    unsigned int  output_height,

-    unsigned int  output_width,

-    const short *vp8_filter

-)

-{

-    unsigned int  i,j;

-    int  Temp;

-    for ( i=0; i<output_height; i++ )

-    {

-        for ( j=0; j<output_width; j++ )

-        {

-            /* Apply filter */

-            Temp =  ((int)src_ptr[0]         * vp8_filter[0]) +

-                    ((int)src_ptr[output_width] * vp8_filter[1]) +

-                    (VP8_FILTER_WEIGHT/2);

-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);

-            src_ptr++;

-        }

-        /* Next row... */

-        /*src_ptr    += src_pixels_per_line - output_width;*/

-        output_ptr += output_pitch;

-    }

-}

-#endif

 void vp8_filter_block2d_bil_armv6

     unsigned char *src_ptr,

-    unsigned char *output_ptr,

-    unsigned int   src_pixels_per_line,

+    unsigned char *dst_ptr,

+    unsigned int   src_pitch,

     unsigned int   dst_pitch,

-    const short      *HFilter,

-    const short      *VFilter,

+    const short   *HFilter,

+    const short   *VFilter,

     int            Width,

     int            Height

+    unsigned short FData[36*16]; /* Temp data buffer used in filtering */

-    unsigned short FData[36*16]; /* Temp data bufffer used in filtering */

     /* First filter 1-D horizontally... */

-    /* pixel_step = 1; */

-    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pixels_per_line, Height + 1, Width, HFilter);

+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);

     /* then 1-D vertically... */

-    vp8_filter_block2d_bil_second_pass_armv6(FData, output_ptr, dst_pitch, Height, Width, VFilter);

+    vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);

@@ -148,8 +68,8 @@

     const short  *HFilter;

     const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];

-    VFilter = bilinear_filters[yoffset];

+    HFilter = vp8_bilinear_filters[xoffset];

+    VFilter = vp8_bilinear_filters[yoffset];

     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);

@@ -167,8 +87,8 @@

     const short  *HFilter;

     const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];

-    VFilter = bilinear_filters[yoffset];

+    HFilter = vp8_bilinear_filters[xoffset];

+    VFilter = vp8_bilinear_filters[yoffset];

     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);

@@ -186,8 +106,8 @@

     const short  *HFilter;

     const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];

-    VFilter = bilinear_filters[yoffset];

+    HFilter = vp8_bilinear_filters[xoffset];

+    VFilter = vp8_bilinear_filters[yoffset];

     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);

@@ -205,8 +125,8 @@

     const short  *HFilter;

     const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];

-    VFilter = bilinear_filters[yoffset];

+    HFilter = vp8_bilinear_filters[xoffset];

+    VFilter = vp8_bilinear_filters[yoffset];

     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);

--- a/vp8/common/arm/filter_arm.c

+++ b/vp8/common/arm/filter_arm.c

@@ -11,26 +11,10 @@

 #include "vpx_ports/config.h"

 #include <math.h>

+#include "filter.h"

 #include "subpixel.h"

 #include "vpx_ports/mem.h"

-#define BLOCK_HEIGHT_WIDTH 4

-#define VP8_FILTER_WEIGHT 128

-#define VP8_FILTER_SHIFT  7

-DECLARE_ALIGNED(16, static const short, sub_pel_filters[8][6]) =

-{

-    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */

-    { 0, -6,  123,   12,  -1,  0 },

-    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */

-    { 0, -9,   93,   50,  -6,  0 },

-    { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */

-    { 0, -6,   50,   93,  -9,  0 },

-    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */

-    { 0, -1,   12,  123,  -6,  0 },

-};

 extern void vp8_filter_block2d_first_pass_armv6

     unsigned char *src_ptr,

@@ -93,11 +77,11 @@

     const short  *HFilter;

     const short  *VFilter;

-    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data bufffer used in filtering */

+    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */

-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */

+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */

+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

     /* Vfilter is null. First pass only */

     if (xoffset && !yoffset)

@@ -129,47 +113,6 @@

-#if 0

-void vp8_sixtap_predict8x4_armv6

-(

-    unsigned char  *src_ptr,

-    int  src_pixels_per_line,

-    int  xoffset,

-    int  yoffset,

-    unsigned char *dst_ptr,

-    int  dst_pitch

-)

-{

-    const short  *HFilter;

-    const short  *VFilter;

-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */

-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */

-    /*if (xoffset && !yoffset)

-    {

-        vp8_filter_block2d_first_pass_only_armv6 (  src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter );

-    }*/

-    /* Hfilter is null. Second pass only */

-    /*else if (!xoffset && yoffset)

-    {

-        vp8_filter_block2d_second_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter );

-    }

-    else

-    {

-        if (yoffset & 0x1)

-            vp8_filter_block2d_first_pass_armv6 ( src_ptr-src_pixels_per_line, FData+1, src_pixels_per_line, 8, 7, HFilter );

-        else*/

-        vp8_filter_block2d_first_pass_armv6 ( src_ptr-(2*src_pixels_per_line), FData, src_pixels_per_line, 8, 9, HFilter );

-        vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, 8, VFilter );

-    /*}*/

-}

-#endif

 void vp8_sixtap_predict8x8_armv6

     unsigned char  *src_ptr,

@@ -182,10 +125,10 @@

     const short  *HFilter;

     const short  *VFilter;

-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */

+    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */

-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */

+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */

+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

     if (xoffset && !yoffset)

@@ -224,10 +167,10 @@

     const short  *HFilter;

     const short  *VFilter;

-    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data bufffer used in filtering */

+    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */

-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */

+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */

+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

     if (xoffset && !yoffset)

--- /dev/null

+++ b/vp8/common/filter.c

@@ -1,0 +1,520 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <stdlib.h>

+#include "filter.h"

+#include "vpx_ports/mem.h"

+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) = /* 2-tap bilinear kernels; the bilinear predictors in this file pick a row by xoffset / yoffset */

+{                   /* each row sums to 128, the value of VP8_FILTER_WEIGHT */

+    { 128,   0 },   /* row 0 puts all the weight on the first tap */

+    { 112,  16 },

+    {  96,  32 },

+    {  80,  48 },

+    {  64,  64 },

+    {  48,  80 },

+    {  32,  96 },

+    {  16, 112 }

+};

+DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) = /* 6-tap kernels; the six-tap predictors in this file pick a row by xoffset / yoffset */

+{                   /* each row sums to 128, the value of VP8_FILTER_WEIGHT */

+    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */

+    { 0, -6,  123,   12,  -1,  0 },

+    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */

+    { 0, -9,   93,   50,  -6,  0 },

+    { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */

+    { 0, -6,   50,   93,  -9,  0 },

+    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */

+    { 0, -1,   12,  123,  -6,  0 },

+};

+void vp8_filter_block2d_first_pass

+(

+    unsigned char *src_ptr,             /* source pixels; each sample reads taps from -2*pixel_step to +3*pixel_step around it */

+    int *output_ptr,                    /* receives output_height rows of output_width values, stored back to back */

+    unsigned int src_pixels_per_line,   /* distance between the starts of consecutive source rows */

+    unsigned int pixel_step,            /* spacing between filter taps in the source */

+    unsigned int output_height,         /* number of rows to produce */

+    unsigned int output_width,          /* number of values to produce per row */

+    const short *vp8_filter             /* the six filter taps */

+)

+{   /* First pass of the 6-tap filter: each sample is filtered, rounded,

+     * shifted down by VP8_FILTER_SHIFT and clamped to 0..255, then stored as an int.

+     * pixel_step is cast to int below so the negative tap offsets are not formed in unsigned arithmetic. */

+    unsigned int i, j;

+    int  Temp;

+    for (i = 0; i < output_height; i++)

+    {

+        for (j = 0; j < output_width; j++)

+        {

+            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +

+                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +

+                   ((int)src_ptr[0]                 * vp8_filter[2]) +

+                   ((int)src_ptr[pixel_step]         * vp8_filter[3]) +

+                   ((int)src_ptr[2*pixel_step]       * vp8_filter[4]) +

+                   ((int)src_ptr[3*pixel_step]       * vp8_filter[5]) +

+                   (VP8_FILTER_WEIGHT >> 1);      /* Rounding: half of the filter weight */

+            /* Normalize back to 0-255 */

+            Temp = Temp >> VP8_FILTER_SHIFT;

+            if (Temp < 0)

+                Temp = 0;

+            else if (Temp > 255)

+                Temp = 255;

+            output_ptr[j] = Temp;

+            src_ptr++;

+        }

+        /* Next row: src_ptr has already advanced output_width, so add the remainder of the source line */

+        src_ptr    += src_pixels_per_line - output_width;

+        output_ptr += output_width;

+    }

+}

+void vp8_filter_block2d_second_pass

+(

+    int *src_ptr,                       /* intermediate values; each sample reads taps from -2*pixel_step to +3*pixel_step around it */

+    unsigned char *output_ptr,          /* destination pixels */

+    int output_pitch,                   /* distance between the starts of consecutive destination rows */

+    unsigned int src_pixels_per_line,   /* distance, in ints, between the starts of consecutive rows of src_ptr */

+    unsigned int pixel_step,            /* spacing between filter taps; callers in this file pass the intermediate row width */

+    unsigned int output_height,         /* number of rows to produce */

+    unsigned int output_width,          /* number of pixels to produce per row */

+    const short *vp8_filter             /* the six filter taps */

+)

+{   /* Second pass of the 6-tap filter: each sample is filtered, rounded,

+     * shifted down by VP8_FILTER_SHIFT, clamped to 0..255 and stored as a byte.

+     * pixel_step is cast to int below so the negative tap offsets are not formed in unsigned arithmetic. */

+    unsigned int i, j;

+    int  Temp;

+    for (i = 0; i < output_height; i++)

+    {

+        for (j = 0; j < output_width; j++)

+        {

+            /* Apply filter */

+            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +

+                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +

+                   ((int)src_ptr[0]                 * vp8_filter[2]) +

+                   ((int)src_ptr[pixel_step]         * vp8_filter[3]) +

+                   ((int)src_ptr[2*pixel_step]       * vp8_filter[4]) +

+                   ((int)src_ptr[3*pixel_step]       * vp8_filter[5]) +

+                   (VP8_FILTER_WEIGHT >> 1);   /* Rounding: half of the filter weight */

+            /* Normalize back to 0-255 */

+            Temp = Temp >> VP8_FILTER_SHIFT;

+            if (Temp < 0)

+                Temp = 0;

+            else if (Temp > 255)

+                Temp = 255;

+            output_ptr[j] = (unsigned char)Temp;

+            src_ptr++;

+        }

+        /* Start next row: src_ptr has already advanced output_width within the current row */

+        src_ptr    += src_pixels_per_line - output_width;

+        output_ptr += output_pitch;

+    }

+}

+void vp8_filter_block2d

+(

+    unsigned char  *src_ptr,

+    unsigned char  *output_ptr,

+    unsigned int src_pixels_per_line,

+    int output_pitch,

+    const short  *HFilter,

+    const short  *VFilter

+)

+{   /* 6-tap filters a 4x4 block: horizontal pass into FData, then vertical pass into output_ptr. */

+    int FData[9*4]; /* Temp data buffer used in filtering: 9 rows of 4 values */

+    /* First filter 1-D horizontally, starting two source rows above the block so the vertical taps have the rows they need */

+    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);

+    /* then filter vertically; FData + 8 skips the two extra rows (2 * 4 values) produced above the block */

+    vp8_filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);

+}

+void vp8_block_variation_c

+(

+    unsigned char  *src_ptr,        /* top-left pixel of the 4x4 block */

+    int   src_pixels_per_line,      /* distance between the starts of consecutive rows */

+    int *HVar,                      /* accumulator for horizontal differences; added to, never reset here */

+    int *VVar                       /* accumulator for vertical differences; added to, never reset here */

+)

+{   /* Adds to *HVar and *VVar the absolute differences between each pixel of a

+     * 4x4 block and its right-hand and lower neighbours respectively.

+     * NOTE(review): the accumulators are not zeroed, so the caller presumably initialises them - confirm. */

+    int i, j;

+    unsigned char *Ptr = src_ptr;

+    for (i = 0; i < 4; i++)

+    {

+        for (j = 0; j < 4; j++)

+        {

+            *HVar += abs((int)Ptr[j] - (int)Ptr[j+1]);                      /* reads up to column 4, one past the block */

+            *VVar += abs((int)Ptr[j] - (int)Ptr[j+src_pixels_per_line]);    /* reads up to row 4, one past the block */

+        }

+        Ptr += src_pixels_per_line;

+    }

+}

+void vp8_sixtap_predict_c

+(

+    unsigned char  *src_ptr,

+    int   src_pixels_per_line,

+    int  xoffset,                   /* row index into vp8_sub_pel_filters for the horizontal taps */

+    int  yoffset,                   /* row index into vp8_sub_pel_filters for the vertical taps */

+    unsigned char *dst_ptr,

+    int dst_pitch

+)

+{   /* Looks up the two 6-tap kernels and hands the block to vp8_filter_block2d, which filters a 4x4 block. */

+    const short  *HFilter;

+    const short  *VFilter;

+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */

+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

+    vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);

+}

+void vp8_sixtap_predict8x8_c

+(

+    unsigned char  *src_ptr,

+    int  src_pixels_per_line,

+    int  xoffset,                   /* row index into vp8_sub_pel_filters for the horizontal taps */

+    int  yoffset,                   /* row index into vp8_sub_pel_filters for the vertical taps */

+    unsigned char *dst_ptr,

+    int  dst_pitch

+)

+{   /* 6-tap prediction of an 8x8 block: horizontal pass into FData, then vertical pass into dst_ptr. */

+    const short  *HFilter;

+    const short  *VFilter;

+    int FData[13*16];   /* Temp data buffer used in filtering; the first pass below writes 13 rows of 8 values */

+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */

+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

+    /* First filter 1-D horizontally, starting two source rows above the block */

+    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);

+    /* then filter vertically; FData + 16 skips the two extra rows (2 * 8 values) produced above the block */

+    vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);

+}

+void vp8_sixtap_predict8x4_c

+(

+    unsigned char  *src_ptr,

+    int  src_pixels_per_line,

+    int  xoffset,                   /* row index into vp8_sub_pel_filters for the horizontal taps */

+    int  yoffset,                   /* row index into vp8_sub_pel_filters for the vertical taps */

+    unsigned char *dst_ptr,

+    int  dst_pitch

+)

+{   /* 6-tap prediction of a block 8 wide and 4 high: horizontal pass into FData, then vertical pass into dst_ptr. */

+    const short  *HFilter;

+    const short  *VFilter;

+    int FData[13*16];   /* Temp data buffer used in filtering; the first pass below writes 9 rows of 8 values */

+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */

+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

+    /* First filter 1-D horizontally, starting two source rows above the block */

+    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);

+    /* then filter vertically; FData + 16 skips the two extra rows (2 * 8 values) produced above the block */

+    vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);

+}

+void vp8_sixtap_predict16x16_c

+(

+    unsigned char  *src_ptr,

+    int  src_pixels_per_line,

+    int  xoffset,                   /* row index into vp8_sub_pel_filters for the horizontal taps */

+    int  yoffset,                   /* row index into vp8_sub_pel_filters for the vertical taps */

+    unsigned char *dst_ptr,

+    int  dst_pitch

+)

+{   /* 6-tap prediction of a 16x16 block: horizontal pass into FData, then vertical pass into dst_ptr. */

+    const short  *HFilter;

+    const short  *VFilter;

+    int FData[21*24];   /* Temp data buffer used in filtering; the first pass below writes 21 rows of 16 values */

+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */

+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

+    /* First filter 1-D horizontally, starting two source rows above the block */

+    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);

+    /* then filter vertically; FData + 32 skips the two extra rows (2 * 16 values) produced above the block */

+    vp8_filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);

+}

+/****************************************************************************

+ *

+ *  ROUTINE       : vp8_filter_block2d_bil_first_pass

+ *

+ *  INPUTS        : unsigned char  *src_ptr    : Pointer to source block.

+ *                  unsigned int    src_stride : Stride of source block.

+ *                  unsigned int    height     : Number of rows to filter

+ *                                               (vp8_filter_block2d_bil passes Height + 1).

+ *                  unsigned int    width      : Block width.

+ *                  const short    *vp8_filter : Array of 2 bi-linear filter taps.

+ *

+ *  OUTPUTS       : unsigned short *dst_ptr    : Pointer to filtered block, stored as

+ *                                               height rows of width values back to back.

+ *

+ *  RETURNS       : void

+ *

+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block

+ *                  in the horizontal direction to produce the filtered output

+ *                  block. Used to implement first-pass of 2-D separable filter.

+ *

+ *  SPECIAL NOTES : Produces unsigned short output for the next pass.

+ *                  Reads src_ptr[1], i.e. one pixel to the right of every output column.

+ *                  Two filter taps should sum to VP8_FILTER_WEIGHT.

+ *

+ ****************************************************************************/

+void vp8_filter_block2d_bil_first_pass

+(

+    unsigned char  *src_ptr,

+    unsigned short *dst_ptr,

+    unsigned int    src_stride,

+    unsigned int    height,

+    unsigned int    width,

+    const short    *vp8_filter

+)

+{

+    unsigned int i, j;

+    for (i = 0; i < height; i++)

+    {

+        for (j = 0; j < width; j++)

+        {

+            /* Apply bilinear filter: weighted sum of this pixel and its right neighbour, rounded, then shifted down */

+            dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +

+                          ((int)src_ptr[1] * vp8_filter[1]) +

+                          (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;

+            src_ptr++;

+        }

+        /* Next row: src_ptr has already advanced width, so add the remainder of the source line */

+        src_ptr += src_stride - width;

+        dst_ptr += width;

+    }

+}

+/****************************************************************************

+ *

+ *  ROUTINE       : vp8_filter_block2d_bil_second_pass

+ *

+ *  INPUTS        : unsigned short *src_ptr    : Pointer to source block, laid out as

+ *                                               rows of width values back to back.

+ *                  int             dst_pitch  : Destination block pitch.

+ *                  unsigned int    height     : Block height.

+ *                  unsigned int    width      : Block width.

+ *                  const short    *vp8_filter : Array of 2 bi-linear filter taps.

+ *

+ *  OUTPUTS       : unsigned char  *dst_ptr    : Pointer to filtered block.

+ *

+ *  RETURNS       : void

+ *

+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block

+ *                  in the vertical direction to produce the filtered output

+ *                  block. Used to implement second-pass of 2-D separable filter.

+ *

+ *  SPECIAL NOTES : Requires unsigned short input as produced by

+ *                  vp8_filter_block2d_bil_first_pass. Reads src_ptr[width], i.e. the

+ *                  row below each output row, so the source must hold height + 1 rows.

+ *                  Two filter taps should sum to VP8_FILTER_WEIGHT.

+ *

+ ****************************************************************************/

+void vp8_filter_block2d_bil_second_pass

+(

+    unsigned short *src_ptr,

+    unsigned char  *dst_ptr,

+    int             dst_pitch,

+    unsigned int    height,

+    unsigned int    width,

+    const short    *vp8_filter

+)

+{

+    unsigned int  i, j;

+    int  Temp;

+    for (i = 0; i < height; i++)

+    {

+        for (j = 0; j < width; j++)

+        {

+            /* Apply filter: weighted sum of this value and the one directly below it, plus rounding */

+            Temp = ((int)src_ptr[0]     * vp8_filter[0]) +

+                   ((int)src_ptr[width] * vp8_filter[1]) +

+                   (VP8_FILTER_WEIGHT / 2);

+            dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);

+            src_ptr++;

+        }

+        /* Next row: only dst_ptr needs adjusting, src_ptr is already at the start of the next packed row */

+        dst_ptr += dst_pitch;

+    }

+}

+/****************************************************************************

+ *

+ *  ROUTINE       : vp8_filter_block2d_bil

+ *

+ *  INPUTS        : unsigned char *src_ptr   : Pointer to source block.

+ *                  unsigned int   src_pitch : Stride of source block.

+ *                  unsigned int   dst_pitch : Stride of destination block.

+ *                  const short   *HFilter   : Array of 2 horizontal filter taps.

+ *                  const short   *VFilter   : Array of 2 vertical filter taps.

+ *                  int            Width     : Block width

+ *                  int            Height    : Block height

+ *

+ *  OUTPUTS       : unsigned char *dst_ptr   : Pointer to filtered block.

+ *

+ *  RETURNS       : void

+ *

+ *  FUNCTION      : 2-D filters an input block by applying a 2-tap

+ *                  bi-linear filter horizontally followed by a 2-tap

+ *                  bi-linear filter vertically on the result.

+ *

+ *  SPECIAL NOTES : The largest block size that can be handled here is 16x16,

+ *                  because the intermediate buffer holds 17 rows of 16 values.

+ *

+ ****************************************************************************/

+void vp8_filter_block2d_bil

+(

+    unsigned char *src_ptr,

+    unsigned char *dst_ptr,

+    unsigned int   src_pitch,

+    unsigned int   dst_pitch,

+    const short   *HFilter,

+    const short   *VFilter,

+    int            Width,

+    int            Height

+)

+{

+    unsigned short FData[17*16];    /* Temp data buffer used in filtering */

+    /* First filter 1-D horizontally; one extra row is produced because the vertical pass reads the row below */

+    vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);

+    /* then 1-D vertically... */

+    vp8_filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);

+}

+void vp8_bilinear_predict4x4_c

+(

+    unsigned char  *src_ptr,

+    int   src_pixels_per_line,

+    int  xoffset,                   /* row index into vp8_bilinear_filters for the horizontal taps */

+    int  yoffset,                   /* row index into vp8_bilinear_filters for the vertical taps */

+    unsigned char *dst_ptr,

+    int dst_pitch

+)

+{   /* Bilinear prediction of a 4x4 block: looks up the two 2-tap kernels and

+     * runs the two-pass filter with Width = 4, Height = 4. */

+    const short *HFilter;

+    const short *VFilter;

+    HFilter = vp8_bilinear_filters[xoffset];

+    VFilter = vp8_bilinear_filters[yoffset];

+    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);

+}

+void vp8_bilinear_predict8x8_c

+(

+    unsigned char  *src_ptr,

+    int  src_pixels_per_line,

+    int  xoffset,                   /* row index into vp8_bilinear_filters for the horizontal taps */

+    int  yoffset,                   /* row index into vp8_bilinear_filters for the vertical taps */

+    unsigned char *dst_ptr,

+    int  dst_pitch

+)

+{   /* Bilinear prediction of an 8x8 block via the two-pass filter (Width = 8, Height = 8). */

+    const short *HFilter;

+    const short *VFilter;

+    HFilter = vp8_bilinear_filters[xoffset];

+    VFilter = vp8_bilinear_filters[yoffset];

+    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);

+}

+void vp8_bilinear_predict8x4_c

+(

+    unsigned char  *src_ptr,

+    int  src_pixels_per_line,

+    int  xoffset,                   /* row index into vp8_bilinear_filters for the horizontal taps */

+    int  yoffset,                   /* row index into vp8_bilinear_filters for the vertical taps */

+    unsigned char *dst_ptr,

+    int  dst_pitch

+)

+{   /* Bilinear prediction of a block 8 wide and 4 high via the two-pass filter (Width = 8, Height = 4). */

+    const short *HFilter;

+    const short *VFilter;

+    HFilter = vp8_bilinear_filters[xoffset];

+    VFilter = vp8_bilinear_filters[yoffset];

+    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);

+}

+void vp8_bilinear_predict16x16_c

+(

+    unsigned char  *src_ptr,

+    int  src_pixels_per_line,

+    int  xoffset,                   /* row index into vp8_bilinear_filters for the horizontal taps */

+    int  yoffset,                   /* row index into vp8_bilinear_filters for the vertical taps */

+    unsigned char *dst_ptr,

+    int  dst_pitch

+)

+{   /* Bilinear prediction of a 16x16 block via the two-pass filter (Width = 16, Height = 16),

+     * the largest size the intermediate buffer in vp8_filter_block2d_bil is sized for. */

+    const short *HFilter;

+    const short *VFilter;

+    HFilter = vp8_bilinear_filters[xoffset];

+    VFilter = vp8_bilinear_filters[yoffset];

+    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);

+}

--- /dev/null

+++ b/vp8/common/filter.h

@@ -1,0 +1,22 @@

+/*

+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef FILTER_H

+#define FILTER_H

+#define BLOCK_HEIGHT_WIDTH 4

+#define VP8_FILTER_WEIGHT 128

+#define VP8_FILTER_SHIFT  7

+extern const short vp8_bilinear_filters[8][2];

+extern const short vp8_sub_pel_filters[8][6];

+#endif /* FILTER_H */

--- a/vp8/common/filter_c.c

+++ /dev/null

@@ -1,540 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include <stdlib.h>

-#define BLOCK_HEIGHT_WIDTH 4

-#define VP8_FILTER_WEIGHT 128

-#define VP8_FILTER_SHIFT  7

-static const int bilinear_filters[8][2] =

-{

-    { 128,   0 },

-    { 112,  16 },

-    {  96,  32 },

-    {  80,  48 },

-    {  64,  64 },

-    {  48,  80 },

-    {  32,  96 },

-    {  16, 112 }

-};

-static const short sub_pel_filters[8][6] =

-{

-    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */

-    { 0, -6,  123,   12,  -1,  0 },

-    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */

-    { 0, -9,   93,   50,  -6,  0 },

-    { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */

-    { 0, -6,   50,   93,  -9,  0 },

-    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */

-    { 0, -1,   12,  123,  -6,  0 },

-};

-void vp8_filter_block2d_first_pass

-(

-    unsigned char *src_ptr,

-    int *output_ptr,

-    unsigned int src_pixels_per_line,

-    unsigned int pixel_step,

-    unsigned int output_height,

-    unsigned int output_width,

-    const short *vp8_filter

-)

-{

-    unsigned int i, j;

-    int  Temp;

-    for (i = 0; i < output_height; i++)

-    {

-        for (j = 0; j < output_width; j++)

-        {

-            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +

-                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +

-                   ((int)src_ptr[0]                 * vp8_filter[2]) +

-                   ((int)src_ptr[pixel_step]         * vp8_filter[3]) +

-                   ((int)src_ptr[2*pixel_step]       * vp8_filter[4]) +

-                   ((int)src_ptr[3*pixel_step]       * vp8_filter[5]) +

-                   (VP8_FILTER_WEIGHT >> 1);      /* Rounding */

-            /* Normalize back to 0-255 */

-            Temp = Temp >> VP8_FILTER_SHIFT;

-            if (Temp < 0)

-                Temp = 0;

-            else if (Temp > 255)

-                Temp = 255;

-            output_ptr[j] = Temp;

-            src_ptr++;

-        }

-        /* Next row... */

-        src_ptr    += src_pixels_per_line - output_width;

-        output_ptr += output_width;

-    }

-}

-void vp8_filter_block2d_second_pass

-(

-    int *src_ptr,

-    unsigned char *output_ptr,

-    int output_pitch,

-    unsigned int src_pixels_per_line,

-    unsigned int pixel_step,

-    unsigned int output_height,

-    unsigned int output_width,

-    const short *vp8_filter

-)

-{

-    unsigned int i, j;

-    int  Temp;

-    for (i = 0; i < output_height; i++)

-    {

-        for (j = 0; j < output_width; j++)

-        {

-            /* Apply filter */

-            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +

-                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +

-                   ((int)src_ptr[0]                 * vp8_filter[2]) +

-                   ((int)src_ptr[pixel_step]         * vp8_filter[3]) +

-                   ((int)src_ptr[2*pixel_step]       * vp8_filter[4]) +

-                   ((int)src_ptr[3*pixel_step]       * vp8_filter[5]) +

-                   (VP8_FILTER_WEIGHT >> 1);   /* Rounding */

-            /* Normalize back to 0-255 */

-            Temp = Temp >> VP8_FILTER_SHIFT;

-            if (Temp < 0)

-                Temp = 0;

-            else if (Temp > 255)

-                Temp = 255;

-            output_ptr[j] = (unsigned char)Temp;

-            src_ptr++;

-        }

-        /* Start next row */

-        src_ptr    += src_pixels_per_line - output_width;

-        output_ptr += output_pitch;

-    }

-}

-void vp8_filter_block2d

-(

-    unsigned char  *src_ptr,

-    unsigned char  *output_ptr,

-    unsigned int src_pixels_per_line,

-    int output_pitch,

-    const short  *HFilter,

-    const short  *VFilter

-)

-{

-    int FData[9*4]; /* Temp data bufffer used in filtering */

-    /* First filter 1-D horizontally... */

-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);

-    /* then filter verticaly... */

-    vp8_filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);

-}

-void vp8_block_variation_c

-(

-    unsigned char  *src_ptr,

-    int   src_pixels_per_line,

-    int *HVar,

-    int *VVar

-)

-{

-    int i, j;

-    unsigned char *Ptr = src_ptr;

-    for (i = 0; i < 4; i++)

-    {

-        for (j = 0; j < 4; j++)

-        {

-            *HVar += abs((int)Ptr[j] - (int)Ptr[j+1]);

-            *VVar += abs((int)Ptr[j] - (int)Ptr[j+src_pixels_per_line]);

-        }

-        Ptr += src_pixels_per_line;

-    }

-}

-void vp8_sixtap_predict_c

-(

-    unsigned char  *src_ptr,

-    int   src_pixels_per_line,

-    int  xoffset,

-    int  yoffset,

-    unsigned char *dst_ptr,

-    int dst_pitch

-)

-{

-    const short  *HFilter;

-    const short  *VFilter;

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */

-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */

-    vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);

-}

-void vp8_sixtap_predict8x8_c

-(

-    unsigned char  *src_ptr,

-    int  src_pixels_per_line,

-    int  xoffset,

-    int  yoffset,

-    unsigned char *dst_ptr,

-    int  dst_pitch

-)

-{

-    const short  *HFilter;

-    const short  *VFilter;

-    int FData[13*16];   /* Temp data bufffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */

-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */

-    /* First filter 1-D horizontally... */

-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);

-    /* then filter verticaly... */

-    vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);

-}

-void vp8_sixtap_predict8x4_c

-(

-    unsigned char  *src_ptr,

-    int  src_pixels_per_line,

-    int  xoffset,

-    int  yoffset,

-    unsigned char *dst_ptr,

-    int  dst_pitch

-)

-{

-    const short  *HFilter;

-    const short  *VFilter;

-    int FData[13*16];   /* Temp data bufffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */

-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */

-    /* First filter 1-D horizontally... */

-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);

-    /* then filter verticaly... */

-    vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);

-}

-void vp8_sixtap_predict16x16_c

-(

-    unsigned char  *src_ptr,

-    int  src_pixels_per_line,

-    int  xoffset,

-    int  yoffset,

-    unsigned char *dst_ptr,

-    int  dst_pitch

-)

-{

-    const short  *HFilter;

-    const short  *VFilter;

-    int FData[21*24];   /* Temp data bufffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */

-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */

-    /* First filter 1-D horizontally... */

-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);

-    /* then filter verticaly... */

-    vp8_filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);

-}

-/****************************************************************************

- *

- *  ROUTINE       : filter_block2d_bil_first_pass

- *

- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.

- *                  UINT32 src_pixels_per_line : Stride of input block.

- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).

- *                  UINT32 output_height     : Input block height.

- *                  UINT32 output_width      : Input block width.

- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.

- *

- *  OUTPUTS       : INT32 *output_ptr        : Pointer to filtered block.

- *

- *  RETURNS       : void

- *

- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in

- *                  either horizontal or vertical direction to produce the

- *                  filtered output block. Used to implement first-pass

- *                  of 2-D separable filter.

- *

- *  SPECIAL NOTES : Produces INT32 output to retain precision for next pass.

- *                  Two filter taps should sum to VP8_FILTER_WEIGHT.

- *                  pixel_step defines whether the filter is applied

- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).

- *                  It defines the offset required to move from one input

- *                  to the next.

- *

- ****************************************************************************/

-void vp8_filter_block2d_bil_first_pass

-(

-    unsigned char *src_ptr,

-    unsigned short *output_ptr,

-    unsigned int src_pixels_per_line,

-    int pixel_step,

-    unsigned int output_height,

-    unsigned int output_width,

-    const int *vp8_filter

-)

-{

-    unsigned int i, j;

-    for (i = 0; i < output_height; i++)

-    {

-        for (j = 0; j < output_width; j++)

-        {

-            /* Apply bilinear filter */

-            output_ptr[j] = (((int)src_ptr[0]          * vp8_filter[0]) +

-                             ((int)src_ptr[pixel_step] * vp8_filter[1]) +

-                             (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;

-            src_ptr++;

-        }

-        /* Next row... */

-        src_ptr    += src_pixels_per_line - output_width;

-        output_ptr += output_width;

-    }

-}

-/****************************************************************************

- *

- *  ROUTINE       : filter_block2d_bil_second_pass

- *

- *  INPUTS        : INT32  *src_ptr          : Pointer to source block.

- *                  UINT32 src_pixels_per_line : Stride of input block.

- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).

- *                  UINT32 output_height     : Input block height.

- *                  UINT32 output_width      : Input block width.

- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.

- *

- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.

- *

- *  RETURNS       : void

- *

- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in

- *                  either horizontal or vertical direction to produce the

- *                  filtered output block. Used to implement second-pass

- *                  of 2-D separable filter.

- *

- *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.

- *                  Two filter taps should sum to VP8_FILTER_WEIGHT.

- *                  pixel_step defines whether the filter is applied

- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).

- *                  It defines the offset required to move from one input

- *                  to the next.

- *

- ****************************************************************************/

-void vp8_filter_block2d_bil_second_pass

-(

-    unsigned short *src_ptr,

-    unsigned char  *output_ptr,

-    int output_pitch,

-    unsigned int  src_pixels_per_line,

-    unsigned int  pixel_step,

-    unsigned int  output_height,

-    unsigned int  output_width,

-    const int *vp8_filter

-)

-{

-    unsigned int  i, j;

-    int  Temp;

-    for (i = 0; i < output_height; i++)

-    {

-        for (j = 0; j < output_width; j++)

-        {

-            /* Apply filter */

-            Temp = ((int)src_ptr[0]         * vp8_filter[0]) +

-                   ((int)src_ptr[pixel_step] * vp8_filter[1]) +

-                   (VP8_FILTER_WEIGHT / 2);

-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);

-            src_ptr++;

-        }

-        /* Next row... */

-        src_ptr    += src_pixels_per_line - output_width;

-        output_ptr += output_pitch;

-    }

-}

-/****************************************************************************

- *

- *  ROUTINE       : filter_block2d_bil

- *

- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.

- *                  UINT32 src_pixels_per_line : Stride of input block.

- *                  INT32  *HFilter         : Array of 2 horizontal filter taps.

- *                  INT32  *VFilter         : Array of 2 vertical filter taps.

- *

- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.

- *

- *  RETURNS       : void

- *

- *  FUNCTION      : 2-D filters an input block by applying a 2-tap

- *                  bi-linear filter horizontally followed by a 2-tap

- *                  bi-linear filter vertically on the result.

- *

- *  SPECIAL NOTES : The largest block size can be handled here is 16x16

- *

- ****************************************************************************/

-void vp8_filter_block2d_bil

-(

-    unsigned char *src_ptr,

-    unsigned char *output_ptr,

-    unsigned int   src_pixels_per_line,

-    unsigned int   dst_pitch,

-    const int      *HFilter,

-    const int      *VFilter,

-    int            Width,

-    int            Height

-)

-{

-    unsigned short FData[17*16];    /* Temp data bufffer used in filtering */

-    /* First filter 1-D horizontally... */

-    vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, Height + 1, Width, HFilter);

-    /* then 1-D vertically... */

-    vp8_filter_block2d_bil_second_pass(FData, output_ptr, dst_pitch, Width, Width, Height, Width, VFilter);

-}

-void vp8_bilinear_predict4x4_c

-(

-    unsigned char  *src_ptr,

-    int   src_pixels_per_line,

-    int  xoffset,

-    int  yoffset,

-    unsigned char *dst_ptr,

-    int dst_pitch

-)

-{

-    const int  *HFilter;

-    const int  *VFilter;

-    HFilter = bilinear_filters[xoffset];

-    VFilter = bilinear_filters[yoffset];

-#if 0

-    {

-        int i;

-        unsigned char temp1[16];

-        unsigned char temp2[16];

-        bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);

-        vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);

-        for (i = 0; i < 16; i++)

-        {

-            if (temp1[i] != temp2[i])

-            {

-                bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);

-                vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);

-            }

-        }

-    }

-#endif

-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);

-}

-void vp8_bilinear_predict8x8_c

-(

-    unsigned char  *src_ptr,

-    int  src_pixels_per_line,

-    int  xoffset,

-    int  yoffset,

-    unsigned char *dst_ptr,

-    int  dst_pitch

-)

-{

-    const int  *HFilter;

-    const int  *VFilter;

-    HFilter = bilinear_filters[xoffset];

-    VFilter = bilinear_filters[yoffset];

-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);

-}

-void vp8_bilinear_predict8x4_c

-(

-    unsigned char  *src_ptr,

-    int  src_pixels_per_line,

-    int  xoffset,

-    int  yoffset,

-    unsigned char *dst_ptr,

-    int  dst_pitch

-)

-{

-    const int  *HFilter;

-    const int  *VFilter;

-    HFilter = bilinear_filters[xoffset];

-    VFilter = bilinear_filters[yoffset];

-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);

-}

-void vp8_bilinear_predict16x16_c

-(

-    unsigned char  *src_ptr,

-    int  src_pixels_per_line,

-    int  xoffset,

-    int  yoffset,

-    unsigned char *dst_ptr,

-    int  dst_pitch

-)

-{

-    const int  *HFilter;

-    const int  *VFilter;

-    HFilter = bilinear_filters[xoffset];

-    VFilter = bilinear_filters[yoffset];

-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);

-}

--- a/vp8/common/threading.h

+++ b/vp8/common/threading.h

@@ -14,7 +14,7 @@

 #define VPXINFINITE 10000       /* 10second. */

-#if CONFIG_OS_SUPPORT

+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD

 /* Thread management macros */

 #ifdef _WIN32

@@ -90,8 +90,6 @@

 #define x86_pause_hint()

 #endif

-#else /* CONFIG_OS_SUPPORT = 0 */

-#define THREAD_FUNCTION void *

-#endif /* CONFIG_OS_SUPPORT */

+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */

 #endif

--- a/vp8/decoder/decodframe.c

+++ b/vp8/decoder/decodframe.c

@@ -484,9 +484,11 @@

         bool_decoder++;

+#if CONFIG_MULTITHREAD

     /* Clamp number of decoder threads */

     if (pbi->decoding_thread_count > num_part - 1)

         pbi->decoding_thread_count = num_part - 1;

+#endif

@@ -844,7 +846,9 @@

     vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));

     /* set up frame new frame for intra coded blocks */

+#if CONFIG_MULTITHREAD

     if (!(pbi->b_multithreaded_rd) || pc->multi_token_partition == ONE_PARTITION || !(pc->filter_level))

+#endif

         vp8_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);

     vp8_setup_block_dptrs(xd);

@@ -864,6 +868,7 @@

     vpx_memcpy(&xd->block[0].bmi, &xd->mode_info_context->bmi[0], sizeof(B_MODE_INFO));

+#if CONFIG_MULTITHREAD

     if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)

         vp8mt_decode_mb_rows(pbi, xd);

@@ -878,6 +883,7 @@

         vp8_yv12_extend_frame_borders_ptr(&pc->yv12_fb[pc->new_fb_idx]);    /*cm->frame_to_show);*/

     else

+#endif

         int ibc = 0;

         int num_part = 1 << pc->multi_token_partition;

--- a/vp8/decoder/onyxd_if.c

+++ b/vp8/decoder/onyxd_if.c

@@ -114,8 +114,10 @@

     pbi->ready_for_new_data = 1;

     pbi->CPUFreq = 0; /*vp8_get_processor_freq();*/

+#if CONFIG_MULTITHREAD

     pbi->max_threads = oxcf->max_threads;

     vp8_decoder_create_threads(pbi);

+#endif

     /* vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid

      *  unnecessary calling of vp8cx_init_de_quantizer() for every frame.

@@ -149,8 +151,8 @@

 #if CONFIG_MULTITHREAD

     if (pbi->b_multithreaded_rd)

         vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);

-#endif

     vp8_decoder_remove_threads(pbi);

+#endif

     vp8_remove_common(&pbi->common);

     vpx_free(pbi);

@@ -407,6 +409,7 @@

         return retcode;

+#if CONFIG_MULTITHREAD

     if (pbi->b_multithreaded_rd && cm->multi_token_partition != ONE_PARTITION)

         if (swap_frame_buffers (cm))

@@ -424,6 +427,7 @@

             return -1;

     } else

+#endif

         if (swap_frame_buffers (cm))

--- a/vp8/decoder/onyxd_int.h

+++ b/vp8/decoder/onyxd_int.h

@@ -87,6 +87,9 @@

     unsigned int time_decoding;

     unsigned int time_loop_filtering;

+#if CONFIG_MULTITHREAD

+    /* variable for threading */

     volatile int b_multithreaded_rd;

     int max_threads;

     int current_mb_col_main;

@@ -93,8 +96,6 @@

     int decoding_thread_count;

     int allocated_decoding_thread_count;

-    /* variable for threading */

-#if CONFIG_MULTITHREAD

     int mt_baseline_filter_level[MAX_MB_SEGMENTS];

     int sync_range;

     int *mt_current_mb_col;                  /* Each row remembers its already decoded column. */

--- a/vp8/decoder/reconintra_mt.c

+++ b/vp8/decoder/reconintra_mt.c

@@ -21,7 +21,6 @@

 void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)

-#if CONFIG_MULTITHREAD

     unsigned char *yabove_row;    /* = x->dst.y_buffer - x->dst.y_stride; */

     unsigned char *yleft_col;

     unsigned char yleft_buf[16];

@@ -146,17 +145,10 @@

     case MB_MODE_COUNT:

         break;

-#else

-    (void) pbi;

-    (void) x;

-    (void) mb_row;

-    (void) mb_col;

-#endif

 void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)

-#if CONFIG_MULTITHREAD

     unsigned char *yabove_row;    /* = x->dst.y_buffer - x->dst.y_stride; */

     unsigned char *yleft_col;

     unsigned char yleft_buf[16];

@@ -289,17 +281,10 @@

     case MB_MODE_COUNT:

         break;

-#else

-    (void) pbi;

-    (void) x;

-    (void) mb_row;

-    (void) mb_col;

-#endif

 void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)

-#if CONFIG_MULTITHREAD

     unsigned char *uabove_row;   /* = x->dst.u_buffer - x->dst.uv_stride; */

     unsigned char *uleft_col;    /*[16];*/

     unsigned char uleft_buf[8];

@@ -452,17 +437,10 @@

     case MB_MODE_COUNT:

         break;

-#else

-    (void) pbi;

-    (void) x;

-    (void) mb_row;

-    (void) mb_col;

-#endif

 void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)

-#if CONFIG_MULTITHREAD

     unsigned char *uabove_row;  /* = x->dst.u_buffer - x->dst.uv_stride; */

     unsigned char *uleft_col;   /*[16];*/

     unsigned char uleft_buf[8];

@@ -621,12 +599,6 @@

     case MB_MODE_COUNT:

         break;

-#else

-    (void) pbi;

-    (void) x;

-    (void) mb_row;

-    (void) mb_col;

-#endif

@@ -638,7 +610,6 @@

                           int mb_col,

                           int num)

-#if CONFIG_MULTITHREAD

     int i, r, c;

     unsigned char *Above;   /* = *(x->base_dst) + x->dst - x->dst_stride; */

@@ -935,15 +906,6 @@

-#else

-    (void) pbi;

-    (void) xd;

-    (void) b_mode;

-    (void) predictor;

-    (void) mb_row;

-    (void) mb_col;

-    (void) num;

-#endif

 /* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and

@@ -951,7 +913,6 @@

*/

 void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)

-#if CONFIG_MULTITHREAD

     unsigned char *above_right;   /* = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16; */

     unsigned int *src_ptr;

     unsigned int *dst_ptr0;

@@ -973,10 +934,4 @@

     *dst_ptr0 = *src_ptr;

     *dst_ptr1 = *src_ptr;

     *dst_ptr2 = *src_ptr;

-#else

-    (void) pbi;

-    (void) x;

-    (void) mb_row;

-    (void) mb_col;

-#endif

--- a/vp8/decoder/threading.c

+++ b/vp8/decoder/threading.c

@@ -38,7 +38,6 @@

 void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)

-#if CONFIG_MULTITHREAD

     VP8_COMMON *const pc = & pbi->common;

     int i, j;

@@ -88,18 +87,11 @@

     for (i=0; i< pc->mb_rows; i++)

         pbi->mt_current_mb_col[i]=-1;

-#else

-    (void) pbi;

-    (void) xd;

-    (void) mbrd;

-    (void) count;

-#endif

 void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col)

-#if CONFIG_MULTITHREAD

     int eobtotal = 0;

     int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;

     VP8_COMMON *pc = &pbi->common;

@@ -222,18 +214,11 @@

                     (xd->qcoeff+16*16, xd->block[16].dequant,

                      xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,

                      xd->dst.uv_stride, xd->eobs+16);

-#else

-    (void) pbi;

-    (void) xd;

-    (void) mb_row;

-    (void) mb_col;

-#endif

 THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)

-#if CONFIG_MULTITHREAD

     int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;

     VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);

     MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);

@@ -438,9 +423,6 @@

             sem_post(&pbi->h_event_end_decoding);

-#else

-    (void) p_data;

-#endif

     return 0 ;

@@ -448,7 +430,6 @@

 void vp8_decoder_create_threads(VP8D_COMP *pbi)

-#if CONFIG_MULTITHREAD

     int core_count = 0;

     int ithread;

@@ -482,16 +463,11 @@

         pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;

-#else

-    (void) pbi;

-#endif

 void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)

-#if CONFIG_MULTITHREAD

     VP8_COMMON *const pc = & pbi->common;

     int i;

@@ -589,15 +565,11 @@

             pbi->mt_vleft_col = NULL ;

-#else

-    (void) pbi;

-#endif

 void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)

-#if CONFIG_MULTITHREAD

     VP8_COMMON *const pc = & pbi->common;

     int i;

     int uv_width;

@@ -646,17 +618,11 @@

         for (i=0; i< pc->mb_rows; i++)

             CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));

-#else

-    (void) pbi;

-    (void) width;

-#endif

 void vp8_decoder_remove_threads(VP8D_COMP *pbi)

-#if CONFIG_MULTITHREAD

     /* shutdown MB Decoding thread; */

     if (pbi->b_multithreaded_rd)

@@ -702,15 +668,11 @@

             pbi->de_thread_data = NULL;

-#else

-    (void) pbi;

-#endif

 void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)

-#if CONFIG_MULTITHREAD

     VP8_COMMON *cm  = &pbi->common;

     MACROBLOCKD *mbd = &pbi->mb;

     /*YV12_BUFFER_CONFIG *post = &cm->new_frame;*/  /*frame_to_show;*/

@@ -752,16 +714,11 @@

         vp8_init_loop_filter(cm);

     else if (frame_type != cm->last_frame_type)

         vp8_frame_init_loop_filter(lfi, frame_type);

-#else

-    (void) pbi;

-    (void) default_filt_lvl;

-#endif

 void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)

-#if CONFIG_MULTITHREAD

     int mb_row;

     VP8_COMMON *pc = &pbi->common;

@@ -981,8 +938,4 @@

     sem_wait(&pbi->h_event_end_decoding);   /* add back for each frame */

-#else

-    (void) pbi;

-    (void) xd;

-#endif

--- a/vp8/encoder/bitstream.c

+++ b/vp8/encoder/bitstream.c

@@ -1654,10 +1654,12 @@

         vp8_start_encode(&cpi->bc2, cx_data + bc->pos);

-        if (!cpi->b_multi_threaded)

-            pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);

-        else

+#if CONFIG_MULTITHREAD

+        if (cpi->b_multi_threaded)

             pack_mb_row_tokens(cpi, &cpi->bc2);

+        else

+#endif

+            pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);

         vp8_stop_encode(&cpi->bc2);

         oh.first_partition_length_in_bytes = cpi->bc.pos ;

--- a/vp8/encoder/encodeframe.c

+++ b/vp8/encoder/encodeframe.c

@@ -800,28 +800,9 @@

         struct vpx_usec_timer  emr_timer;

         vpx_usec_timer_start(&emr_timer);

-        if (!cpi->b_multi_threaded)

-        {

-            // for each macroblock row in image

-            for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)

-            {

-                vp8_zero(cm->left_context)

-                encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);

-                // adjust to the next row of mbs

-                x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;

-                x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;

-                x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;

-            }

-            cpi->tok_count = tp - cpi->tok;

-        }

-        else

-        {

 #if CONFIG_MULTITHREAD

+        if (cpi->b_multi_threaded)

+        {

             int i;

             vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1,  cpi->encoding_thread_count);

@@ -886,8 +867,26 @@

                 x->activity_sum += cpi->mb_row_ei[i].mb.activity_sum;

+        }

+        else

 #endif

+        {

+            // for each macroblock row in image

+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)

+            {

+                vp8_zero(cm->left_context)

+                encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);

+                // adjust to the next row of mbs

+                x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;

+                x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;

+                x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;

+            }

+            cpi->tok_count = tp - cpi->tok;

         vpx_usec_timer_mark(&emr_timer);

@@ -1166,7 +1165,7 @@

         Error16x16 = vp8_rd_pick_intra16x16mby_mode(cpi, x, &rate16x16, &rate16x16_tokenonly, &dist16x16);

-        Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4);

+        Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4, Error16x16);

         rate += (Error4x4 < Error16x16) ? rate4x4 : rate16x16;

--- a/vp8/encoder/encodemv.c

+++ b/vp8/encoder/encodemv.c

@@ -128,7 +128,7 @@

         while (--i > 3);

-        if (x & 240)

+        if (x & 0xFFF0)

             cost += vp8_cost_bit(p [MVPbits + 3], (x >> 3) & 1);

--- a/vp8/encoder/ethreading.c

+++ b/vp8/encoder/ethreading.c

@@ -13,6 +13,8 @@

 #include "common.h"

 #include "extend.h"

+#if CONFIG_MULTITHREAD

 extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,

                                          TOKENEXTRA **t, int recon_yoffset,

                                          int recon_uvoffset);

@@ -25,7 +27,6 @@

 static

 THREAD_FUNCTION thread_encoding_proc(void *p_data)

-#if CONFIG_MULTITHREAD

     int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;

     VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);

     MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);

@@ -247,10 +248,6 @@

-#else

-    (void) p_data;

-#endif

     //printf("exit thread %d\n", ithread);

     return 0;

@@ -436,10 +433,6 @@

     cpi->processor_core_count = 32; //vp8_get_proc_core_count();

-    CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));

-#if CONFIG_MULTITHREAD

     if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)

         int ithread;

@@ -488,13 +481,10 @@

-#endif

 void vp8cx_remove_encoder_threads(VP8_COMP *cpi)

-#if CONFIG_MULTITHREAD

     if (cpi->b_multi_threaded)

         //shutdown other threads

@@ -521,7 +511,5 @@

         vpx_free(cpi->en_thread_data);

         vpx_free(cpi->mt_current_mb_col);

-#endif

-    vpx_free(cpi->tplist);

+#endif

--- a/vp8/encoder/onyx_if.c

+++ b/vp8/encoder/onyx_if.c

@@ -331,6 +331,9 @@

 void vp8_dealloc_compressor_data(VP8_COMP *cpi)

+    vpx_free(cpi->tplist);

+    cpi->tplist = NULL;

     // Delete last frame MV storage buffers

     if (cpi->lfmv != 0)

         vpx_free(cpi->lfmv);

@@ -1545,6 +1548,8 @@

     else

         cpi->mt_sync_range = 16;

 #endif

+    CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));

@@ -2496,7 +2501,9 @@

     init_mv_ref_counts();

 #endif

+#if CONFIG_MULTITHREAD

     vp8cx_create_encoder_threads(cpi);

+#endif

     cpi->fn_ptr[BLOCK_16X16].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16);

     cpi->fn_ptr[BLOCK_16X16].vf             = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16);

@@ -2771,7 +2778,9 @@

+#if CONFIG_MULTITHREAD

     vp8cx_remove_encoder_threads(cpi);

+#endif

     vp8_dealloc_compressor_data(cpi);

     vpx_free(cpi->mb.ss);

--- a/vp8/encoder/onyx_int.h

+++ b/vp8/encoder/onyx_int.h

@@ -589,6 +589,7 @@

     int cyclic_refresh_q;

     signed char *cyclic_refresh_map;

+#if CONFIG_MULTITHREAD

     // multithread data

     int * mt_current_mb_col;

     int mt_sync_range;

@@ -596,13 +597,10 @@

     int b_multi_threaded;

     int encoding_thread_count;

-#if CONFIG_MULTITHREAD

     pthread_t *h_encoding_thread;

-#endif

     MB_ROW_COMP *mb_row_ei;

     ENCODETHREAD_DATA *en_thread_data;

-#if CONFIG_MULTITHREAD

     //events

     sem_t *h_event_start_encoding;

     sem_t h_event_end_encoding;

--- a/vp8/encoder/rdopt.c

+++ b/vp8/encoder/rdopt.c

@@ -645,7 +645,7 @@

     *Rate = vp8_rdcost_mby(mb);

-static void rd_pick_intra4x4block(

+static int rd_pick_intra4x4block(

     VP8_COMP *cpi,

     MACROBLOCK *x,

     BLOCK *be,

@@ -711,10 +711,13 @@

     b->bmi.mode = (B_PREDICTION_MODE)(*best_mode);

     vp8_encode_intra4x4block_rd(IF_RTCD(&cpi->rtcd), x, be, b, b->bmi.mode);

+    return best_rd;

-int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, int *rate_y, int *Distortion)

+int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,

+                                  int *rate_y, int *Distortion, int best_rd)

     MACROBLOCKD *const xd = &mb->e_mbd;

     int i;

@@ -721,6 +724,7 @@

     int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];

     int distortion = 0;

     int tot_rate_y = 0;

+    int total_rd = 0;

     ENTROPY_CONTEXT_PLANES t_above, t_left;

     ENTROPY_CONTEXT *ta;

     ENTROPY_CONTEXT *tl;

@@ -742,7 +746,7 @@

         B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);

         int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);

-        rd_pick_intra4x4block(

+        total_rd += rd_pick_intra4x4block(

             cpi, mb, mb->block + i, xd->block + i, &best_mode, A, L,

             ta + vp8_block2above[i],

             tl + vp8_block2left[i], &r, &ry, &d);

@@ -751,8 +755,14 @@

         distortion += d;

         tot_rate_y += ry;

         mic->bmi[i].mode = xd->block[i].bmi.mode = best_mode;

+        if(total_rd >= best_rd)

+          break;

+    if(total_rd >= best_rd)

+      return INT_MAX;

     *Rate = cost;

     *rate_y += tot_rate_y;

     *Distortion = distortion;

@@ -2025,15 +2035,28 @@

         switch (this_mode)

         case B_PRED:

+        {

+            int tmp_rd;

             // Note the rate value returned here includes the cost of coding the BPRED mode : x->mbmode_cost[x->e_mbd.frame_type][BPRED];

-            vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion);

+            tmp_rd = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion, best_yrd);

             rate2 += rate;

             distortion2 += distortion;

-            rate2 += uv_intra_rate;

-            rate_uv = uv_intra_rate_tokenonly;

-            distortion2 += uv_intra_distortion;

-            distortion_uv = uv_intra_distortion;

-            break;

+            if(tmp_rd < best_yrd)

+            {

+                rate2 += uv_intra_rate;

+                rate_uv = uv_intra_rate_tokenonly;

+                distortion2 += uv_intra_distortion;

+                distortion_uv = uv_intra_distortion;

+            }

+            else

+            {

+                this_rd = INT_MAX;

+                disable_skip = 1;

+            }

+        }

+        break;

         case SPLITMV:

--- a/vp8/encoder/rdopt.h

+++ b/vp8/encoder/rdopt.h

@@ -12,7 +12,7 @@

 #ifndef __INC_RDOPT_H

 #define __INC_RDOPT_H

 void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);

-int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion);

+int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion, int best_rd);

 int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *returnrate, int *rate_to, int *returndistortion);

 int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_to, int *distortion);

 extern int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);

--- a/vp8/encoder/temporal_filter.c

+++ b/vp8/encoder/temporal_filter.c

@@ -70,7 +70,7 @@

     // U & V

     mv_row >>= 1;

     mv_col >>= 1;

-    stride >>= 1;

+    stride = (stride + 1) >> 1;

     offset = (mv_row >> 3) * stride + (mv_col >> 3);

     uptr = u_mb_ptr + offset;

     vptr = v_mb_ptr + offset;

--- a/vp8/vp8_common.mk

+++ b/vp8/vp8_common.mk

@@ -35,7 +35,7 @@

 VP8_COMMON_SRCS-yes += common/entropymode.c

 VP8_COMMON_SRCS-yes += common/entropymv.c

 VP8_COMMON_SRCS-yes += common/extend.c

-VP8_COMMON_SRCS-yes += common/filter_c.c

+VP8_COMMON_SRCS-yes += common/filter.c

 VP8_COMMON_SRCS-yes += common/findnearmv.c

 VP8_COMMON_SRCS-yes += common/generic/systemdependent.c

 VP8_COMMON_SRCS-yes += common/idctllm.c

--- a/vp8/vp8_cx_iface.c

+++ b/vp8/vp8_cx_iface.c

@@ -934,8 +934,8 @@

         ctx->preview_img.x_chroma_shift = 1;

         ctx->preview_img.y_chroma_shift = 1;

-        ctx->preview_img.d_w = ctx->cfg.g_w;

-        ctx->preview_img.d_h = ctx->cfg.g_h;

+        ctx->preview_img.d_w = sd.y_width;

+        ctx->preview_img.d_h = sd.y_height;

         ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;

         ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;

         ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;

--- a/vp8/vp8cx.mk

+++ b/vp8/vp8cx.mk

@@ -42,7 +42,7 @@

 VP8_CX_SRCS-yes += encoder/encodeintra.c

 VP8_CX_SRCS-yes += encoder/encodemb.c

 VP8_CX_SRCS-yes += encoder/encodemv.c

-VP8_CX_SRCS-yes += encoder/ethreading.c

+VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c

 VP8_CX_SRCS-yes += encoder/firstpass.c

 VP8_CX_SRCS-yes += encoder/generic/csystemdependent.c

 VP8_CX_SRCS-yes += encoder/block.h

--- a/vp8/vp8dx.mk

+++ b/vp8/vp8dx.mk

@@ -65,7 +65,7 @@

 VP8_DX_SRCS-yes += decoder/onyxd_int.h

 VP8_DX_SRCS-yes += decoder/treereader.h

 VP8_DX_SRCS-yes += decoder/onyxd_if.c

-VP8_DX_SRCS-yes += decoder/threading.c

+VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c

 VP8_DX_SRCS-yes += decoder/idct_blk.c

 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.h

 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.c

--

⑨