shithub: libvpx

--- a/build/make/Makefile

+++ b/build/make/Makefile

@@ -124,6 +124,12 @@

 	$(if $(quiet),@echo "    [AS] $@")

 	$(qexec)$(AS) $(ASFLAGS) -o $@ $<

+.PRECIOUS: %.c.S

+%.c.S: CFLAGS += -DINLINE_ASM

+$(BUILD_PFX)%.c.S: %.c

+	$(if $(quiet),@echo "    [GEN] $@")

+	$(qexec)$(CC) -S $(CFLAGS) -o $@ $<

 .PRECIOUS: %.asm.s

 $(BUILD_PFX)%.asm.s: %.asm

 	$(if $(quiet),@echo "    [ASM CONVERSION] $@")

--- a/build/make/configure.sh

+++ b/build/make/configure.sh

@@ -412,11 +412,14 @@

 write_common_target_config_h() {

     cat > ${TMP_H} << EOF

 /* This file automatically generated by configure. Do not edit! */

+#ifndef VPX_CONFIG_H

+#define VPX_CONFIG_H

 #define RESTRICT    ${RESTRICT}

EOF

     print_config_h ARCH   "${TMP_H}" ${ARCH_LIST}

     print_config_h HAVE   "${TMP_H}" ${HAVE_LIST}

     print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST}

+    echo "#endif /* VPX_CONFIG_H */" >> ${TMP_H}

     mkdir -p `dirname "$1"`

     cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1"

@@ -689,7 +692,7 @@

             if enabled armv7

                 then

                     check_add_cflags --cpu=Cortex-A8 --fpu=softvfp+vfpv3

-                    check_add_asflags --cpu=Cortex-A8 --fpu=none

+                    check_add_asflags --cpu=Cortex-A8 --fpu=softvfp+vfpv3

                 else

                     check_add_cflags --cpu=${tgt_isa##armv}

                     check_add_asflags --cpu=${tgt_isa##armv}

@@ -751,41 +754,24 @@

         linux*)

             enable linux

             if enabled rvct; then

-                # Compiling with RVCT requires an alternate libc (glibc) when

-                # targetting linux.

-                disabled builtin_libc \

-                    || die "Must supply --libc when targetting *-linux-rvct"

+                # Check if we have CodeSourcery GCC in PATH. Needed for

+                # libraries

+                hash arm-none-linux-gnueabi-gcc 2>&- || \

+                  die "Couldn't find CodeSourcery GCC from PATH"

-                # Set up compiler

-                add_cflags --library_interface=aeabi_glibc

-                add_cflags --no_hide_all

-                add_cflags --dwarf2

+                # Use armcc as a linker to enable translation of

+                # some gcc specific options such as -lm and -lpthread.

+                LD="armcc --translate_gcc"

-                # Set up linker

-                add_ldflags --sysv --no_startup --no_ref_cpp_init

-                add_ldflags --entry=_start

-                add_ldflags --keep '"*(.init)"' --keep '"*(.fini)"'

-                add_ldflags --keep '"*(.init_array)"' --keep '"*(.fini_array)"'

-                add_ldflags --dynamiclinker=/lib/ld-linux.so.3

-                add_extralibs libc.so.6 -lc_nonshared crt1.o crti.o crtn.o

+                # create configuration file (uses path to CodeSourcery GCC)

+                armcc --arm_linux_configure --arm_linux_config_file=arm_linux.cfg

-                # Add the paths for the alternate libc

-                for d in usr/include; do

-                    try_dir="${alt_libc}/${d}"

-                    [ -d "${try_dir}" ] && add_cflags -J"${try_dir}"

-                done

-                add_cflags -J"${RVCT31INC}"

-                for d in lib usr/lib; do

-                    try_dir="${alt_libc}/${d}"

-                    [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"

-                done

-                # glibc has some struct members named __align, which is a

-                # storage modifier in RVCT. If we need to use this modifier,

-                # we'll have to #undef it in our code. Note that this must

-                # happen AFTER all libc inclues.

-                add_cflags -D__align=x_align_x

+                add_cflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg

+                add_asflags --no_hide_all --apcs=/interwork

+                add_ldflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg

+                enabled pic && add_cflags --apcs=/fpic

+                enabled pic && add_asflags --apcs=/fpic

+                enabled shared && add_cflags --shared

fi

;;

@@ -953,9 +939,13 @@

     enabled gcov &&

         check_add_cflags -fprofile-arcs -ftest-coverage &&

         check_add_ldflags -fprofile-arcs -ftest-coverage

     if enabled optimizations; then

-        enabled rvct && check_add_cflags -Otime

-        enabled small && check_add_cflags -O2 || check_add_cflags -O3

+        if enabled rvct; then

+            enabled small && check_add_cflags -Ospace || check_add_cflags -Otime

+        else

+            enabled small && check_add_cflags -O2 ||  check_add_cflags -O3

+        fi

fi

     # Position Independent Code (PIC) support, for building relocatable

--- a/build/make/gen_msvs_proj.sh

+++ b/build/make/gen_msvs_proj.sh

@@ -365,7 +365,7 @@

                             DebugInformationFormat="1" \

                             Detect64BitPortabilityProblems="true" \

-                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="1"

+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"

;;

*)

                         tag Tool \

@@ -379,7 +379,7 @@

                             DebugInformationFormat="1" \

                             Detect64BitPortabilityProblems="true" \

-                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="1"

+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"

;;

                 esac

;;

--- a/libs.mk

+++ b/libs.mk

@@ -184,7 +184,7 @@

 LIBVPX_SO               := libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)

 LIBS-$(BUILD_LIBVPX_SO) += $(BUILD_PFX)$(LIBVPX_SO)

 $(BUILD_PFX)$(LIBVPX_SO): $(LIBVPX_OBJS) libvpx.ver

-$(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm -pthread

+$(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm

 $(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(VERSION_MAJOR)

 $(BUILD_PFX)$(LIBVPX_SO): SO_VERSION_SCRIPT = libvpx.ver

 LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \

@@ -257,8 +257,24 @@

 # Calculate platform- and compiler-specific offsets for hand coded assembly

-ifeq ($(CONFIG_EXTERNAL_BUILD),) # Visual Studio uses obj_int_extract.bat

-  ifeq ($(ARCH_ARM), yes)

+ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))

+    asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.S

+	grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@

+    $(VP8_PREFIX)common/asm_com_offsets.c.S: vp8/common/asm_com_offsets.c

+    CLEAN-OBJS += asm_com_offsets.asm $(VP8_PREFIX)common/asm_com_offsets.c.S

+    asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.S

+	grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@

+    $(VP8_PREFIX)encoder/asm_enc_offsets.c.S: vp8/encoder/asm_enc_offsets.c

+    CLEAN-OBJS += asm_enc_offsets.asm $(VP8_PREFIX)encoder/asm_enc_offsets.c.S

+    asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.S

+	grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@

+    $(VP8_PREFIX)decoder/asm_dec_offsets.c.S: vp8/decoder/asm_dec_offsets.c

+    CLEAN-OBJS += asm_dec_offsets.asm $(VP8_PREFIX)decoder/asm_dec_offsets.c.S

+else

+  ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))

     asm_com_offsets.asm: obj_int_extract

     asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o

 	./obj_int_extract rvds $< $(ADS2GAS) > $@

@@ -265,28 +281,20 @@

     OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o

     CLEAN-OBJS += asm_com_offsets.asm

     $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm

-  endif

-  ifeq ($(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64), yes)

-    ifeq ($(CONFIG_VP8_ENCODER), yes)

-      asm_enc_offsets.asm: obj_int_extract

-      asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o

+    asm_enc_offsets.asm: obj_int_extract

+    asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o

 	./obj_int_extract rvds $< $(ADS2GAS) > $@

-      OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o

-      CLEAN-OBJS += asm_enc_offsets.asm

-      $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm

-    endif

-  endif

+    OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o

+    CLEAN-OBJS += asm_enc_offsets.asm

+    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm

-  ifeq ($(ARCH_ARM), yes)

-    ifeq ($(CONFIG_VP8_DECODER), yes)

-      asm_dec_offsets.asm: obj_int_extract

-      asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o

+    asm_dec_offsets.asm: obj_int_extract

+    asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o

 	./obj_int_extract rvds $< $(ADS2GAS) > $@

-      OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o

-      CLEAN-OBJS += asm_dec_offsets.asm

-      $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm

-    endif

+    OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o

+    CLEAN-OBJS += asm_dec_offsets.asm

+    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm

   endif

 endif

--- a/vp8/common/asm_com_offsets.c

+++ b/vp8/common/asm_com_offsets.c

@@ -9,27 +9,12 @@

*/

-#include "vpx_ports/config.h"

-#include <stddef.h>

+#include "vpx_ports/asm_offsets.h"

 #include "vpx_scale/yv12config.h"

-#define ct_assert(name,cond) \

-    static void assert_##name(void) UNUSED;\

-    static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}

+BEGIN

-#define DEFINE(sym, val) int sym = val;

-/*

-#define BLANK() asm volatile("\n->" : : )

-*/

-/*

- * int main(void)

- * {

- */

-//vpx_scale

+/* vpx_scale */

 DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));

 DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));

 DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));

@@ -41,9 +26,7 @@

 DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));

 DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));

-//add asserts for any offset that is not supported by assembly code

-//add asserts for any size that is not supported by assembly code

-/*

- * return 0;

- * }

- */

+END

+/* add asserts for any offset that is not supported by assembly code */

+/* add asserts for any size that is not supported by assembly code */

--- a/vp8/common/blockd.h

+++ b/vp8/common/blockd.h

@@ -252,6 +252,9 @@

     int mb_to_top_edge;

     int mb_to_bottom_edge;

+    int ref_frame_cost[MAX_REF_FRAMES];

     unsigned int frames_since_golden;

     unsigned int frames_till_alt_ref_frame;

     vp8_subpix_fn_t  subpixel_predict;

--- a/vp8/common/findnearmv.h

+++ b/vp8/common/findnearmv.h

@@ -123,7 +123,21 @@

         /* On L edge, get from MB to left of us */

         --cur_mb;

-        b += 4;

+        switch (cur_mb->mbmi.mode)

+        {

+            case B_PRED:

+              return (cur_mb->bmi + b + 3)->as_mode;

+            case DC_PRED:

+                return B_DC_PRED;

+            case V_PRED:

+                return B_VE_PRED;

+            case H_PRED:

+                return B_HE_PRED;

+            case TM_PRED:

+                return B_TM_PRED;

+            default:

+                return B_DC_PRED;

+        }

     return (cur_mb->bmi + b - 1)->as_mode;

@@ -135,7 +149,22 @@

         /* On top edge, get from MB above us */

         cur_mb -= mi_stride;

-        b += 16;

+        switch (cur_mb->mbmi.mode)

+        {

+            case B_PRED:

+              return (cur_mb->bmi + b + 12)->as_mode;

+            case DC_PRED:

+                return B_DC_PRED;

+            case V_PRED:

+                return B_VE_PRED;

+            case H_PRED:

+                return B_HE_PRED;

+            case TM_PRED:

+                return B_TM_PRED;

+            default:

+                return B_DC_PRED;

+        }

     return (cur_mb->bmi + b - 4)->as_mode;

--- a/vp8/decoder/asm_dec_offsets.c

+++ b/vp8/decoder/asm_dec_offsets.c

@@ -9,22 +9,11 @@

*/

-#include "vpx_ports/config.h"

-#include <stddef.h>

+#include "vpx_ports/asm_offsets.h"

 #include "onyxd_int.h"

-#define DEFINE(sym, val) int sym = val;

+BEGIN

-/*

-#define BLANK() asm volatile("\n->" : : )

-*/

-/*

- * int main(void)

- * {

- */

 DEFINE(detok_scan,                              offsetof(DETOK, scan));

 DEFINE(detok_ptr_block2leftabove,               offsetof(DETOK, ptr_block2leftabove));

 DEFINE(detok_coef_tree_ptr,                     offsetof(DETOK, vp8_coef_tree_ptr));

@@ -49,9 +38,7 @@

 DEFINE(tokenextrabits_min_val,                  offsetof(TOKENEXTRABITS, min_val));

 DEFINE(tokenextrabits_length,                   offsetof(TOKENEXTRABITS, Length));

-//add asserts for any offset that is not supported by assembly code

-//add asserts for any size that is not supported by assembly code

-/*

- * return 0;

- * }

- */

+END

+/* add asserts for any offset that is not supported by assembly code */

+/* add asserts for any size that is not supported by assembly code */

--- a/vp8/decoder/decodemv.c

+++ b/vp8/decoder/decodemv.c

@@ -101,36 +101,6 @@

                 while (++i < 16);

-            else

-            {

-                int BMode;

-                int i = 0;

-                switch (y_mode)

-                {

-                case DC_PRED:

-                    BMode = B_DC_PRED;

-                    break;

-                case V_PRED:

-                    BMode = B_VE_PRED;

-                    break;

-                case H_PRED:

-                    BMode = B_HE_PRED;

-                    break;

-                case TM_PRED:

-                    BMode = B_TM_PRED;

-                    break;

-                default:

-                    BMode = B_DC_PRED;

-                    break;

-                }

-                do

-                {

-                    m->bmi[i].as_mode = (B_PREDICTION_MODE)BMode;

-                }

-                while (++i < 16);

-            }

             m->mbmi.uv_mode = (MB_PREDICTION_MODE)vp8_read_uv_mode(bc, pbi->common.kf_uv_mode_prob);

--- a/vp8/decoder/decodframe.c

+++ b/vp8/decoder/decodframe.c

@@ -533,7 +533,10 @@

     VP8_COMMON *pc = &pbi->common;

     if (pc->multi_token_partition != ONE_PARTITION)

+    {

         vpx_free(pbi->mbc);

+        pbi->mbc = NULL;

+    }

 static void init_frame(VP8D_COMP *pbi)

--- a/vp8/decoder/onyxd_if.c

+++ b/vp8/decoder/onyxd_if.c

@@ -129,6 +129,7 @@

     vp8_de_alloc_overlap_lists(pbi);

 #endif

     vp8_remove_common(&pbi->common);

+    vpx_free(pbi->mbc);

     vpx_free(pbi);

--- a/vp8/decoder/threading.c

+++ b/vp8/decoder/threading.c

@@ -135,7 +135,7 @@

         mb_init_dequantizer(pbi, xd);

     /* do prediction */

-    if (xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)

+    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)

         vp8mt_build_intra_predictors_mbuv(pbi, xd, mb_row, mb_col);

@@ -181,7 +181,7 @@

                          xd->predictor, xd->dst.y_buffer,

                          xd->dst.y_stride, xd->eobs, xd->block[24].diff);

-    else if ((xd->frame_type == KEY_FRAME  ||  xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)

+    else if (xd->mode_info_context->mbmi.mode == B_PRED)

         for (i = 0; i < 16; i++)

@@ -334,7 +334,7 @@

                                 MODE_INFO *next = xd->mode_info_context +1;

-                                if (xd->frame_type == KEY_FRAME  ||  next->mbmi.ref_frame == INTRA_FRAME)

+                                if (next->mbmi.ref_frame == INTRA_FRAME)

                                     for (i = 0; i < 16; i++)

                                         pbi->mt_yleft_col[mb_row][i] = xd->dst.y_buffer [i* recon_y_stride + 15];

@@ -824,7 +824,7 @@

                         MODE_INFO *next = xd->mode_info_context +1;

-                        if (xd->frame_type == KEY_FRAME  ||  next->mbmi.ref_frame == INTRA_FRAME)

+                        if (next->mbmi.ref_frame == INTRA_FRAME)

                             for (i = 0; i < 16; i++)

                                 pbi->mt_yleft_col[mb_row][i] = xd->dst.y_buffer [i* recon_y_stride + 15];

--- a/vp8/decoder/treereader.h

+++ b/vp8/decoder/treereader.h

@@ -38,27 +38,4 @@

     return -i;

-/* Variant reads a binary number given distributions on each bit.

-   Note that tree is arbitrary; probability of decoding a zero

-   may or may not depend on previously decoded bits. */

-static int vp8_treed_read_num(

-    vp8_reader *const r,        /* !!! must return a 0 or 1 !!! */

-    vp8_tree t,

-    const vp8_prob *const p

-)

-{

-    vp8_tree_index i = 0;

-    int v = 0, b;

-    do

-    {

-        b = vp8_read(r, p[i>>1]);

-        v = (v << 1) + b;

-    }

-    while ((i = t[i+b]) > 0);

-    return v;

-}

 #endif /* tree_reader_h */

--- a/vp8/decoder/x86/x86_dsystemdependent.c

+++ b/vp8/decoder/x86/x86_dsystemdependent.c

@@ -17,7 +17,7 @@

 #if HAVE_MMX

 void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);

-static void dequantize_b_mmx(BLOCKD *d)

+void vp8_dequantize_b_mmx(BLOCKD *d)

     short *sq = (short *) d->qcoeff;

     short *dq = (short *) d->dqcoeff;

@@ -28,6 +28,7 @@

 void vp8_arch_x86_decode_init(VP8D_COMP *pbi)

+#if CONFIG_RUNTIME_CPU_DETECT

     int flags = x86_simd_caps();

     /* Note:

@@ -36,12 +37,11 @@

      * you modify any of the function mappings present in this file, be sure

      * to also update them in static mapings (<arch>/filename_<arch>.h)

*/

-#if CONFIG_RUNTIME_CPU_DETECT

     /* Override default functions with fastest ones for this CPU. */

 #if HAVE_MMX

     if (flags & HAS_MMX)

-        pbi->dequant.block               = dequantize_b_mmx;

+        pbi->dequant.block               = vp8_dequantize_b_mmx;

         pbi->dequant.idct_add            = vp8_dequant_idct_add_mmx;

         pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_mmx;

         pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx;

--- a/vp8/encoder/arm/arm_csystemdependent.c

+++ b/vp8/encoder/arm/arm_csystemdependent.c

@@ -53,10 +53,7 @@

         cpi->rtcd.variance.mse16x16              = vp8_mse16x16_armv6;

         /*cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/

-        /*cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_c;

-        cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;

-        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;;

-        cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;*/

+        /*cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;*/

         /*cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;

         cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;*/

@@ -103,9 +100,6 @@

         cpi->rtcd.variance.mse16x16              = vp8_mse16x16_neon;

         /*cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/

-        cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_neon;

-        /*cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;

-        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;*/

         cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_neon;

         cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_neon;

--- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm

+++ b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm

@@ -10,7 +10,6 @@

     EXPORT  |vp8_mse16x16_neon|

-    EXPORT  |vp8_get16x16pred_error_neon|

     EXPORT  |vp8_get4x4sse_cs_neon|

ARM

@@ -76,62 +75,6 @@

     ENDP

-;============================

-; r0    unsigned char *src_ptr

-; r1    int src_stride

-; r2    unsigned char *ref_ptr

-; r3    int ref_stride

-|vp8_get16x16pred_error_neon| PROC

-    vmov.i8         q8, #0                      ;q8 - sum

-    vmov.i8         q9, #0                      ;q9, q10 - pred_error

-    vmov.i8         q10, #0

-    mov             r12, #8

-get16x16pred_error_neon_loop

-    vld1.8          {q0}, [r0], r1              ;Load up source and reference

-    vld1.8          {q2}, [r2], r3

-    vld1.8          {q1}, [r0], r1

-    vld1.8          {q3}, [r2], r3

-    vsubl.u8        q11, d0, d4

-    vsubl.u8        q12, d1, d5

-    vsubl.u8        q13, d2, d6

-    vsubl.u8        q14, d3, d7

-    vpadal.s16      q8, q11

-    vmlal.s16       q9, d22, d22

-    vmlal.s16       q10, d23, d23

-    subs            r12, r12, #1

-    vpadal.s16      q8, q12

-    vmlal.s16       q9, d24, d24

-    vmlal.s16       q10, d25, d25

-    vpadal.s16      q8, q13

-    vmlal.s16       q9, d26, d26

-    vmlal.s16       q10, d27, d27

-    vpadal.s16      q8, q14

-    vmlal.s16       q9, d28, d28

-    vmlal.s16       q10, d29, d29

-    bne             get16x16pred_error_neon_loop

-    vadd.u32        q10, q9, q10

-    vpaddl.s32      q0, q8

-    vpaddl.u32      q1, q10

-    vadd.s64        d0, d0, d1

-    vadd.u64        d1, d2, d3

-    vmull.s32       q5, d0, d0

-    vshr.s32        d10, d10, #8

-    vsub.s32        d0, d1, d10

-    vmov.32         r0, d0[0]

-    bx              lr

-    ENDP

 ;=============================

 ; r0    unsigned char *src_ptr,

--- a/vp8/encoder/arm/variance_arm.h

+++ b/vp8/encoder/arm/variance_arm.h

@@ -83,9 +83,6 @@

 //extern prototype_getmbss(vp8_get_mb_ss_c);

 extern prototype_variance(vp8_mse16x16_neon);

-extern prototype_get16x16prederror(vp8_get16x16pred_error_neon);

-//extern prototype_variance2(vp8_get8x8var_c);

-//extern prototype_variance2(vp8_get16x16var_c);

 extern prototype_get16x16prederror(vp8_get4x4sse_cs_neon);

 #if !CONFIG_RUNTIME_CPU_DETECT

@@ -148,15 +145,6 @@

 #undef  vp8_variance_mse16x16

 #define vp8_variance_mse16x16 vp8_mse16x16_neon

-#undef  vp8_variance_get16x16prederror

-#define vp8_variance_get16x16prederror vp8_get16x16pred_error_neon

-//#undef  vp8_variance_get8x8var

-//#define vp8_variance_get8x8var vp8_get8x8var_c

-//#undef  vp8_variance_get16x16var

-//#define vp8_variance_get16x16var vp8_get16x16var_c

 #undef  vp8_variance_get4x4sse_cs

 #define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_neon

--- a/vp8/encoder/asm_enc_offsets.c

+++ b/vp8/encoder/asm_enc_offsets.c

@@ -9,9 +9,8 @@

*/

-#include "vpx_ports/config.h"

-#include <stddef.h>

+#include "vpx_ports/asm_offsets.h"

+#include "vpx_config.h"

 #include "block.h"

 #include "vp8/common/blockd.h"

 #include "onyx_int.h"

@@ -18,22 +17,9 @@

 #include "treewriter.h"

 #include "tokenize.h"

-#define ct_assert(name,cond) \

-    static void assert_##name(void) UNUSED;\

-    static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}

+BEGIN

-#define DEFINE(sym, val) int sym = val;

-/*

-#define BLANK() asm volatile("\n->" : : )

-*/

-/*

- * int main(void)

- * {

- */

-//regular quantize

+/* regular quantize */

 DEFINE(vp8_block_coeff,                         offsetof(BLOCK, coeff));

 DEFINE(vp8_block_zbin,                          offsetof(BLOCK, zbin));

 DEFINE(vp8_block_round,                         offsetof(BLOCK, round));

@@ -48,7 +34,7 @@

 DEFINE(vp8_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));

 DEFINE(vp8_blockd_eob,                          offsetof(BLOCKD, eob));

-// subtract

+/* subtract */

 DEFINE(vp8_block_base_src,                      offsetof(BLOCK, base_src));

 DEFINE(vp8_block_src,                           offsetof(BLOCK, src));

 DEFINE(vp8_block_src_diff,                      offsetof(BLOCK, src_diff));

@@ -56,7 +42,7 @@

 DEFINE(vp8_blockd_predictor,                    offsetof(BLOCKD, predictor));

-//pack tokens

+/* pack tokens */

 DEFINE(vp8_writer_lowvalue,                     offsetof(vp8_writer, lowvalue));

 DEFINE(vp8_writer_range,                        offsetof(vp8_writer, range));

 DEFINE(vp8_writer_value,                        offsetof(vp8_writer, value));

@@ -90,16 +76,16 @@

 DEFINE(vp8_common_mb_rows,                      offsetof(VP8_COMMON, mb_rows));

-// These two sizes are used in vp8cx_pack_tokens.  They are hard coded

-// so if the size changes this will have to be adjusted.

+END

+/* add asserts for any offset that is not supported by assembly code

+ * add asserts for any size that is not supported by assembly code

+ * These are used in vp8cx_pack_tokens.  They are hard coded so if their sizes

+ * change they will have to be adjusted.

+ */

 #if HAVE_ARMV5TE

 ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)

 ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)

 #endif

-//add asserts for any offset that is not supported by assembly code

-//add asserts for any size that is not supported by assembly code

-/*

- * return 0;

- * }

- */

--- a/vp8/encoder/bitstream.c

+++ b/vp8/encoder/bitstream.c

@@ -776,9 +776,9 @@

     vp8_writer *w, MB_PREDICTION_MODE m, const vp8_prob *p

+#if CONFIG_DEBUG

     assert(NEARESTMV <= m  &&  m <= SPLITMV);

+#endif

     vp8_write_token(w, vp8_mv_ref_tree, p,

                     vp8_mv_ref_encoding_array - NEARESTMV + m);

@@ -788,8 +788,9 @@

     vp8_writer *w, B_PREDICTION_MODE m, const vp8_prob *p

+#if CONFIG_DEBUG

     assert(LEFT4X4 <= m  &&  m <= NEW4X4);

+#endif

     vp8_write_token(w, vp8_sub_mv_ref_tree, p,

                     vp8_sub_mv_ref_encoding_array - LEFT4X4 + m);

@@ -1017,11 +1018,13 @@

                         blockmode =  cpi->mb.partition_info->bmi[j].mode;

                         blockmv =  cpi->mb.partition_info->bmi[j].mv;

+#if CONFIG_DEBUG

                         while (j != L[++k])

                             if (k >= 16)

                                 assert(0);

+#else

+                        while (j != L[++k]);

+#endif

                         leftmv.as_int = left_block_mv(m, k);

                         abovemv.as_int = above_block_mv(m, k, mis);

                         mv_contz = vp8_mv_cont(&leftmv, &abovemv);

--- a/vp8/encoder/encodeframe.c

+++ b/vp8/encoder/encodeframe.c

@@ -50,6 +50,7 @@

 void vp8_setup_block_ptrs(MACROBLOCK *x);

 int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset);

 int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);

+static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x );

 #ifdef MODE_STATS

 unsigned int inter_y_modes[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

@@ -84,8 +85,6 @@

     unsigned int act;

     unsigned int sse;

-    int sum;

     /* TODO: This could also be done over smaller areas (8x8), but that would

      *  require extensive changes elsewhere, as lambda is assumed to be fixed

      *  over an entire MB in most of the code.

@@ -93,15 +92,10 @@

      *  lambda using a non-linear combination (e.g., the smallest, or second

      *  smallest, etc.).

*/

-    VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)(x->src.y_buffer,

-                    x->src.y_stride, VP8_VAR_OFFS, 0, &sse, &sum);

+    act =     VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)(x->src.y_buffer,

+                    x->src.y_stride, VP8_VAR_OFFS, 0, &sse);

+    act = act<<4;

-    /* This requires a full 32 bits of precision. */

-    act = (sse<<8) - sum*sum;

-    /* Drop 4 to give us some headroom to work with. */

-    act = (act + 8) >> 4;

     /* If the region is flat, lower the activity some more. */

     if (act < 8<<12)

         act = act < 5<<12 ? act : 5<<12;

@@ -110,70 +104,121 @@

 // Stub for alternative experimental activity measures.

-static unsigned int alt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )

+static unsigned int alt_activity_measure( VP8_COMP *cpi,

+                                          MACROBLOCK *x, int use_dc_pred )

-    unsigned int mb_activity = VP8_ACTIVITY_AVG_MIN;

-    x->e_mbd.mode_info_context->mbmi.mode = DC_PRED;

-    x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;

-    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

-    vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);

-    mb_activity = VARIANCE_INVOKE(&cpi->rtcd.variance, getmbss)(x->src_diff);

-    return mb_activity;

+    return vp8_encode_intra(cpi,x, use_dc_pred);

 // Measure the activity of the current macroblock

 // What we measure here is TBD so abstracted to this function

-static unsigned int mb_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )

+#define ALT_ACT_MEASURE 1

+static unsigned int mb_activity_measure( VP8_COMP *cpi, MACROBLOCK *x,

+                                  int mb_row, int mb_col)

     unsigned int mb_activity;

-    if  ( 1 )

+    if  ( ALT_ACT_MEASURE )

-        // Original activity measure from Tim T's code.

-        mb_activity = tt_activity_measure( cpi, x );

+        int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);

+        // Or use an alternative.

+        mb_activity = alt_activity_measure( cpi, x, use_dc_pred );

     else

-        // Or use and alternative.

-        mb_activity = alt_activity_measure( cpi, x );

+        // Original activity measure from Tim T's code.

+        mb_activity = tt_activity_measure( cpi, x );

+    if ( mb_activity < VP8_ACTIVITY_AVG_MIN )

+        mb_activity = VP8_ACTIVITY_AVG_MIN;

     return mb_activity;

 // Calculate an "average" mb activity value for the frame

+#define ACT_MEDIAN 0

 static void calc_av_activity( VP8_COMP *cpi, INT64 activity_sum )

+#if ACT_MEDIAN

+    // Find median: Simple n^2 algorithm for experimentation

+    {

+        unsigned int median;

+        unsigned int i,j;

+        unsigned int * sortlist;

+        unsigned int tmp;

+        // Create a list to sort to

+        CHECK_MEM_ERROR(sortlist,

+                        vpx_calloc(sizeof(unsigned int),

+                        cpi->common.MBs));

+        // Copy map to sort list

+        vpx_memcpy( sortlist, cpi->mb_activity_map,

+                    sizeof(unsigned int) * cpi->common.MBs );

+        // Ripple each value down to its correct position

+        for ( i = 1; i < cpi->common.MBs; i ++ )

+        {

+            for ( j = i; j > 0; j -- )

+            {

+                if ( sortlist[j] < sortlist[j-1] )

+                {

+                    // Swap values

+                    tmp = sortlist[j-1];

+                    sortlist[j-1] = sortlist[j];

+                    sortlist[j] = tmp;

+                }

+                else

+                    break;

+            }

+        }

+        // Median: rounded mean of the two middle entries of the sorted list.
+        // (MBs-1)>>1 and MBs>>1 are the same index when MBs is odd, and both
+        // stay inside sortlist[0 .. MBs-1] for every MBs >= 1.

+        median = ( 1 + sortlist[(cpi->common.MBs - 1) >> 1] +

+                   sortlist[cpi->common.MBs >> 1] ) >> 1;

+        cpi->activity_avg = median;

+        vpx_free(sortlist);

+    }

+#else

     // Simple mean for now

     cpi->activity_avg = (unsigned int)(activity_sum/cpi->common.MBs);

+#endif

     if (cpi->activity_avg < VP8_ACTIVITY_AVG_MIN)

         cpi->activity_avg = VP8_ACTIVITY_AVG_MIN;

+    // Experimental code: return fixed value normalized for several clips

+    if  ( ALT_ACT_MEASURE )

+        cpi->activity_avg = 100000;

+#define USE_ACT_INDEX   0

 #define OUTPUT_NORM_ACT_STATS   0

-// Calculate a normalized activity value for each mb

-static void calc_norm_activity( VP8_COMP *cpi, MACROBLOCK *x )

+#if USE_ACT_INDEX

+// Calculate an activity index for each mb

+static void calc_activity_index( VP8_COMP *cpi, MACROBLOCK *x )

     VP8_COMMON *const cm = & cpi->common;

     int mb_row, mb_col;

-    unsigned int act;

-    unsigned int a;

-    unsigned int b;

+    INT64 act;

+    INT64 a;

+    INT64 b;

 #if OUTPUT_NORM_ACT_STATS

     FILE *f = fopen("norm_act.stt", "a");

-    fprintf(f, "\n");

+    fprintf(f, "\n%12d\n", cpi->activity_avg );

 #endif

     // Reset pointers to start of activity map

     x->mb_activity_ptr = cpi->mb_activity_map;

-    x->mb_norm_activity_ptr = cpi->mb_norm_activity_map;

     // Calculate normalized mb activity number.

     for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)

@@ -185,25 +230,19 @@

             act = *(x->mb_activity_ptr);

             // Calculate a normalized activity number

-            a = act + 2*cpi->activity_avg;

-            b = 2*act + cpi->activity_avg;

+            a = act + 4*cpi->activity_avg;

+            b = 4*act + cpi->activity_avg;

             if ( b >= a )

-                *(x->mb_norm_activity_ptr) = (int)((b + (a>>1))/a);

+                *(x->activity_ptr) = (int)((b + (a>>1))/a) - 1;

             else

-                *(x->mb_norm_activity_ptr) = -(int)((a + (b>>1))/b);

+                *(x->activity_ptr) = 1 - (int)((a + (b>>1))/b);

-            if ( *(x->mb_norm_activity_ptr) == 0 )

-            {

-                *(x->mb_norm_activity_ptr) = 1;

-            }

 #if OUTPUT_NORM_ACT_STATS

-            fprintf(f, " %6d", *(x->mb_norm_activity_ptr));

+            fprintf(f, " %6d", *(x->mb_activity_ptr));

 #endif

             // Increment activity map pointers

             x->mb_activity_ptr++;

-            x->mb_norm_activity_ptr++;

 #if OUTPUT_NORM_ACT_STATS

@@ -217,33 +256,44 @@

 #endif

+#endif

 // Loop through all MBs. Note activity of each, average activity and

 // calculate a normalized activity for each

 static void build_activity_map( VP8_COMP *cpi )

     MACROBLOCK *const x = & cpi->mb;

+    MACROBLOCKD *xd = &x->e_mbd;

     VP8_COMMON *const cm = & cpi->common;

+#if ALT_ACT_MEASURE

+    YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];

+    int recon_yoffset;

+    int recon_y_stride = new_yv12->y_stride;

+#endif

     int mb_row, mb_col;

     unsigned int mb_activity;

     INT64 activity_sum = 0;

-    // Initialise source buffer pointer

-    x->src = *cpi->Source;

-    // Set pointer to start of activity map

-    x->mb_activity_ptr = cpi->mb_activity_map;

     // for each macroblock row in image

     for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)

+#if ALT_ACT_MEASURE

+        // reset above block coeffs

+        xd->up_available = (mb_row != 0);

+        recon_yoffset = (mb_row * recon_y_stride * 16);

+#endif

         // for each macroblock col in image

         for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)

+#if ALT_ACT_MEASURE

+            xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;

+            xd->left_available = (mb_col != 0);

+            recon_yoffset += 16;

+#endif

             // measure activity

-            mb_activity = mb_activity_measure( cpi, x );

+            mb_activity = mb_activity_measure( cpi, x, mb_row, mb_col );

             // Keep frame sum

             activity_sum += mb_activity;

@@ -258,49 +308,50 @@

             x->src.y_buffer += 16;

         // adjust to the next row of mbs

         x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;

+#if ALT_ACT_MEASURE

+        //extend the recon for intra prediction

+        vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,

+                          xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);

+#endif

     // Calculate an "average" MB activity

     calc_av_activity(cpi, activity_sum);

-    // Calculate a normalized activity number of each mb

-    calc_norm_activity( cpi, x );

+#if USE_ACT_INDEX

+    // Calculate an activity index number of each mb

+    calc_activity_index( cpi, x );

+#endif

-// Activity masking based on Tim T's original code

+// Macroblock activity masking

 void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x)

+#if USE_ACT_INDEX

+    x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2);

+    x->errorperbit = x->rdmult/x->rddiv;

+#else

+    INT64 a;

+    INT64 b;

+    INT64 act = *(x->mb_activity_ptr);

-    unsigned int a;

-    unsigned int b;

-    unsigned int act = *(x->mb_activity_ptr);

     // Apply the masking to the RD multiplier.

-    a = act + 2*cpi->activity_avg;

-    b = 2*act + cpi->activity_avg;

+    a = act + (2*cpi->activity_avg);

+    b = (2*act) + cpi->activity_avg;

-    //tmp = (unsigned int)(((INT64)tmp*b + (a>>1))/a);

     x->rdmult = (unsigned int)(((INT64)x->rdmult*b + (a>>1))/a);

+    x->errorperbit = x->rdmult/x->rddiv;

-    // For now now zbin adjustment on mode choice

-    x->act_zbin_adj = 0;

-}

+#endif

-// Stub function to use a normalized activity measure stored at mb level.

-void vp8_norm_activity_masking(VP8_COMP *cpi, MACROBLOCK *x)

-{

-    int norm_act;

-    norm_act = *(x->mb_norm_activity_ptr);

-    if (norm_act > 0)

-        x->rdmult = norm_act * (x->rdmult);

-    else

-        x->rdmult = -(x->rdmult / norm_act);

-    // For now now zbin adjustment on mode choice

-    x->act_zbin_adj = 0;

+    // Activity based Zbin adjustment

+    adjust_act_zbin(cpi, x);

 static

@@ -356,7 +407,6 @@

     // Set the mb activity pointer to the start of the row.

     x->mb_activity_ptr = &cpi->mb_activity_map[map_index];

-    x->mb_norm_activity_ptr = &cpi->mb_norm_activity_map[map_index];

     // for each macroblock col in image

     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)

@@ -476,7 +526,6 @@

         // Increment the activity mask pointers.

         x->mb_activity_ptr++;

-        x->mb_norm_activity_ptr++;

         /* save the block info */

         for (i = 0; i < 16; i++)

@@ -525,6 +574,92 @@

 #endif

+void init_encode_frame_mb_context(VP8_COMP *cpi)

+{

+    MACROBLOCK *const x = & cpi->mb;

+    VP8_COMMON *const cm = & cpi->common;

+    MACROBLOCKD *const xd = & x->e_mbd;

+    // GF active flags data structure

+    x->gf_active_ptr = (signed char *)cpi->gf_active_flags;

+    // Activity map pointer

+    x->mb_activity_ptr = cpi->mb_activity_map;

+    x->vector_range = 32;

+    x->act_zbin_adj = 0;

+    x->partition_info = x->pi;

+    xd->mode_info_context = cm->mi;

+    xd->mode_info_stride = cm->mode_info_stride;

+    xd->frame_type = cm->frame_type;

+    xd->frames_since_golden = cm->frames_since_golden;

+    xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;

+    // reset intra mode contexts

+    if (cm->frame_type == KEY_FRAME)

+        vp8_init_mbmode_probs(cm);

+    // Copy data over into macro block data structures.

+    x->src = * cpi->Source;

+    xd->pre = cm->yv12_fb[cm->lst_fb_idx];

+    xd->dst = cm->yv12_fb[cm->new_fb_idx];

+    // set up frame for intra coded blocks

+    vp8_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);

+    vp8_build_block_offsets(x);

+    vp8_setup_block_dptrs(&x->e_mbd);

+    vp8_setup_block_ptrs(x);

+    xd->mode_info_context->mbmi.mode = DC_PRED;

+    xd->mode_info_context->mbmi.uv_mode = DC_PRED;

+    xd->left_context = &cm->left_context;

+    vp8_zero(cpi->count_mb_ref_frame_usage)

+    vp8_zero(cpi->ymode_count)

+    vp8_zero(cpi->uv_mode_count)

+    x->mvc = cm->fc.mvc;

+    vpx_memset(cm->above_context, 0,

+               sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);

+    xd->ref_frame_cost[INTRA_FRAME]   = vp8_cost_zero(cpi->prob_intra_coded);

+    // Special case treatment when GF and ARF are not sensible options for reference

+    if (cpi->ref_frame_flags == VP8_LAST_FLAG)

+    {

+        xd->ref_frame_cost[LAST_FRAME]    = vp8_cost_one(cpi->prob_intra_coded)

+                                        + vp8_cost_zero(255);

+        xd->ref_frame_cost[GOLDEN_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)

+                                        + vp8_cost_one(255)

+                                        + vp8_cost_zero(128);

+        xd->ref_frame_cost[ALTREF_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)

+                                        + vp8_cost_one(255)

+                                        + vp8_cost_one(128);

+    }

+    else

+    {

+        xd->ref_frame_cost[LAST_FRAME]    = vp8_cost_one(cpi->prob_intra_coded)

+                                        + vp8_cost_zero(cpi->prob_last_coded);

+        xd->ref_frame_cost[GOLDEN_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)

+                                        + vp8_cost_one(cpi->prob_last_coded)

+                                        + vp8_cost_zero(cpi->prob_gf_coded);

+        xd->ref_frame_cost[ALTREF_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)

+                                        + vp8_cost_one(cpi->prob_last_coded)

+                                        + vp8_cost_one(cpi->prob_gf_coded);

+    }

+}

 void vp8_encode_frame(VP8_COMP *cpi)

     int mb_row;

@@ -536,6 +671,17 @@

     int segment_counts[MAX_MB_SEGMENTS];

     int totalrate;

+    vpx_memset(segment_counts, 0, sizeof(segment_counts));

+    totalrate = 0;

+    if (cpi->compressor_speed == 2)

+    {

+        if (cpi->oxcf.cpu_used < 0)

+            cpi->Speed = -(cpi->oxcf.cpu_used);

+        else

+            vp8_auto_select_speed(cpi);

+    }

     // Functions setup for all frame types so we can use MC in AltRef

     if (cm->mcomp_filter_type == SIXTAP)

@@ -560,10 +706,6 @@

                                       &cpi->common.rtcd.subpix, bilinear16x16);

-    x->gf_active_ptr = (signed char *)cpi->gf_active_flags;     // Point to base of GF active flags data structure

-    x->vector_range = 32;

     // Reset frame count of inter 0,0 motion vector useage.

     cpi->inter_zz_count = 0;

@@ -574,8 +716,6 @@

     cpi->skip_true_count = 0;

     cpi->skip_false_count = 0;

-    x->act_zbin_adj = 0;

 #if 0

     // Experimental code

     cpi->frame_distortion = 0;

@@ -582,81 +722,28 @@

     cpi->last_mb_distortion = 0;

 #endif

-    totalrate = 0;

-    x->partition_info = x->pi;

     xd->mode_info_context = cm->mi;

-    xd->mode_info_stride = cm->mode_info_stride;

-    xd->frame_type = cm->frame_type;

-    xd->frames_since_golden = cm->frames_since_golden;

-    xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;

     vp8_zero(cpi->MVcount);

-    // vp8_zero( Contexts)

     vp8_zero(cpi->coef_counts);

-    // reset intra mode contexts

-    if (cm->frame_type == KEY_FRAME)

-        vp8_init_mbmode_probs(cm);

     vp8cx_frame_init_quantizer(cpi);

-    if (cpi->compressor_speed == 2)

-    {

-        if (cpi->oxcf.cpu_used < 0)

-            cpi->Speed = -(cpi->oxcf.cpu_used);

-        else

-            vp8_auto_select_speed(cpi);

-    }

     vp8_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);

     vp8cx_initialize_me_consts(cpi, cm->base_qindex);

-    // Copy data over into macro block data sturctures.

-    x->src = * cpi->Source;

-    xd->pre = cm->yv12_fb[cm->lst_fb_idx];

-    xd->dst = cm->yv12_fb[cm->new_fb_idx];

-    // set up frame new frame for intra coded blocks

-    vp8_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);

-    vp8_build_block_offsets(x);

-    vp8_setup_block_dptrs(&x->e_mbd);

-    vp8_setup_block_ptrs(x);

-    xd->mode_info_context->mbmi.mode = DC_PRED;

-    xd->mode_info_context->mbmi.uv_mode = DC_PRED;

-    xd->left_context = &cm->left_context;

-    vp8_zero(cpi->count_mb_ref_frame_usage)

-    vp8_zero(cpi->ymode_count)

-    vp8_zero(cpi->uv_mode_count)

-    x->mvc = cm->fc.mvc;

-    vpx_memset(cm->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);

     if(cpi->oxcf.tuning == VP8_TUNE_SSIM)

-        if(1)

-        {

-            // Build a frame level activity map

-            build_activity_map(cpi);

-        }

+        // Initialize encode frame context.

+        init_encode_frame_mb_context(cpi);

-        // Reset various MB pointers.

-        x->src = *cpi->Source;

-        x->mb_activity_ptr = cpi->mb_activity_map;

-        x->mb_norm_activity_ptr = cpi->mb_norm_activity_map;

+        // Build a frame level activity map

+        build_activity_map(cpi);

+    // Re-initialize encode frame context.

+    init_encode_frame_mb_context(cpi);

         struct vpx_usec_timer  emr_timer;

         vpx_usec_timer_start(&emr_timer);

@@ -997,99 +1084,45 @@

 // Experimental stub function to create a per MB zbin adjustment based on

 // some previously calculated measure of MB activity.

-void adjust_act_zbin( VP8_COMP *cpi, int rate, MACROBLOCK *x )

+static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x )

-    INT64 act;

+#if USE_ACT_INDEX

+    x->act_zbin_adj = *(x->mb_activity_ptr);

+#else

     INT64 a;

     INT64 b;

+    INT64 act = *(x->mb_activity_ptr);

-    // Read activity from the map

-    act = (INT64)(*(x->mb_activity_ptr));

-    // Calculate a zbin adjustment for this mb

+    // Calculate a zbin adjustment for this mb

     a = act + 4*cpi->activity_avg;

     b = 4*act + cpi->activity_avg;

-    if ( b > a )

-        //x->act_zbin_adj = (char)((b * 8) / a) - 8;

-        x->act_zbin_adj = 8;

-    else

-        x->act_zbin_adj = 0;

-    // Tmp force to 0 to disable.

-    x->act_zbin_adj = 0;

+    if ( act > cpi->activity_avg )

+        x->act_zbin_adj = (int)(((INT64)b + (a>>1))/a) - 1;

+    else

+        x->act_zbin_adj = 1 - (int)(((INT64)a + (b>>1))/b);

+#endif

 int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)

-    int Error4x4, Error16x16;

-    int rate4x4, rate16x16, rateuv;

-    int dist4x4, dist16x16, distuv;

-    int rate = 0;

-    int rate4x4_tokenonly = 0;

-    int rate16x16_tokenonly = 0;

-    int rateuv_tokenonly = 0;

+    int rate;

-    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

     if (cpi->sf.RD && cpi->compressor_speed != 2)

-    {

-        vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);

-        rate += rateuv;

-        Error16x16 = vp8_rd_pick_intra16x16mby_mode(cpi, x, &rate16x16, &rate16x16_tokenonly, &dist16x16);

-        Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4, Error16x16);

-        rate += (Error4x4 < Error16x16) ? rate4x4 : rate16x16;

-        if(cpi->oxcf.tuning == VP8_TUNE_SSIM)

-        {

-            adjust_act_zbin( cpi, rate, x );

-            vp8_update_zbin_extra(cpi, x);

-        }

-    }

+        vp8_rd_pick_intra_mode(cpi, x, &rate);

     else

-    {

-        int rate2, best_distortion;

-        MB_PREDICTION_MODE mode, best_mode = DC_PRED;

-        int this_rd;

-        Error16x16 = INT_MAX;

+        vp8_pick_intra_mode(cpi, x, &rate);

-        vp8_pick_intra_mbuv_mode(x);

-        for (mode = DC_PRED; mode <= TM_PRED; mode ++)

-        {

-            int distortion2;

-            x->e_mbd.mode_info_context->mbmi.mode = mode;

-            RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)

-                (&x->e_mbd);

-            distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16);

-            rate2  = x->mbmode_cost[x->e_mbd.frame_type][mode];

-            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);

-            if (Error16x16 > this_rd)

-            {

-                Error16x16 = this_rd;

-                best_mode = mode;

-                best_distortion = distortion2;

-            }

-        }

-        x->e_mbd.mode_info_context->mbmi.mode = best_mode;

-        Error4x4 = vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate2, &best_distortion);

+    if(cpi->oxcf.tuning == VP8_TUNE_SSIM)

+    {

+        adjust_act_zbin( cpi, x );

+        vp8_update_zbin_extra(cpi, x);

-    if (Error4x4 < Error16x16)

-    {

-        x->e_mbd.mode_info_context->mbmi.mode = B_PRED;

+    if (x->e_mbd.mode_info_context->mbmi.mode == B_PRED)

         vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);

-    }

     else

-    {

         vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);

-    }

     vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);

     sum_intra_stats(cpi, x);

@@ -1163,7 +1196,7 @@

     if(cpi->oxcf.tuning == VP8_TUNE_SSIM)

         // Adjust the zbin based on this MB rate.

-        adjust_act_zbin( cpi, rate, x );

+        adjust_act_zbin( cpi, x );

 #if 0

@@ -1193,11 +1226,10 @@

         // Experimental code. Special case for gf and arf zeromv modes.

         // Increase zbin size to supress noise

+        cpi->zbin_mode_boost = 0;

         if (cpi->zbin_mode_boost_enabled)

-            if ( xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME )

-                 cpi->zbin_mode_boost = 0;

-            else

+            if ( xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME )

                 if (xd->mode_info_context->mbmi.mode == ZEROMV)

@@ -1212,9 +1244,6 @@

                     cpi->zbin_mode_boost = MV_ZBIN_BOOST;

-        else

-            cpi->zbin_mode_boost = 0;

         vp8_update_zbin_extra(cpi, x);

--- a/vp8/encoder/encodeintra.c

+++ b/vp8/encoder/encodeintra.c

@@ -28,7 +28,35 @@

 #define IF_RTCD(x) NULL

 #endif

+int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)

+{

+    int i;

+    int intra_pred_var = 0;

+    (void) cpi;

+    if (use_dc_pred)

+    {

+        x->e_mbd.mode_info_context->mbmi.mode = DC_PRED;

+        x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;

+        x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

+        vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);

+    }

+    else

+    {

+        for (i = 0; i < 16; i++)

+        {

+            x->e_mbd.block[i].bmi.as_mode = B_DC_PRED;

+            vp8_encode_intra4x4block(IF_RTCD(&cpi->rtcd), x, i);

+        }

+    }

+    intra_pred_var = VARIANCE_INVOKE(&cpi->rtcd.variance, getmbss)(x->src_diff);

+    return intra_pred_var;

+}

 void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,

                               MACROBLOCK *x, int ib)

@@ -81,30 +109,6 @@

     RECON_INVOKE(&rtcd->common->recon, recon_mby)

         (IF_RTCD(&rtcd->common->recon), &x->e_mbd);

-    // make sure block modes are set the way we want them for context updates

-    for (b = 0; b < 16; b++)

-    {

-        BLOCKD *d = &x->e_mbd.block[b];

-        switch (x->e_mbd.mode_info_context->mbmi.mode)

-        {

-        case DC_PRED:

-            d->bmi.as_mode = B_DC_PRED;

-            break;

-        case V_PRED:

-            d->bmi.as_mode = B_VE_PRED;

-            break;

-        case H_PRED:

-            d->bmi.as_mode = B_HE_PRED;

-            break;

-        case TM_PRED:

-            d->bmi.as_mode = B_TM_PRED;

-            break;

-        default:

-            d->bmi.as_mode = B_DC_PRED;

-            break;

-        }

-    }

 void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

@@ -124,4 +128,3 @@

     vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);

--- a/vp8/encoder/encodeintra.h

+++ b/vp8/encoder/encodeintra.h

@@ -13,6 +13,7 @@

 #define _ENCODEINTRA_H_

 #include "onyx_int.h"

+int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred);

 void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *, MACROBLOCK *x);

 void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *, MACROBLOCK *x);

 void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *, MACROBLOCK *mb);

--- a/vp8/encoder/ethreading.c

+++ b/vp8/encoder/ethreading.c

@@ -114,8 +114,6 @@

                 // Set the mb activity pointer to the start of the row.

                 x->mb_activity_ptr = &cpi->mb_activity_map[map_index];

-                x->mb_norm_activity_ptr =

-                    &cpi->mb_norm_activity_map[map_index];

                 // for each macroblock col in image

                 for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)

@@ -230,7 +228,6 @@

                     // Increment the activity mask pointers.

                     x->mb_activity_ptr++;

-                    x->mb_norm_activity_ptr++;

                     /* save the block info */

                     for (i = 0; i < 16; i++)

--- a/vp8/encoder/firstpass.c

+++ b/vp8/encoder/firstpass.c

@@ -81,35 +81,6 @@

 static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);

-static int encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)

-{

-    int i;

-    int intra_pred_var = 0;

-    (void) cpi;

-    if (use_dc_pred)

-    {

-        x->e_mbd.mode_info_context->mbmi.mode = DC_PRED;

-        x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;

-        x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

-        vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);

-    }

-    else

-    {

-        for (i = 0; i < 16; i++)

-        {

-            x->e_mbd.block[i].bmi.as_mode = B_DC_PRED;

-            vp8_encode_intra4x4block(IF_RTCD(&cpi->rtcd), x, i);

-        }

-    }

-    intra_pred_var = VARIANCE_INVOKE(&cpi->rtcd.variance, getmbss)(x->src_diff);

-    return intra_pred_var;

-}

 // Resets the first pass file to the given position using a relative seek from the current position

 static void reset_fpf_position(VP8_COMP *cpi, FIRSTPASS_STATS *Position)

@@ -243,33 +214,58 @@

     int max_bits;

     // For CBR we need to also consider buffer fullness.

-    // If we are running below the optimal level then we need to gradually tighten up on max_bits.

     if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)

-        double buffer_fullness_ratio = (double)cpi->buffer_level / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.optimal_buffer_level);

+        max_bits = 2 * cpi->av_per_frame_bandwidth;

+        max_bits -= cpi->buffered_av_per_frame_bandwidth;

+        max_bits *= ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0);

+    }

+    // VBR

+    else

+    {

+        // For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user

+        max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats->count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));

+    }

-        // For CBR base this on the target average bits per frame plus the maximum sedction rate passed in by the user

-        max_bits = (int)(cpi->av_per_frame_bandwidth * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));

+    // Trap case where we are out of bits

+    if (max_bits < 0)

+        max_bits = 0;

-        // If our buffer is below the optimum level

-        if (buffer_fullness_ratio < 1.0)

-        {

-            // The lower of max_bits / 4 or cpi->av_per_frame_bandwidth / 4.

-            int min_max_bits = ((cpi->av_per_frame_bandwidth >> 2) < (max_bits >> 2)) ? cpi->av_per_frame_bandwidth >> 2 : max_bits >> 2;

+    return max_bits;

+}

-            max_bits = (int)(max_bits * buffer_fullness_ratio);

-            if (max_bits < min_max_bits)

-                max_bits = min_max_bits;       // Lowest value we will set ... which should allow the buffer to refil.

+static int gf_group_max_bits(VP8_COMP *cpi)

+{

+    // Max allocation for a golden frame group

+    int max_bits;

+    // For CBR we need to also consider buffer fullness.

+    if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)

+    {

+        max_bits = cpi->av_per_frame_bandwidth * cpi->baseline_gf_interval;

+        if (max_bits > cpi->oxcf.optimal_buffer_level)

+        {

+            max_bits -= cpi->oxcf.optimal_buffer_level;

+            max_bits += cpi->buffer_level;

+        else

+        {

+            max_bits -= (cpi->buffered_av_per_frame_bandwidth

+                         - cpi->av_per_frame_bandwidth)

+                        * cpi->baseline_gf_interval;

+        }

+        max_bits *= ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0);

-    // VBR

     else

         // For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user

         max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats->count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));

+        max_bits *=  cpi->baseline_gf_interval;

     // Trap case where we are out of bits

     if (max_bits < 0)

         max_bits = 0;

@@ -582,7 +578,7 @@

             xd->left_available = (mb_col != 0);

             // do intra 16x16 prediction

-            this_error = encode_intra(cpi, x, use_dc_pred);

+            this_error = vp8_encode_intra(cpi, x, use_dc_pred);

             // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)

             // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.

@@ -1362,7 +1358,7 @@

     double abs_mv_in_out_accumulator = 0.0;

     double mod_err_per_mb_accumulator = 0.0;

-    int max_bits = frame_max_bits(cpi);     // Max for a single frame

+    int max_group_bits;

     unsigned int allow_alt_ref =

                     cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;

@@ -1715,8 +1711,9 @@

     cpi->twopass.gf_group_bits = (cpi->twopass.gf_group_bits < 0) ? 0 : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits;

     // Clip cpi->twopass.gf_group_bits based on user supplied data rate variability limit (cpi->oxcf.two_pass_vbrmax_section)

-    if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval)

-        cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval;

+    max_group_bits = gf_group_max_bits(cpi);

+    if (cpi->twopass.gf_group_bits > max_group_bits)

+        cpi->twopass.gf_group_bits = max_group_bits;

     // Reset the file position

     reset_fpf_position(cpi, start_pos);

@@ -1725,14 +1722,15 @@

     cpi->twopass.modified_error_used += gf_group_err;

     // Assign  bits to the arf or gf.

-    {

+    for (i = 0; i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME); i++) {

         int Boost;

         int frames_in_section;

         int allocation_chunks;

         int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;

+        int gf_bits;

         // For ARF frames

-        if (cpi->source_alt_ref_pending)

+        if (cpi->source_alt_ref_pending && i == 0)

             Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);

             //Boost += (cpi->baseline_gf_interval * 25);

@@ -1771,7 +1769,7 @@

         // Calculate the number of bits to be spent on the gf or arf based on the boost number

-        cpi->twopass.gf_bits = (int)((double)Boost * (cpi->twopass.gf_group_bits / (double)allocation_chunks));

+        gf_bits = (int)((double)Boost * (cpi->twopass.gf_group_bits / (double)allocation_chunks));

         // If the frame that is to be boosted is simpler than the average for

         // the gf/arf group then use an alternative calculation

@@ -1789,9 +1787,9 @@

             alt_gf_bits = (int)((double)Boost * (alt_gf_grp_bits /

                                                  (double)allocation_chunks));

-            if (cpi->twopass.gf_bits > alt_gf_bits)

+            if (gf_bits > alt_gf_bits)

-                cpi->twopass.gf_bits = alt_gf_bits;

+                gf_bits = alt_gf_bits;

         // Else if it is harder than other frames in the group make sure it at

@@ -1804,23 +1802,29 @@

                       mod_frame_err /

                       DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left));

-            if (alt_gf_bits > cpi->twopass.gf_bits)

+            if (alt_gf_bits > gf_bits)

-                cpi->twopass.gf_bits = alt_gf_bits;

+                gf_bits = alt_gf_bits;

-        // Apply an additional limit for CBR

-        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)

+        // Don't allow a negative value for gf_bits

+        if (gf_bits < 0)

+            gf_bits = 0;

+        gf_bits += cpi->min_frame_bandwidth;                     // Add in minimum for a frame

+        if (i == 0)

-            if (cpi->twopass.gf_bits > (cpi->buffer_level >> 1))

-                cpi->twopass.gf_bits = cpi->buffer_level >> 1;

+            cpi->twopass.gf_bits = gf_bits;

+        if (i == 1 || (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)))

+        {

+            cpi->per_frame_bandwidth = gf_bits;                 // Per frame bit target for this frame

+        }

+    }

-        // Dont allow a negative value for gf_bits

-        if (cpi->twopass.gf_bits < 0)

-            cpi->twopass.gf_bits = 0;

+    {

         // Adjust KF group bits and error remainin

         cpi->twopass.kf_group_error_left -= gf_group_err;

         cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;

@@ -1835,7 +1839,7 @@

         else

             cpi->twopass.gf_group_error_left = gf_group_err;

-        cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits;

+        cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits - cpi->min_frame_bandwidth;

         if (cpi->twopass.gf_group_bits < 0)

             cpi->twopass.gf_group_bits = 0;

@@ -1851,15 +1855,8 @@

         else

             cpi->twopass.mid_gf_extra_bits = 0;

-        cpi->twopass.gf_bits += cpi->min_frame_bandwidth;                                              // Add in minimum for a frame

-    if (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))                  // Normal GF and not a KF

-    {

-        cpi->per_frame_bandwidth = cpi->twopass.gf_bits;                                               // Per frame bit target for this frame

-    }

     // Adjustment to estimate_max_q based on a measure of complexity of the section

     if (cpi->common.frame_type != KEY_FRAME)

@@ -1907,12 +1904,6 @@

     int max_bits = frame_max_bits(cpi);    // Max for a single frame

-    // The final few frames have special treatment

-    if (cpi->frames_till_gf_update_due >= (int)(cpi->twopass.total_stats->count - cpi->common.current_video_frame))

-    {

-        cpi->twopass.gf_group_bits = (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;;

-    }

     // Calculate modified prediction error used in bit allocation

     modified_err = calculate_modified_err(cpi, this_frame);

@@ -2014,22 +2005,10 @@

         if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))

             // Assign a standard frames worth of bits from those allocated to the GF group

+            int bak = cpi->per_frame_bandwidth;

             vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));

             assign_std_frame_bits(cpi, &this_frame_copy);

-            // If appropriate (we are switching into ARF active but it was not previously active) apply a boost for the gf at the start of the group.

-            //if ( !cpi->source_alt_ref_active && (cpi->gfu_boost > 150) )

-            if (FALSE)

-            {

-                int extra_bits;

-                int pct_extra = (cpi->gfu_boost - 100) / 50;

-                pct_extra = (pct_extra > 20) ? 20 : pct_extra;

-                extra_bits = (cpi->twopass.gf_group_bits * pct_extra) / 100;

-                cpi->twopass.gf_group_bits -= extra_bits;

-                cpi->per_frame_bandwidth += extra_bits;

-            }

+            cpi->per_frame_bandwidth = bak;

--- a/vp8/encoder/generic/csystemdependent.c

+++ b/vp8/encoder/generic/csystemdependent.c

@@ -67,9 +67,6 @@

     cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;

     cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;

-    cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_c;

-    cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;

-    cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;;

     cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;

     cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;

--- a/vp8/encoder/onyx_if.c

+++ b/vp8/encoder/onyx_if.c

@@ -1546,6 +1546,7 @@

     cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;

     cpi->long_rolling_target_bits     = cpi->av_per_frame_bandwidth;

     cpi->long_rolling_actual_bits     = cpi->av_per_frame_bandwidth;

+    cpi->buffered_av_per_frame_bandwidth = cpi->av_per_frame_bandwidth;

     cpi->total_actual_bits            = 0;

     cpi->total_target_vs_actual       = 0;

@@ -1641,7 +1642,7 @@

         break;

-    if (cpi->pass == 0)

+    if (cpi->pass == 0 && cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER)

         cpi->auto_worst_q = 1;

     cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];

@@ -3528,7 +3529,8 @@

     // For CBR if the buffer reaches its maximum level then we can no longer

     // save up bits for later frames so we might as well use them up

     // on the current frame.

-    if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&

+    if (cpi->pass == 2

+        && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&

         (cpi->buffer_level >= cpi->oxcf.optimal_buffer_level) && cpi->buffered_mode)

         int Adjustment = cpi->active_worst_quality / 4;       // Max adjustment is 1/4

@@ -3619,6 +3621,9 @@

         else

+            if(cpi->pass != 2)

+                Q = cpi->avg_frame_qindex;

             cpi->active_best_quality = inter_minq[Q];

             // For the constant/constrained quality mode we dont want

@@ -3931,15 +3936,16 @@

             (cpi->active_worst_quality < cpi->worst_quality)      &&

             (cpi->projected_frame_size > frame_over_shoot_limit))

-            int over_size_percent = ((cpi->projected_frame_size - frame_over_shoot_limit) * 100) / frame_over_shoot_limit;

+            /* step down active_worst_quality such that the corresponding

+             * active_best_quality will be equal to the current

+             * active_worst_quality + 1

+             */

+            int i;

-            // If so is there any scope for relaxing it

-            while ((cpi->active_worst_quality < cpi->worst_quality) && (over_size_percent > 0))

-            {

-                cpi->active_worst_quality++;

-                top_index = cpi->active_worst_quality;

-                over_size_percent = (int)(over_size_percent * 0.96);        // Assume 1 qstep = about 4% on frame size.

-            }

+            for(i=cpi->active_worst_quality; i<cpi->worst_quality; i++)

+                if(inter_minq[i] >= cpi->active_worst_quality + 1)

+                    break;

+            cpi->active_worst_quality = i;

             // If we have updated the active max Q do not call vp8_update_rate_correction_factors() this loop.

             active_worst_qchanged = TRUE;

@@ -4327,10 +4333,9 @@

     // Update the buffer level variable.

     // Non-viewable frames are a special case and are treated as pure overhead.

-    if ( !cm->show_frame )

-        cpi->bits_off_target -= cpi->projected_frame_size;

-    else

-        cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;

+    if ( cm->show_frame )

+        cpi->bits_off_target += cpi->av_per_frame_bandwidth;

+    cpi->bits_off_target -= cpi->projected_frame_size;

     // Rolling monitors of whether we are over or underspending used to help regulate min and Max Q in two pass.

     cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;

@@ -4344,7 +4349,33 @@

     // Debug stats

     cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);

-    cpi->buffer_level = cpi->bits_off_target;

+    // Update the buffered average bitrate

+    {

+        long long numerator;

+        numerator = cpi->oxcf.maximum_buffer_size

+                    - cpi->buffered_av_per_frame_bandwidth

+                    + cpi->projected_frame_size;

+        numerator *= cpi->buffered_av_per_frame_bandwidth;

+        cpi->buffered_av_per_frame_bandwidth = numerator

+                                               / cpi->oxcf.maximum_buffer_size;

+    }

+    {

+        long long tmp = (long long)cpi->buffered_av_per_frame_bandwidth

+                        * cpi->oxcf.maximum_buffer_size

+                        / cpi->av_per_frame_bandwidth;

+        cpi->buffer_level = cpi->oxcf.maximum_buffer_size

+                            - tmp

+                            + cpi->oxcf.optimal_buffer_level;

+    }

+    // Accumulate overshoot error.

+    cpi->accumulated_overshoot +=

+        (cpi->projected_frame_size > cpi->av_per_frame_bandwidth)

+        ? cpi->projected_frame_size - cpi->av_per_frame_bandwidth

+        : 0;

     // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames

     if (cm->frame_type == KEY_FRAME)

--- a/vp8/encoder/onyx_int.h

+++ b/vp8/encoder/onyx_int.h

@@ -47,8 +47,8 @@

 #define MIN_THRESHMULT  32

 #define MAX_THRESHMULT  512

-#define GF_ZEROMV_ZBIN_BOOST 24

-#define LF_ZEROMV_ZBIN_BOOST 12

+#define GF_ZEROMV_ZBIN_BOOST 12

+#define LF_ZEROMV_ZBIN_BOOST 6

 #define MV_ZBIN_BOOST        4

 #define ZBIN_OQ_MAX 192

@@ -351,6 +351,10 @@

     int per_frame_bandwidth;          // Current section per frame bandwidth target

     int av_per_frame_bandwidth;        // Average frame size target for clip

     int min_frame_bandwidth;          // Minimum allocation that should be used for any frame

+    int buffered_av_per_frame_bandwidth; // Average bitrate over the last buffer

+    int buffered_av_per_frame_bandwidth_rem; // Average bitrate remainder

+    int accumulated_overshoot;           // Accumulated # of bits spent > target

     int inter_frame_target;

     double output_frame_rate;

     long long last_time_stamp_seen;

--- a/vp8/encoder/pickinter.c

+++ b/vp8/encoder/pickinter.c

@@ -43,7 +43,6 @@

 extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];

-extern unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);

 extern unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);

 extern int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *best_ref_mv, int best_rd, int *, int *, int *, int, int *mvcost[2], int, int fullpixel);

 extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);

@@ -98,38 +97,7 @@

-unsigned int vp8_get16x16pred_error_c

-(

-    const unsigned char *src_ptr,

-    int src_stride,

-    const unsigned char *ref_ptr,

-    int ref_stride

-)

-{

-    unsigned pred_error = 0;

-    int i, j;

-    int sum = 0;

-    for (i = 0; i < 16; i++)

-    {

-        int diff;

-        for (j = 0; j < 16; j++)

-        {

-            diff = src_ptr[j] - ref_ptr[j];

-            sum += diff;

-            pred_error += diff * diff;

-        }

-        src_ptr += src_stride;

-        ref_ptr += ref_stride;

-    }

-    pred_error -= sum * sum / 256;

-    return pred_error;

-}

 unsigned int vp8_get4x4sse_cs_c

     const unsigned char *src_ptr,

@@ -172,8 +140,7 @@

     MACROBLOCK *x,

     int ib,

     B_PREDICTION_MODE *best_mode,

-    B_PREDICTION_MODE above,

-    B_PREDICTION_MODE left,

+    unsigned int *mode_costs,

     int *bestrate,

     int *bestdistortion)

@@ -185,17 +152,7 @@

     int best_rd = INT_MAX;       // 1<<30

     int rate;

     int distortion;

-    unsigned int *mode_costs;

-    if (x->e_mbd.frame_type == KEY_FRAME)

-    {

-        mode_costs = x->bmode_costs[above][left];

-    }

-    else

-    {

-        mode_costs = x->inter_bmode_costs;

-    }

     for (mode = B_DC_PRED; mode <= B_HE_PRED /*B_HU_PRED*/; mode++)

         int this_rd;

@@ -221,7 +178,7 @@

-int vp8_pick_intra4x4mby_modes

+static int pick_intra4x4mby_modes

     const VP8_ENCODER_RTCD *rtcd,

     MACROBLOCK *mb,

@@ -234,21 +191,31 @@

     int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];

     int error;

     int distortion = 0;

+    unsigned int *bmode_costs;

     vp8_intra_prediction_down_copy(xd);

+    bmode_costs = mb->inter_bmode_costs;

     for (i = 0; i < 16; i++)

         MODE_INFO *const mic = xd->mode_info_context;

         const int mis = xd->mode_info_stride;

-        const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);

-        const B_PREDICTION_MODE L = left_block_mode(mic, i);

         B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);

         int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(d);

-        pick_intra4x4block(rtcd, mb, i, &best_mode, A, L, &r, &d);

+        if (mb->e_mbd.frame_type == KEY_FRAME)

+        {

+            const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);

+            const B_PREDICTION_MODE L = left_block_mode(mic, i);

+            bmode_costs  = mb->bmode_costs[A][L];

+        }

+        pick_intra4x4block(rtcd, mb, i, &best_mode, bmode_costs, &r, &d);

         cost += r;

         distortion += d;

         mic->bmi[i].as_mode = best_mode;

@@ -275,7 +242,7 @@

     return error;

-void vp8_pick_intra_mbuv_mode(MACROBLOCK *mb)

+static void pick_intra_mbuv_mode(MACROBLOCK *mb)

     MACROBLOCKD *x = &mb->e_mbd;

@@ -443,7 +410,6 @@

     BLOCK *b = &x->block[0];

     BLOCKD *d = &x->e_mbd.block[0];

     MACROBLOCKD *xd = &x->e_mbd;

-    union b_mode_info best_bmodes[16];

     MB_MODE_INFO best_mbmode;

     int_mv best_ref_mv;

@@ -450,12 +416,10 @@

     int_mv mode_mv[MB_MODE_COUNT];

     MB_PREDICTION_MODE this_mode;

     int num00;

-    int i;

     int mdcounts[4];

     int best_rd = INT_MAX; // 1 << 30;

     int best_intra_rd = INT_MAX;

     int mode_index;

-    int ref_frame_cost[MAX_REF_FRAMES];

     int rate;

     int rate2;

     int distortion2;

@@ -462,7 +426,7 @@

     int bestsme;

     //int all_rds[MAX_MODES];         // Experimental debug code.

     int best_mode_index = 0;

-    unsigned int sse = INT_MAX;

+    unsigned int sse = INT_MAX, best_sse = INT_MAX;

     int_mv mvp;

     int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};

@@ -485,7 +449,6 @@

     vpx_memset(nearest_mv, 0, sizeof(nearest_mv));

     vpx_memset(near_mv, 0, sizeof(near_mv));

     vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));

-    vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));

     // set up all the refframe dependent pointers.

@@ -536,32 +499,6 @@

     *returnintra = INT_MAX;

     x->skip = 0;

-    ref_frame_cost[INTRA_FRAME]   = vp8_cost_zero(cpi->prob_intra_coded);

-    // Special case treatment when GF and ARF are not sensible options for reference

-    if (cpi->ref_frame_flags == VP8_LAST_FLAG)

-    {

-        ref_frame_cost[LAST_FRAME]    = vp8_cost_one(cpi->prob_intra_coded)

-                                        + vp8_cost_zero(255);

-        ref_frame_cost[GOLDEN_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)

-                                        + vp8_cost_one(255)

-                                        + vp8_cost_zero(128);

-        ref_frame_cost[ALTREF_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)

-                                        + vp8_cost_one(255)

-                                        + vp8_cost_one(128);

-    }

-    else

-    {

-        ref_frame_cost[LAST_FRAME]    = vp8_cost_one(cpi->prob_intra_coded)

-                                        + vp8_cost_zero(cpi->prob_last_coded);

-        ref_frame_cost[GOLDEN_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)

-                                        + vp8_cost_one(cpi->prob_last_coded)

-                                        + vp8_cost_zero(cpi->prob_gf_coded);

-        ref_frame_cost[ALTREF_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)

-                                        + vp8_cost_one(cpi->prob_last_coded)

-                                        + vp8_cost_one(cpi->prob_gf_coded);

-    }

     x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

     // if we encode a new mv this is important

@@ -613,7 +550,8 @@

         x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;

         // Work out the cost assosciated with selecting the reference frame

-        frame_cost = ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];

+        frame_cost =

+            x->e_mbd.ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];

         rate2 += frame_cost;

         // everything but intra

@@ -659,10 +597,9 @@

         switch (this_mode)

         case B_PRED:

-            // Pass best so far to vp8_pick_intra4x4mby_modes to use as breakout

-            distortion2 = *returndistortion;

-            vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x,

-                                         &rate, &distortion2);

+            // Pass best so far to pick_intra4x4mby_modes to use as breakout

+            distortion2 = best_sse;

+            pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate, &distortion2);

             if (distortion2 == INT_MAX)

@@ -672,9 +609,9 @@

                 rate2 += rate;

                 distortion2 = VARIANCE_INVOKE

-                                (&cpi->rtcd.variance, get16x16prederror)(

+                                (&cpi->rtcd.variance, var16x16)(

                                     x->src.y_buffer, x->src.y_stride,

-                                    x->e_mbd.predictor, 16);

+                                    x->e_mbd.predictor, 16, &sse);

                 this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);

                 if (this_rd < best_intra_rd)

@@ -697,7 +634,9 @@

         case TM_PRED:

             RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)

                 (&x->e_mbd);

-            distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16);

+            distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)

+                                          (x->src.y_buffer, x->src.y_stride,

+                                          x->e_mbd.predictor, 16, &sse);

             rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];

             this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);

@@ -886,15 +825,10 @@

             *returnrate = rate2;

             *returndistortion = distortion2;

+            best_sse = sse;

             best_rd = this_rd;

             vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));

-            if (this_mode == B_PRED)

-                for (i = 0; i < 16; i++)

-                {

-                     best_bmodes[i].as_mode = x->e_mbd.block[i].bmi.as_mode;

-                }

             // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time

             cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;

             cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];

@@ -956,15 +890,52 @@

     if (best_mbmode.mode <= B_PRED)

         /* set mode_info_context->mbmi.uv_mode */

-        vp8_pick_intra_mbuv_mode(x);

+        pick_intra_mbuv_mode(x);

-    if (x->e_mbd.mode_info_context->mbmi.mode == B_PRED)

+    update_mvcount(cpi, &x->e_mbd, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]);

+}

+void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)

+{

+    int error4x4, error16x16 = INT_MAX;

+    int rate, best_rate = 0, distortion, best_sse;

+    MB_PREDICTION_MODE mode, best_mode = DC_PRED;

+    int this_rd;

+    unsigned int sse;

+    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

+    pick_intra_mbuv_mode(x);

+    for (mode = DC_PRED; mode <= TM_PRED; mode ++)

-        for (i = 0; i < 16; i++)

+        x->e_mbd.mode_info_context->mbmi.mode = mode;

+        RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)

+            (&x->e_mbd);

+        distortion = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)

+            (x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, &sse);

+        rate = x->mbmode_cost[x->e_mbd.frame_type][mode];

+        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);

+        if (error16x16 > this_rd)

-            x->e_mbd.block[i].bmi.as_mode = best_bmodes[i].as_mode;

+            error16x16 = this_rd;

+            best_mode = mode;

+            best_sse = sse;

+            best_rate = rate;

-    update_mvcount(cpi, &x->e_mbd, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]);

+    x->e_mbd.mode_info_context->mbmi.mode = best_mode;

+    error4x4 = pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate,

+                                      &best_sse);

+    if (error4x4 < error16x16)

+    {

+        x->e_mbd.mode_info_context->mbmi.mode = B_PRED;

+        best_rate = rate;

+    }

+    *rate_ = best_rate;

--- a/vp8/encoder/pickinter.h

+++ b/vp8/encoder/pickinter.h

@@ -14,7 +14,6 @@

 #include "vpx_ports/config.h"

 #include "vp8/common/onyxc_int.h"

-extern int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *, MACROBLOCK *mb, int *Rate, int *Distortion);

-extern void vp8_pick_intra_mbuv_mode(MACROBLOCK *mb);

 extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);

+extern void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);

 #endif

--- a/vp8/encoder/ppc/csystemdependent.c

+++ b/vp8/encoder/ppc/csystemdependent.c

@@ -48,9 +48,6 @@

 void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);

 void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);

-unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);

-unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);

-unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);

 unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);

 // c imports

@@ -88,9 +85,6 @@

 extern sub_pixel_variance_function sub_pixel_variance16x16_c;

 extern unsigned int vp8_get_mb_ss_c(short *);

-extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);

-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);

-extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);

 extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);

 // ppc

@@ -149,9 +143,6 @@

     vp8_sub_pixel_variance16x16   = vp8_sub_pixel_variance16x16_ppc;

     vp8_get_mb_ss                 = vp8_get_mb_ss_c;

-    vp8_get16x16pred_error       = vp8_get16x16pred_error_c;

-    vp8_get8x8var               = vp8_get8x8var_ppc;

-    vp8_get16x16var             = vp8_get16x16var_ppc;

     vp8_get4x4sse_cs            = vp8_get4x4sse_cs_c;

     vp8_sad16x16                = vp8_sad16x16_ppc;

--- a/vp8/encoder/ratectrl.c

+++ b/vp8/encoder/ratectrl.c

@@ -650,10 +650,10 @@

 static void calc_pframe_target_size(VP8_COMP *cpi)

-    int min_frame_target;

+    int min_frame_target, max_frame_target;

     int Adjustment;

-    min_frame_target = 0;

+    min_frame_target = 1;

     if (cpi->pass == 2)

@@ -661,10 +661,19 @@

         if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))

             min_frame_target = cpi->av_per_frame_bandwidth >> 5;

+        max_frame_target = INT_MAX;

-    else if (min_frame_target < cpi->per_frame_bandwidth / 4)

-        min_frame_target = cpi->per_frame_bandwidth / 4;

+    else

+    {

+        if (min_frame_target < cpi->per_frame_bandwidth / 4)

+            min_frame_target = cpi->per_frame_bandwidth / 4;

+        /* Don't allow the target to completely deplete the buffer. */

+        max_frame_target = cpi->buffer_level + cpi->av_per_frame_bandwidth;

+        if(max_frame_target < min_frame_target)

+            max_frame_target = min_frame_target;

+    }

     // Special alt reference frame case

     if (cpi->common.refresh_alt_ref_frame)

@@ -1157,6 +1166,32 @@

+    if (cpi->pass==0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER){

+        /* determine the accumulated error to apply to this frame. Apply

+         * more of the error when we've been undershooting, less when

+         * we've been overshooting

+         */

+        long long adjust;

+        int bitrate_error;

+        bitrate_error = cpi->av_per_frame_bandwidth

+                        - cpi->buffered_av_per_frame_bandwidth;

+        adjust = cpi->accumulated_overshoot;

+        adjust *= cpi->av_per_frame_bandwidth + bitrate_error;

+        adjust /= cpi->oxcf.maximum_buffer_size;

+        if (adjust > (cpi->this_frame_target - min_frame_target))

+            adjust = (cpi->this_frame_target - min_frame_target);

+        else if (adjust < 0)

+            adjust = 0;

+        cpi->this_frame_target -= adjust;

+        cpi->accumulated_overshoot -= adjust;

+    }

+    if(cpi->this_frame_target > max_frame_target)

+        cpi->this_frame_target = max_frame_target;

--- a/vp8/encoder/rdopt.c

+++ b/vp8/encoder/rdopt.c

@@ -201,47 +201,47 @@

 /* values are now correlated to quantizer */

 static int sad_per_bit16lut[QINDEX_RANGE] =

+    2,  2,  2,  2,  2,  2,  2,  2,

+    2,  2,  2,  2,  2,  2,  2,  2,

+    3,  3,  3,  3,  3,  3,  3,  3,

+    3,  3,  3,  3,  3,  3,  4,  4,

+    4,  4,  4,  4,  4,  4,  4,  4,

+    4,  4,  5,  5,  5,  5,  5,  5,

     5,  5,  5,  5,  5,  5,  6,  6,

-    6,  6,  6,  6,  6,  7,  7,  7,

-    7,  7,  7,  7,  8,  8,  8,  8,

-    8,  8,  8,  8,  8,  8,  9,  9,

-    9,  9,  9,  9, 10, 10, 10, 10,

-    10, 10, 11, 11, 11, 11, 11, 11,

-    12, 12, 12, 12, 12, 12, 12, 13,

-    13, 13, 13, 13, 13, 14, 14, 14,

-    14, 14, 15, 15, 15, 15, 15, 15,

-    16, 16, 16, 16, 16, 16, 17, 17,

-    17, 17, 17, 17, 17, 18, 18, 18,

-    18, 18, 19, 19, 19, 19, 19, 19,

-    20, 20, 20, 21, 21, 21, 21, 22,

-    22, 22, 23, 23, 23, 24, 24, 24,

-    25, 25, 26, 26, 27, 27, 27, 28,

-    28, 28, 29, 29, 30, 30, 31, 31

+    6,  6,  6,  6,  6,  6,  6,  6,

+    6,  6,  7,  7,  7,  7,  7,  7,

+    7,  7,  7,  7,  7,  7,  8,  8,

+    8,  8,  8,  8,  8,  8,  8,  8,

+    8,  8,  9,  9,  9,  9,  9,  9,

+    9,  9,  9,  9,  9,  9,  10, 10,

+    10, 10, 10, 10, 10, 10, 11, 11,

+    11, 11, 11, 11, 12, 12, 12, 12,

+    12, 12, 13, 13, 13, 13, 14, 14

};

 static int sad_per_bit4lut[QINDEX_RANGE] =

-    5,  5,  5,  5,  5,  5,  7,  7,

+    2,  2,  2,  2,  2,  2,  3,  3,

+    3,  3,  3,  3,  3,  3,  3,  3,

+    3,  3,  3,  3,  4,  4,  4,  4,

+    4,  4,  4,  4,  4,  4,  5,  5,

+    5,  5,  5,  5,  6,  6,  6,  6,

+    6,  6,  6,  6,  6,  6,  6,  6,

+    7,  7,  7,  7,  7,  7,  7,  7,

     7,  7,  7,  7,  7,  8,  8,  8,

-    8,  8,  8,  8,  10, 10, 10, 10,

-    10, 10, 10, 10, 10, 10, 11, 11,

-    11, 11, 11, 11, 13, 13, 13, 13,

-    13, 13, 14, 14, 14, 14, 14, 14,

-    16, 16, 16, 16, 16, 16, 16, 17,

-    17, 17, 17, 17, 17, 19, 19, 19,

-    19, 19, 20, 20, 20, 20, 20, 20,

-    22, 22, 22, 22, 22, 22, 23, 23,

-    23, 23, 23, 23, 23, 25, 25, 25,

-    25, 25, 26, 26, 26, 26, 26, 26,

-    28, 28, 28, 29, 29, 29, 29, 31,

-    31, 31, 32, 32, 32, 34, 34, 34,

-    35, 35, 37, 37, 38, 38, 38, 40,

-    40, 40, 41, 41, 43, 43, 44, 44,

+    8,  8,  9,  9,  9,  9,  9,  9,

+    10, 10, 10, 10, 10, 10, 10, 10,

+    11, 11, 11, 11, 11, 11, 11, 11,

+    12, 12, 12, 12, 12, 12, 12, 12,

+    13, 13, 13, 13, 13, 13, 13, 14,

+    14, 14, 14, 14, 15, 15, 15, 15,

+    16, 16, 16, 16, 17, 17, 17, 18,

+    18, 18, 19, 19, 19, 20, 20, 20,

};

 void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex)

-    cpi->mb.sadperbit16 =  sad_per_bit16lut[QIndex]/2;

-    cpi->mb.sadperbit4  =  sad_per_bit4lut[QIndex]/2;

+    cpi->mb.sadperbit16 =  sad_per_bit16lut[QIndex];

+    cpi->mb.sadperbit4  =  sad_per_bit4lut[QIndex];

@@ -719,8 +719,8 @@

     return best_rd;

-int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,

-                                  int *rate_y, int *Distortion, int best_rd)

+static int rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,

+                                     int *rate_y, int *Distortion, int best_rd)

     MACROBLOCKD *const xd = &mb->e_mbd;

     int i;

@@ -782,11 +782,13 @@

     return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);

-int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi,

-                                   MACROBLOCK *x,

-                                   int *Rate,

-                                   int *rate_y,

-                                   int *Distortion)

+static int rd_pick_intra16x16mby_mode(VP8_COMP *cpi,

+                                      MACROBLOCK *x,

+                                      int *Rate,

+                                      int *rate_y,

+                                      int *Distortion)

     MB_PREDICTION_MODE mode;

     MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);

@@ -858,7 +860,7 @@

     return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);

-void vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int *distortion)

+static void rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int *distortion)

     MB_PREDICTION_MODE mode;

     MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);

@@ -1795,7 +1797,6 @@

     int distortion;

     int best_rd = INT_MAX;

     int best_intra_rd = INT_MAX;

-    int ref_frame_cost[MAX_REF_FRAMES];

     int rate2, distortion2;

     int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;

     int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);

@@ -1872,36 +1873,10 @@

     x->skip = 0;

-    ref_frame_cost[INTRA_FRAME]   = vp8_cost_zero(cpi->prob_intra_coded);

-    // Special case treatment when GF and ARF are not sensible options for reference

-    if (cpi->ref_frame_flags == VP8_LAST_FLAG)

-    {

-        ref_frame_cost[LAST_FRAME]    = vp8_cost_one(cpi->prob_intra_coded)

-                                        + vp8_cost_zero(255);

-        ref_frame_cost[GOLDEN_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)

-                                        + vp8_cost_one(255)

-                                        + vp8_cost_zero(128);

-        ref_frame_cost[ALTREF_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)

-                                        + vp8_cost_one(255)

-                                        + vp8_cost_one(128);

-    }

-    else

-    {

-        ref_frame_cost[LAST_FRAME]    = vp8_cost_one(cpi->prob_intra_coded)

-                                        + vp8_cost_zero(cpi->prob_last_coded);

-        ref_frame_cost[GOLDEN_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)

-                                        + vp8_cost_one(cpi->prob_last_coded)

-                                        + vp8_cost_zero(cpi->prob_gf_coded);

-        ref_frame_cost[ALTREF_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)

-                                        + vp8_cost_one(cpi->prob_last_coded)

-                                        + vp8_cost_one(cpi->prob_gf_coded);

-    }

     vpx_memset(mode_mv, 0, sizeof(mode_mv));

     x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

-    vp8_rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate, &uv_intra_rate_tokenonly, &uv_intra_distortion);

+    rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate, &uv_intra_rate_tokenonly, &uv_intra_distortion);

     uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode;

     for (mode_index = 0; mode_index < MAX_MODES; mode_index++)

@@ -2024,7 +1999,7 @@

             int tmp_rd;

             // Note the rate value returned here includes the cost of coding the BPRED mode : x->mbmode_cost[x->e_mbd.frame_type][BPRED];

-            tmp_rd = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion, best_yrd);

+            tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion, best_yrd);

             rate2 += rate;

             distortion2 += distortion;

@@ -2247,8 +2222,8 @@

             else if (x->encode_breakout)

-                int sum;

                 unsigned int sse;

+                unsigned int var;

                 int threshold = (xd->block[0].dequant[1]

                             * xd->block[0].dequant[1] >>4);

@@ -2255,21 +2230,20 @@

                 if(threshold < x->encode_breakout)

                     threshold = x->encode_breakout;

-                VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)

-                    (x->src.y_buffer, x->src.y_stride,

-                     x->e_mbd.predictor, 16, &sse, &sum);

+                var = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)

+                        (x->src.y_buffer, x->src.y_stride,

+                        x->e_mbd.predictor, 16, &sse);

                 if (sse < threshold)

-                    // Check u and v to make sure skip is ok

-                    int sse2 = 0;

+                     unsigned int q2dc = xd->block[24].dequant[0];

                     /* If theres is no codeable 2nd order dc

                        or a very small uniform pixel change change */

-                    if (abs(sum) < (xd->block[24].dequant[0]<<2)||

-                        ((sum * sum>>8) > sse && abs(sum) <128))

+                    if ((sse - var < q2dc * q2dc >>4) ||

+                        (sse /2 > var && sse-var < 64))

-                        sse2 = VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance));

+                        // Check u and v to make sure skip is ok

+                        int sse2=  VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance));

                         if (sse2 * 2 < threshold)

                             x->skip = 1;

@@ -2319,8 +2293,11 @@

             rate2 += other_cost;

-        // Estimate the reference frame signaling cost and add it to the rolling cost variable.

-        rate2 += ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];

+        /* Estimate the reference frame signaling cost and add it

+         * to the rolling cost variable.

+         */

+        rate2 +=

+            x->e_mbd.ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];

         if (!disable_skip)

@@ -2384,7 +2361,8 @@

                 x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;

-            other_cost += ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];

+            other_cost +=

+            x->e_mbd.ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];

             /* Calculate the final y RD estimate for this mode */

             best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2-rate_uv-other_cost),

@@ -2491,4 +2469,40 @@

     rd_update_mvcount(cpi, x, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]);

+}

+void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)

+{

+    int error4x4, error16x16;

+    int rate4x4, rate16x16 = 0, rateuv;

+    int dist4x4, dist16x16, distuv;

+    int rate;

+    int rate4x4_tokenonly = 0;

+    int rate16x16_tokenonly = 0;

+    int rateuv_tokenonly = 0;

+    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

+    rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);

+    rate = rateuv;

+    error16x16 = rd_pick_intra16x16mby_mode(cpi, x,

+                                            &rate16x16, &rate16x16_tokenonly,

+                                            &dist16x16);

+    error4x4 = rd_pick_intra4x4mby_modes(cpi, x,

+                                         &rate4x4, &rate4x4_tokenonly,

+                                         &dist4x4, error16x16);

+    if (error4x4 < error16x16)

+    {

+        x->e_mbd.mode_info_context->mbmi.mode = B_PRED;

+        rate += rate4x4;

+    }

+    else

+    {

+        rate += rate16x16;

+    }

+    *rate_ = rate;

--- a/vp8/encoder/rdopt.h

+++ b/vp8/encoder/rdopt.h

@@ -15,10 +15,8 @@

 #define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )

 extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);

-extern int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion, int best_rd);

-extern int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *returnrate, int *rate_to, int *returndistortion);

-extern void vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_to, int *distortion);

 extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);

+extern void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);

 extern void vp8_mv_pred

--- a/vp8/encoder/tokenize.c

+++ b/vp8/encoder/tokenize.c

@@ -98,7 +98,6 @@

     const BLOCKD *const b,

     TOKENEXTRA **tp,

     const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */

-    const FRAME_TYPE frametype,

     ENTROPY_CONTEXT *a,

     ENTROPY_CONTEXT *l,

     VP8_COMP *cpi

@@ -120,9 +119,9 @@

             int rc = vp8_default_zig_zag1d[c];

             const int v = qcoeff_ptr[rc];

+#if CONFIG_DEBUG

             assert(-DCT_MAX_VALUE <= v  &&  v < (DCT_MAX_VALUE));

+#endif

             t->Extra = vp8_dct_value_tokens_ptr[v].Extra;

             x        = vp8_dct_value_tokens_ptr[v].Token;

@@ -149,7 +148,6 @@

     const BLOCKD *const b,

     TOKENEXTRA **tp,

     const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */

-    const FRAME_TYPE frametype,

     ENTROPY_CONTEXT *a,

     ENTROPY_CONTEXT *l,

     VP8_COMP *cpi

@@ -173,9 +171,9 @@

             int rc = vp8_default_zig_zag1d[c];

             const int v = qcoeff_ptr[rc];

+#if CONFIG_DEBUG

             assert(-DCT_MAX_VALUE <= v  &&  v < (DCT_MAX_VALUE));

+#endif

             t->Extra = vp8_dct_value_tokens_ptr[v].Extra;

             x        = vp8_dct_value_tokens_ptr[v].Token;

@@ -196,14 +194,11 @@

-static int mb_is_skippable(MACROBLOCKD *x)

+static int mb_is_skippable(MACROBLOCKD *x, int has_y2_block)

-    int has_y2_block;

     int skip = 1;

     int i = 0;

-    has_y2_block = (x->mode_info_context->mbmi.mode != B_PRED

-                    && x->mode_info_context->mbmi.mode != SPLITMV);

     if (has_y2_block)

         for (i = 0; i < 16; i++)

@@ -223,8 +218,12 @@

     ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;

     int plane_type;

     int b;

+    int has_y2_block;

-    x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable(x);

+    has_y2_block = (x->mode_info_context->mbmi.mode != B_PRED

+                    && x->mode_info_context->mbmi.mode != SPLITMV);

+    x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable(x, has_y2_block);

     if (x->mode_info_context->mbmi.mb_skip_coeff)

         cpi->skip_true_count++;

@@ -241,17 +240,12 @@

     cpi->skip_false_count++;

-#if 0

-    vpx_memcpy(cpi->coef_counts_backup, cpi->coef_counts, sizeof(cpi->coef_counts));

-#endif

-    if (x->mode_info_context->mbmi.mode == B_PRED || x->mode_info_context->mbmi.mode == SPLITMV)

+    plane_type = 3;

+    if(has_y2_block)

-        plane_type = 3;

-    }

-    else

-    {

-        tokenize2nd_order_b(x->block + 24, t, 1, x->frame_type,

+        tokenize2nd_order_b(x->block + 24, t, 1,

                    A + vp8_block2above[24], L + vp8_block2left[24], cpi);

         plane_type = 0;

@@ -258,12 +252,12 @@

     for (b = 0; b < 16; b++)

-        tokenize1st_order_b(x->block + b, t, plane_type, x->frame_type,

+        tokenize1st_order_b(x->block + b, t, plane_type,

                             A + vp8_block2above[b],

                             L + vp8_block2left[b], cpi);

     for (b = 16; b < 24; b++)

-        tokenize1st_order_b(x->block + b, t, 2, x->frame_type,

+        tokenize1st_order_b(x->block + b, t, 2,

                             A + vp8_block2above[b],

                             L + vp8_block2left[b], cpi);

@@ -352,10 +346,7 @@

 static __inline void stuff2nd_order_b

-    const BLOCKD *const b,

     TOKENEXTRA **tp,

-    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */

-    const FRAME_TYPE frametype,

     ENTROPY_CONTEXT *a,

     ENTROPY_CONTEXT *l,

     VP8_COMP *cpi

@@ -364,9 +355,6 @@

     int pt; /* near block/prev token context index */

     TOKENEXTRA *t = *tp;        /* store tokens starting here */

     VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);

-    (void) frametype;

-    (void) type;

-    (void) b;

     t->Token = DCT_EOB_TOKEN;

     t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];

@@ -382,10 +370,7 @@

 static __inline void stuff1st_order_b

-    const BLOCKD *const b,

     TOKENEXTRA **tp,

-    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */

-    const FRAME_TYPE frametype,

     ENTROPY_CONTEXT *a,

     ENTROPY_CONTEXT *l,

     VP8_COMP *cpi

@@ -394,9 +379,6 @@

     int pt; /* near block/prev token context index */

     TOKENEXTRA *t = *tp;        /* store tokens starting here */

     VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);

-    (void) frametype;

-    (void) type;

-    (void) b;

     t->Token = DCT_EOB_TOKEN;

     t->context_tree = cpi->common.fc.coef_probs [0] [1] [pt];

@@ -411,10 +393,7 @@

 static __inline

 void stuff1st_order_buv

-    const BLOCKD *const b,

     TOKENEXTRA **tp,

-    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */

-    const FRAME_TYPE frametype,

     ENTROPY_CONTEXT *a,

     ENTROPY_CONTEXT *l,

     VP8_COMP *cpi

@@ -423,9 +402,6 @@

     int pt; /* near block/prev token context index */

     TOKENEXTRA *t = *tp;        /* store tokens starting here */

     VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);

-    (void) frametype;

-    (void) type;

-    (void) b;

     t->Token = DCT_EOB_TOKEN;

     t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];

@@ -445,17 +421,17 @@

     int plane_type;

     int b;

-    stuff2nd_order_b(x->block + 24, t, 1, x->frame_type,

+    stuff2nd_order_b(t,

                      A + vp8_block2above[24], L + vp8_block2left[24], cpi);

     plane_type = 0;

     for (b = 0; b < 16; b++)

-        stuff1st_order_b(x->block + b, t, plane_type, x->frame_type,

+        stuff1st_order_b(t,

                          A + vp8_block2above[b],

                          L + vp8_block2left[b], cpi);

     for (b = 16; b < 24; b++)

-        stuff1st_order_buv(x->block + b, t, 2, x->frame_type,

+        stuff1st_order_buv(t,

                            A + vp8_block2above[b],

                            L + vp8_block2left[b], cpi);

--- a/vp8/encoder/variance.h

+++ b/vp8/encoder/variance.h

@@ -308,21 +308,6 @@

 #endif

 extern prototype_variance(vp8_variance_mse16x16);

-#ifndef vp8_variance_get16x16prederror

-#define vp8_variance_get16x16prederror vp8_get16x16pred_error_c

-#endif

-extern prototype_get16x16prederror(vp8_variance_get16x16prederror);

-#ifndef vp8_variance_get8x8var

-#define vp8_variance_get8x8var vp8_get8x8var_c

-#endif

-extern prototype_variance2(vp8_variance_get8x8var);

-#ifndef vp8_variance_get16x16var

-#define vp8_variance_get16x16var vp8_get16x16var_c

-#endif

-extern prototype_variance2(vp8_variance_get16x16var);

 #ifndef vp8_variance_get4x4sse_cs

 #define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_c

 #endif

@@ -376,9 +361,6 @@

     vp8_getmbss_fn_t         getmbss;

     vp8_variance_fn_t        mse16x16;

-    vp8_get16x16prederror_fn_t get16x16prederror;

-    vp8_variance2_fn_t       get8x8var;

-    vp8_variance2_fn_t       get16x16var;

     vp8_get16x16prederror_fn_t get4x4sse_cs;

     vp8_sad_multi_fn_t       sad16x16x3;

--- a/vp8/encoder/variance_c.c

+++ b/vp8/encoder/variance_c.c

@@ -61,40 +61,6 @@

-unsigned int

-vp8_get8x8var_c

-(

-    const unsigned char *src_ptr,

-    int  source_stride,

-    const unsigned char *ref_ptr,

-    int  recon_stride,

-    unsigned int *SSE,

-    int *Sum

-)

-{

-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, SSE, Sum);

-    return (*SSE - (((*Sum) * (*Sum)) >> 6));

-}

-unsigned int

-vp8_get16x16var_c

-(

-    const unsigned char *src_ptr,

-    int  source_stride,

-    const unsigned char *ref_ptr,

-    int  recon_stride,

-    unsigned int *SSE,

-    int *Sum

-)

-{

-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, SSE, Sum);

-    return (*SSE - (((*Sum) * (*Sum)) >> 8));

-}

 unsigned int vp8_variance16x16_c(

     const unsigned char *src_ptr,

--- a/vp8/encoder/x86/dct_x86.h

+++ b/vp8/encoder/x86/dct_x86.h

@@ -31,6 +31,12 @@

 #undef  vp8_fdct_short8x4

 #define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx

+#undef  vp8_fdct_fast4x4

+#define vp8_fdct_fast4x4 vp8_short_fdct4x4_mmx

+#undef  vp8_fdct_fast8x4

+#define vp8_fdct_fast8x4 vp8_short_fdct8x4_mmx

 #endif

 #endif

--- a/vp8/encoder/x86/variance_impl_mmx.asm

+++ b/vp8/encoder/x86/variance_impl_mmx.asm

@@ -843,136 +843,6 @@

     pop         rbp

ret

-;unsigned int vp8_get16x16pred_error_mmx

-;(

-;    unsigned char *src_ptr,

-;    int src_stride,

-;    unsigned char *ref_ptr,

-;    int ref_stride

-;)

-global sym(vp8_get16x16pred_error_mmx)

-sym(vp8_get16x16pred_error_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 4

-    GET_GOT     rbx

-    push rsi

-    push rdi

-    sub         rsp, 16

-    ; end prolog

-        mov         rsi,            arg(0) ;DWORD PTR [src_ptr]

-        mov         rdi,            arg(2) ;DWORD PTR [ref_ptr]

-        movsxd      rax,            DWORD PTR arg(1) ;[src_stride]

-        movsxd      rdx,            DWORD PTR arg(3) ;[ref_stride]

-        pxor        mm0,            mm0                     ; clear xmm0 for unpack

-        pxor        mm7,            mm7                     ; clear xmm7 for accumulating diffs

-        pxor        mm6,            mm6                     ; clear xmm6 for accumulating sse

-        mov         rcx,            16

-var16loop:

-        movq        mm1,            [rsi]

-        movq        mm2,            [rdi]

-        movq        mm3,            mm1

-        movq        mm4,            mm2

-        punpcklbw   mm1,            mm0

-        punpckhbw   mm3,            mm0

-        punpcklbw   mm2,            mm0

-        punpckhbw   mm4,            mm0

-        psubw       mm1,            mm2

-        psubw       mm3,            mm4

-        paddw       mm7,            mm1

-        pmaddwd     mm1,            mm1

-        paddw       mm7,            mm3

-        pmaddwd     mm3,            mm3

-        paddd       mm6,            mm1

-        paddd       mm6,            mm3

-        movq        mm1,            [rsi+8]

-        movq        mm2,            [rdi+8]

-        movq        mm3,            mm1

-        movq        mm4,            mm2

-        punpcklbw   mm1,            mm0

-        punpckhbw   mm3,            mm0

-        punpcklbw   mm2,            mm0

-        punpckhbw   mm4,            mm0

-        psubw       mm1,            mm2

-        psubw       mm3,            mm4

-        paddw       mm7,            mm1

-        pmaddwd     mm1,            mm1

-        paddw       mm7,            mm3

-        pmaddwd     mm3,            mm3

-        paddd       mm6,            mm1

-        paddd       mm6,            mm3

-        add         rsi,            rax

-        add         rdi,            rdx

-        sub         rcx,            1

-        jnz         var16loop

-        movq        mm1,            mm6

-        pxor        mm6,            mm6

-        pxor        mm5,            mm5

-        punpcklwd   mm6,            mm7

-        punpckhwd   mm5,            mm7

-        psrad       mm5,            16

-        psrad       mm6,            16

-        paddd       mm6,            mm5

-        movq        mm2,            mm1

-        psrlq       mm1,            32

-        paddd       mm2,            mm1

-        movq        mm7,            mm6

-        psrlq       mm6,            32

-        paddd       mm6,            mm7

-        movd DWORD PTR [rsp],       mm6  ;Sum

-        movd DWORD PTR [rsp+4],     mm2  ;SSE

-        ; return (SSE-((Sum*Sum)>>8));

-        movsxd      rdx, dword ptr [rsp]

-        imul        rdx, rdx

-        sar         rdx, 8

-        movsxd      rax, dword ptr [rsp + 4]

-        sub         rax, rdx

-    ; begin epilog

-    add rsp, 16

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

 SECTION_RODATA

 ;short mmx_bi_rd[4] = { 64, 64, 64, 64};

--- a/vp8/encoder/x86/variance_impl_sse2.asm

+++ b/vp8/encoder/x86/variance_impl_sse2.asm

@@ -213,122 +213,6 @@

ret

-;unsigned int vp8_get16x16pred_error_sse2

-;(

-;   unsigned char *src_ptr,

-;    int src_stride,

-;    unsigned char *ref_ptr,

-;    int ref_stride

-;)

-global sym(vp8_get16x16pred_error_sse2)

-sym(vp8_get16x16pred_error_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 4

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push rsi

-    push rdi

-    sub         rsp, 16

-    ; end prolog

-        mov         rsi,            arg(0) ;[src_ptr]

-        mov         rdi,            arg(2) ;[ref_ptr]

-        movsxd      rax,            DWORD PTR arg(1) ;[src_stride]

-        movsxd      rdx,            DWORD PTR arg(3) ;[ref_stride]

-        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack

-        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs

-        pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse

-        mov         rcx,            16

-var16peloop:

-        movdqu      xmm1,           XMMWORD PTR [rsi]

-        movdqu      xmm2,           XMMWORD PTR [rdi]

-        movdqa      xmm3,           xmm1

-        movdqa      xmm4,           xmm2

-        punpcklbw   xmm1,           xmm0

-        punpckhbw   xmm3,           xmm0

-        punpcklbw   xmm2,           xmm0

-        punpckhbw   xmm4,           xmm0

-        psubw       xmm1,           xmm2

-        psubw       xmm3,           xmm4

-        paddw       xmm7,           xmm1

-        pmaddwd     xmm1,           xmm1

-        paddw       xmm7,           xmm3

-        pmaddwd     xmm3,           xmm3

-        paddd       xmm6,           xmm1

-        paddd       xmm6,           xmm3

-        add         rsi,            rax

-        add         rdi,            rdx

-        sub         rcx,            1

-        jnz         var16peloop

-        movdqa      xmm1,           xmm6

-        pxor        xmm6,           xmm6

-        pxor        xmm5,           xmm5

-        punpcklwd   xmm6,           xmm7

-        punpckhwd   xmm5,           xmm7

-        psrad       xmm5,           16

-        psrad       xmm6,           16

-        paddd       xmm6,           xmm5

-        movdqa      xmm2,           xmm1

-        punpckldq   xmm1,           xmm0

-        punpckhdq   xmm2,           xmm0

-        movdqa      xmm7,           xmm6

-        paddd       xmm1,           xmm2

-        punpckldq   xmm6,           xmm0

-        punpckhdq   xmm7,           xmm0

-        paddd       xmm6,           xmm7

-        movdqa      xmm2,           xmm1

-        movdqa      xmm7,           xmm6

-        psrldq      xmm1,           8

-        psrldq      xmm6,           8

-        paddd       xmm7,           xmm6

-        paddd       xmm1,           xmm2

-        movd DWORD PTR [rsp],       xmm7  ;Sum

-        movd DWORD PTR [rsp+4],     xmm1  ;SSE

-        ; return (SSE-((Sum*Sum)>>8));

-        movsxd      rdx, dword ptr [rsp]

-        imul        rdx, rdx

-        sar         rdx, 8

-        movsxd      rax, dword ptr [rsp + 4]

-        sub         rax, rdx

-    ; begin epilog

-    add rsp, 16

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

 ;unsigned int vp8_get8x8var_sse2

--- a/vp8/encoder/x86/variance_mmx.c

+++ b/vp8/encoder/x86/variance_mmx.c

@@ -76,43 +76,6 @@

     int *sum,

     unsigned int *sumsquared

);

-extern unsigned int vp8_get16x16pred_error_mmx

-(

-    const unsigned char *src_ptr,

-    int src_stride,

-    const unsigned char *ref_ptr,

-    int ref_stride

-);

-unsigned int vp8_get16x16var_mmx(

-    const unsigned char *src_ptr,

-    int  source_stride,

-    const unsigned char *ref_ptr,

-    int  recon_stride,

-    unsigned int *SSE,

-    int *SUM

-)

-{

-    unsigned int sse0, sse1, sse2, sse3, var;

-    int sum0, sum1, sum2, sum3, avg;

-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;

-    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);

-    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;

-    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

-    var = sse0 + sse1 + sse2 + sse3;

-    avg = sum0 + sum1 + sum2 + sum3;

-    *SSE = var;

-    *SUM = avg;

-    return (var - ((avg * avg) >> 8));

-}

 unsigned int vp8_variance4x4_mmx(

--- a/vp8/encoder/x86/variance_sse2.c

+++ b/vp8/encoder/x86/variance_sse2.c

@@ -53,13 +53,6 @@

     unsigned int *SSE,

     int *Sum

);

-unsigned int vp8_get16x16pred_error_sse2

-(

-    const unsigned char *src_ptr,

-    int src_stride,

-    const unsigned char *ref_ptr,

-    int ref_stride

-);

 unsigned int vp8_get8x8var_sse2

     const unsigned char *src_ptr,

--- a/vp8/encoder/x86/variance_x86.h

+++ b/vp8/encoder/x86/variance_x86.h

@@ -41,9 +41,7 @@

 extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx);

 extern prototype_getmbss(vp8_get_mb_ss_mmx);

 extern prototype_variance(vp8_mse16x16_mmx);

-extern prototype_get16x16prederror(vp8_get16x16pred_error_mmx);

 extern prototype_variance2(vp8_get8x8var_mmx);

-extern prototype_variance2(vp8_get16x16var_mmx);

 extern prototype_get16x16prederror(vp8_get4x4sse_cs_mmx);

 #if !CONFIG_RUNTIME_CPU_DETECT

@@ -110,15 +108,6 @@

 #undef  vp8_variance_mse16x16

 #define vp8_variance_mse16x16 vp8_mse16x16_mmx

-#undef  vp8_variance_get16x16prederror

-#define vp8_variance_get16x16prederror vp8_get16x16pred_error_mmx

-#undef  vp8_variance_get8x8var

-#define vp8_variance_get8x8var vp8_get8x8var_mmx

-#undef  vp8_variance_get16x16var

-#define vp8_variance_get16x16var vp8_get16x16var_mmx

 #undef  vp8_variance_get4x4sse_cs

 #define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_mmx

@@ -148,7 +137,6 @@

 extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt);

 extern prototype_getmbss(vp8_get_mb_ss_sse2);

 extern prototype_variance(vp8_mse16x16_wmt);

-extern prototype_get16x16prederror(vp8_get16x16pred_error_sse2);

 extern prototype_variance2(vp8_get8x8var_sse2);

 extern prototype_variance2(vp8_get16x16var_sse2);

@@ -215,15 +203,6 @@

 #undef  vp8_variance_mse16x16

 #define vp8_variance_mse16x16 vp8_mse16x16_wmt

-#undef  vp8_variance_get16x16prederror

-#define vp8_variance_get16x16prederror vp8_get16x16pred_error_sse2

-#undef  vp8_variance_get8x8var

-#define vp8_variance_get8x8var vp8_get8x8var_sse2

-#undef  vp8_variance_get16x16var

-#define vp8_variance_get16x16var vp8_get16x16var_sse2

 #endif

 #endif

--- a/vp8/encoder/x86/x86_csystemdependent.c

+++ b/vp8/encoder/x86/x86_csystemdependent.c

@@ -16,7 +16,7 @@

 #if HAVE_MMX

-static void short_fdct8x4_mmx(short *input, short *output, int pitch)

+void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)

     vp8_short_fdct4x4_mmx(input,   output,    pitch);

     vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);

@@ -26,7 +26,7 @@

                                  short *qcoeff_ptr, short *dequant_ptr,

                                  short *scan_mask, short *round_ptr,

                                  short *quant_ptr, short *dqcoeff_ptr);

-static void fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)

+void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)

     short *scan_mask   = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;

     short *coeff_ptr   = b->coeff;

@@ -51,7 +51,7 @@

 int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);

-static int mbblock_error_mmx(MACROBLOCK *mb, int dc)

+int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)

     short *coeff_ptr =  mb->block[0].coeff;

     short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;

@@ -59,7 +59,7 @@

 int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);

-static int mbuverror_mmx(MACROBLOCK *mb)

+int vp8_mbuverror_mmx(MACROBLOCK *mb)

     short *s_ptr = &mb->coeff[256];

     short *d_ptr = &mb->e_mbd.dqcoeff[256];

@@ -69,7 +69,7 @@

 void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,

                              short *diff, unsigned char *predictor,

                              int pitch);

-static void subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)

+void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)

     unsigned char *z = *(be->base_src) + be->src;

     unsigned int  src_stride = be->src_stride;

@@ -82,7 +82,7 @@

 #if HAVE_SSE2

 int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);

-static int mbblock_error_xmm(MACROBLOCK *mb, int dc)

+int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)

     short *coeff_ptr =  mb->block[0].coeff;

     short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;

@@ -90,7 +90,7 @@

 int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);

-static int mbuverror_xmm(MACROBLOCK *mb)

+int vp8_mbuverror_xmm(MACROBLOCK *mb)

     short *s_ptr = &mb->coeff[256];

     short *d_ptr = &mb->e_mbd.dqcoeff[256];

@@ -100,7 +100,7 @@

 void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,

                              short *diff, unsigned char *predictor,

                              int pitch);

-static void subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)

+void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)

     unsigned char *z = *(be->base_src) + be->src;

     unsigned int  src_stride = be->src_stride;

@@ -175,26 +175,23 @@

         cpi->rtcd.variance.mse16x16              = vp8_mse16x16_mmx;

         cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_mmx;

-        cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_mmx;

-        cpi->rtcd.variance.get8x8var             = vp8_get8x8var_mmx;

-        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_mmx;

         cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_mmx;

         cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_mmx;

-        cpi->rtcd.fdct.short8x4                  = short_fdct8x4_mmx;

+        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_mmx;

         cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_mmx;

-        cpi->rtcd.fdct.fast8x4                   = short_fdct8x4_mmx;

+        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_mmx;

         cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_c;

         cpi->rtcd.encodemb.berr                  = vp8_block_error_mmx;

-        cpi->rtcd.encodemb.mberr                 = mbblock_error_mmx;

-        cpi->rtcd.encodemb.mbuverr               = mbuverror_mmx;

-        cpi->rtcd.encodemb.subb                  = subtract_b_mmx;

+        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_mmx;

+        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_mmx;

+        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_mmx;

         cpi->rtcd.encodemb.submby                = vp8_subtract_mby_mmx;

         cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_mmx;

-        /*cpi->rtcd.quantize.fastquantb            = fast_quantize_b_mmx;*/

+        /*cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_mmx;*/

 #endif

@@ -226,11 +223,6 @@

         cpi->rtcd.variance.mse16x16              = vp8_mse16x16_wmt;

         cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_sse2;

-        cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_sse2;

-        cpi->rtcd.variance.get8x8var             = vp8_get8x8var_sse2;

-        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_sse2;

         /* cpi->rtcd.variance.get4x4sse_cs  not implemented for wmt */;

         cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_sse2;

@@ -241,9 +233,9 @@

         cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_sse2 ;

         cpi->rtcd.encodemb.berr                  = vp8_block_error_xmm;

-        cpi->rtcd.encodemb.mberr                 = mbblock_error_xmm;

-        cpi->rtcd.encodemb.mbuverr               = mbuverror_xmm;

-        cpi->rtcd.encodemb.subb                  = subtract_b_sse2;

+        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_xmm;

+        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_xmm;

+        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_sse2;

         cpi->rtcd.encodemb.submby                = vp8_subtract_mby_sse2;

         cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_sse2;

--- /dev/null

+++ b/vpx_ports/asm_offsets.h

@@ -1,0 +1,31 @@

+/*

+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VPX_PORTS_ASM_OFFSETS_H

+#define VPX_PORTS_ASM_OFFSETS_H

+#include <stddef.h>

+#define ct_assert(name,cond) \

+    static void assert_##name(void) UNUSED;\

+    static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}

+#if INLINE_ASM

+#define DEFINE(sym, val) asm("\n" #sym " EQU %0" : : "i" (val));

+#define BEGIN int main(void) {

+#define END return 0; }

+#else

+#define DEFINE(sym, val) int sym = val;

+#define BEGIN

+#define END

+#endif

+#endif /* VPX_PORTS_ASM_OFFSETS_H */

--

⑨